Repository: deeppavlov/DeepPavlov Branch: master Commit: 5f9fbed0c719 Files: 411 Total size: 1.8 MB Directory structure: gitextract__x5jpadh/ ├── .github/ │ └── ISSUE_TEMPLATE/ │ ├── bug_report.md │ ├── config.yml │ └── feature-request.md ├── .gitignore ├── .readthedocs.yml ├── CNAME ├── Jenkinsfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── _config.yml ├── _layouts/ │ └── default.html ├── deeppavlov/ │ ├── __init__.py │ ├── __main__.py │ ├── _meta.py │ ├── configs/ │ │ ├── __init__.py │ │ ├── classifiers/ │ │ │ ├── boolqa_rubert.json │ │ │ ├── few_shot_roberta.json │ │ │ ├── glue/ │ │ │ │ ├── glue_cola_roberta.json │ │ │ │ ├── glue_mnli_cased_bert_torch.json │ │ │ │ ├── glue_mnli_mm_cased_bert_torch.json │ │ │ │ ├── glue_mnli_roberta.json │ │ │ │ ├── glue_mrpc_roberta.json │ │ │ │ ├── glue_qnli_roberta.json │ │ │ │ ├── glue_qqp_roberta.json │ │ │ │ ├── glue_rte_cased_bert_torch.json │ │ │ │ ├── glue_rte_roberta_mnli.json │ │ │ │ ├── glue_sst2_roberta.json │ │ │ │ ├── glue_stsb_roberta.json │ │ │ │ └── glue_wnli_roberta.json │ │ │ ├── insults_kaggle_bert.json │ │ │ ├── paraphraser_convers_distilrubert_2L.json │ │ │ ├── paraphraser_convers_distilrubert_6L.json │ │ │ ├── paraphraser_rubert.json │ │ │ ├── query_pr.json │ │ │ ├── rusentiment_bert.json │ │ │ ├── rusentiment_convers_bert.json │ │ │ ├── rusentiment_convers_distilrubert_2L.json │ │ │ ├── rusentiment_convers_distilrubert_6L.json │ │ │ ├── sentiment_sst_conv_bert.json │ │ │ ├── sentiment_twitter.json │ │ │ ├── superglue/ │ │ │ │ ├── superglue_boolq_roberta_mnli.json │ │ │ │ ├── superglue_copa_roberta.json │ │ │ │ ├── superglue_record_roberta.json │ │ │ │ └── superglue_wic_bert.json │ │ │ └── topics_distilbert_base_uncased.json │ │ ├── doc_retrieval/ │ │ │ ├── en_ranker_pop_wiki.json │ │ │ ├── en_ranker_tfidf_wiki.json │ │ │ └── ru_ranker_tfidf_wiki.json │ │ ├── embedder/ │ │ │ ├── bert_embedder.json │ │ │ └── bert_sentence_embedder.json │ │ ├── entity_extraction/ │ │ │ ├── entity_detection_en.json │ │ │ 
├── entity_detection_ru.json │ │ │ ├── entity_extraction_en.json │ │ │ ├── entity_extraction_ru.json │ │ │ ├── entity_linking_en.json │ │ │ └── entity_linking_ru.json │ │ ├── faq/ │ │ │ └── fasttext_logreg.json │ │ ├── kbqa/ │ │ │ ├── kbqa_cq_en.json │ │ │ ├── kbqa_cq_ru.json │ │ │ └── wiki_parser.json │ │ ├── morpho_syntax_parser/ │ │ │ ├── morpho_ru_syntagrus_bert.json │ │ │ ├── ru_syntagrus_joint_parsing.json │ │ │ └── syntax_ru_syntagrus_bert.json │ │ ├── multitask/ │ │ │ ├── mt_glue.json │ │ │ └── multitask_example.json │ │ ├── ner/ │ │ │ ├── ner_bert_base.json │ │ │ ├── ner_case_agnostic_mdistilbert.json │ │ │ ├── ner_collection3_bert.json │ │ │ ├── ner_conll2003_bert.json │ │ │ ├── ner_conll2003_deberta_crf.json │ │ │ ├── ner_ontonotes_bert.json │ │ │ ├── ner_ontonotes_bert_mult.json │ │ │ ├── ner_ontonotes_deberta_crf.json │ │ │ ├── ner_rus_bert.json │ │ │ ├── ner_rus_bert_probas.json │ │ │ ├── ner_rus_convers_distilrubert_2L.json │ │ │ └── ner_rus_convers_distilrubert_6L.json │ │ ├── odqa/ │ │ │ ├── en_odqa_infer_wiki.json │ │ │ ├── en_odqa_pop_infer_wiki.json │ │ │ └── ru_odqa_infer_wiki.json │ │ ├── ranking/ │ │ │ ├── path_ranking_nll_roberta_en.json │ │ │ ├── ranking_ubuntu_v2_torch_bert_uncased.json │ │ │ ├── rel_ranking_nll_bert_ru.json │ │ │ └── rel_ranking_roberta_en.json │ │ ├── regressors/ │ │ │ └── translation_ranker.json │ │ ├── relation_extraction/ │ │ │ ├── re_docred.json │ │ │ └── re_rured.json │ │ ├── russian_super_glue/ │ │ │ ├── russian_superglue_danetqa_rubert.json │ │ │ ├── russian_superglue_lidirus_rubert.json │ │ │ ├── russian_superglue_muserc_rubert.json │ │ │ ├── russian_superglue_parus_rubert.json │ │ │ ├── russian_superglue_rcb_rubert.json │ │ │ ├── russian_superglue_rucos_rubert.json │ │ │ ├── russian_superglue_russe_rubert.json │ │ │ ├── russian_superglue_rwsd_rubert.json │ │ │ └── russian_superglue_terra_rubert.json │ │ ├── sentence_segmentation/ │ │ │ └── sentseg_dailydialog_bert.json │ │ ├── spelling_correction/ │ │ │ ├── 
brillmoore_wikitypos_en.json │ │ │ └── levenshtein_corrector_ru.json │ │ └── squad/ │ │ ├── qa_multisberquad_bert.json │ │ ├── qa_nq_psgcls_bert.json │ │ ├── qa_squad2_bert.json │ │ ├── squad_bert.json │ │ ├── squad_ru_bert.json │ │ ├── squad_ru_convers_distilrubert_2L.json │ │ └── squad_ru_convers_distilrubert_6L.json │ ├── core/ │ │ ├── __init__.py │ │ ├── commands/ │ │ │ ├── __init__.py │ │ │ ├── infer.py │ │ │ ├── train.py │ │ │ └── utils.py │ │ ├── common/ │ │ │ ├── __init__.py │ │ │ ├── aliases.py │ │ │ ├── base.py │ │ │ ├── chainer.py │ │ │ ├── cross_validation.py │ │ │ ├── errors.py │ │ │ ├── file.py │ │ │ ├── log.py │ │ │ ├── log_events.py │ │ │ ├── metrics_registry.json │ │ │ ├── metrics_registry.py │ │ │ ├── params.py │ │ │ ├── params_search.py │ │ │ ├── paths.py │ │ │ ├── prints.py │ │ │ ├── registry.json │ │ │ ├── registry.py │ │ │ └── requirements_registry.json │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── data_fitting_iterator.py │ │ │ ├── data_learning_iterator.py │ │ │ ├── dataset_reader.py │ │ │ ├── simple_vocab.py │ │ │ └── utils.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── component.py │ │ │ ├── estimator.py │ │ │ ├── nn_model.py │ │ │ ├── serializable.py │ │ │ └── torch_model.py │ │ └── trainers/ │ │ ├── __init__.py │ │ ├── fit_trainer.py │ │ ├── nn_trainer.py │ │ ├── torch_trainer.py │ │ └── utils.py │ ├── dataset_iterators/ │ │ ├── __init__.py │ │ ├── basic_classification_iterator.py │ │ ├── huggingface_dataset_iterator.py │ │ ├── morphotagger_iterator.py │ │ ├── multitask_iterator.py │ │ ├── siamese_iterator.py │ │ ├── sqlite_iterator.py │ │ ├── squad_iterator.py │ │ └── typos_iterator.py │ ├── dataset_readers/ │ │ ├── __init__.py │ │ ├── basic_classification_reader.py │ │ ├── boolqa_reader.py │ │ ├── conll2003_reader.py │ │ ├── docred_reader.py │ │ ├── faq_reader.py │ │ ├── huggingface_dataset_reader.py │ │ ├── imdb_reader.py │ │ ├── line_reader.py │ │ ├── morphotagging_dataset_reader.py │ │ ├── multitask_reader.py │ │ ├── 
odqa_reader.py │ │ ├── paraphraser_reader.py │ │ ├── rel_ranking_reader.py │ │ ├── rured_reader.py │ │ ├── sq_reader.py │ │ ├── squad_dataset_reader.py │ │ ├── typos_reader.py │ │ └── ubuntu_v2_reader.py │ ├── deep.py │ ├── download.py │ ├── metrics/ │ │ ├── __init__.py │ │ ├── accuracy.py │ │ ├── bleu.py │ │ ├── correlation.py │ │ ├── elmo_metrics.py │ │ ├── fmeasure.py │ │ ├── google_bleu.py │ │ ├── log_loss.py │ │ ├── mse.py │ │ ├── recall_at_k.py │ │ ├── record_metrics.py │ │ ├── roc_auc_score.py │ │ └── squad_metrics.py │ ├── models/ │ │ ├── __init__.py │ │ ├── api_requester/ │ │ │ ├── __init__.py │ │ │ ├── api_requester.py │ │ │ └── api_router.py │ │ ├── classifiers/ │ │ │ ├── __init__.py │ │ │ ├── cos_sim_classifier.py │ │ │ ├── dnnc_proba2labels.py │ │ │ ├── proba2labels.py │ │ │ ├── re_bert.py │ │ │ ├── torch_classification_model.py │ │ │ ├── torch_nets.py │ │ │ └── utils.py │ │ ├── doc_retrieval/ │ │ │ ├── __init__.py │ │ │ ├── bpr.py │ │ │ ├── logit_ranker.py │ │ │ ├── pop_ranker.py │ │ │ ├── tfidf_ranker.py │ │ │ └── utils.py │ │ ├── embedders/ │ │ │ ├── __init__.py │ │ │ ├── abstract_embedder.py │ │ │ ├── fasttext_embedder.py │ │ │ ├── tfidf_weighted_embedder.py │ │ │ └── transformers_embedder.py │ │ ├── entity_extraction/ │ │ │ ├── __init__.py │ │ │ ├── entity_detection_parser.py │ │ │ ├── entity_linking.py │ │ │ ├── find_word.py │ │ │ └── ner_chunker.py │ │ ├── kbqa/ │ │ │ ├── __init__.py │ │ │ ├── query_generator.py │ │ │ ├── query_generator_base.py │ │ │ ├── rel_ranking_infer.py │ │ │ ├── ru_adj_to_noun.py │ │ │ ├── sentence_answer.py │ │ │ ├── template_matcher.py │ │ │ ├── tree_to_sparql.py │ │ │ ├── type_define.py │ │ │ ├── utils.py │ │ │ └── wiki_parser.py │ │ ├── morpho_syntax_parser/ │ │ │ ├── __init__.py │ │ │ ├── dependency_decoding.py │ │ │ ├── joint.py │ │ │ ├── spacy_lemmatizer.py │ │ │ └── syntax_parsing.py │ │ ├── preprocessors/ │ │ │ ├── __init__.py │ │ │ ├── dirty_comments_preprocessor.py │ │ │ ├── dnnc_preprocessor.py │ │ │ ├── 
mask.py │ │ │ ├── multitask_preprocessor.py │ │ │ ├── ner_preprocessor.py │ │ │ ├── odqa_preprocessors.py │ │ │ ├── one_hotter.py │ │ │ ├── re_preprocessor.py │ │ │ ├── response_base_loader.py │ │ │ ├── sanitizer.py │ │ │ ├── sentseg_preprocessor.py │ │ │ ├── squad_preprocessor.py │ │ │ ├── str_lower.py │ │ │ ├── str_token_reverser.py │ │ │ ├── str_utf8_encoder.py │ │ │ ├── torch_transformers_preprocessor.py │ │ │ └── transformers_preprocessor.py │ │ ├── ranking/ │ │ │ ├── __init__.py │ │ │ └── metrics.py │ │ ├── relation_extraction/ │ │ │ ├── __init__.py │ │ │ ├── losses.py │ │ │ └── relation_extraction_bert.py │ │ ├── sklearn/ │ │ │ ├── __init__.py │ │ │ └── sklearn_component.py │ │ ├── spelling_correction/ │ │ │ ├── __init__.py │ │ │ ├── brillmoore/ │ │ │ │ ├── __init__.py │ │ │ │ └── error_model.py │ │ │ ├── electors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── kenlm_elector.py │ │ │ │ └── top1_elector.py │ │ │ └── levenshtein/ │ │ │ ├── __init__.py │ │ │ ├── levenshtein_searcher.py │ │ │ ├── searcher_component.py │ │ │ └── tabled_trie.py │ │ ├── tokenizers/ │ │ │ ├── __init__.py │ │ │ ├── lazy_tokenizer.py │ │ │ ├── nltk_moses_tokenizer.py │ │ │ ├── nltk_tokenizer.py │ │ │ ├── spacy_tokenizer.py │ │ │ ├── split_tokenizer.py │ │ │ └── utils.py │ │ ├── torch_bert/ │ │ │ ├── __init__.py │ │ │ ├── crf.py │ │ │ ├── multitask_transformer.py │ │ │ ├── torch_bert_ranker.py │ │ │ ├── torch_transformers_classifier.py │ │ │ ├── torch_transformers_el_ranker.py │ │ │ ├── torch_transformers_multiplechoice.py │ │ │ ├── torch_transformers_nll_ranking.py │ │ │ ├── torch_transformers_sequence_tagger.py │ │ │ ├── torch_transformers_squad.py │ │ │ └── torch_transformers_syntax_parser.py │ │ └── vectorizers/ │ │ ├── __init__.py │ │ └── hashing_tfidf_vectorizer.py │ ├── paramsearch.py │ ├── requirements/ │ │ ├── datasets.txt │ │ ├── dependency_decoding.txt │ │ ├── en_core_web_sm.txt │ │ ├── faiss.txt │ │ ├── fasttext.txt │ │ ├── hdt.txt │ │ ├── kenlm.txt │ │ ├── lxml.txt │ │ ├── 
opt_einsum.txt │ │ ├── protobuf.txt │ │ ├── pytorch.txt │ │ ├── rapidfuzz.txt │ │ ├── razdel.txt │ │ ├── ru_core_news_sm.txt │ │ ├── sacremoses.txt │ │ ├── sentencepiece.txt │ │ ├── slovnet.txt │ │ ├── sortedcontainers.txt │ │ ├── torchcrf.txt │ │ ├── transformers.txt │ │ ├── udapi.txt │ │ └── whapi.txt │ ├── settings.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── benchmarks/ │ │ │ ├── __init__.py │ │ │ └── benchmarks.py │ │ ├── connector/ │ │ │ ├── __init__.py │ │ │ └── dialog_logger.py │ │ ├── pip_wrapper/ │ │ │ ├── __init__.py │ │ │ └── pip_wrapper.py │ │ ├── server/ │ │ │ ├── __init__.py │ │ │ ├── metrics.py │ │ │ └── server.py │ │ ├── settings/ │ │ │ ├── __init__.py │ │ │ ├── dialog_logger_config.json │ │ │ ├── log_config.json │ │ │ └── server_config.json │ │ └── socket/ │ │ ├── __init__.py │ │ └── socket.py │ └── vocabs/ │ ├── __init__.py │ ├── typos.py │ └── wiki_sqlite.py ├── docs/ │ ├── Makefile │ ├── _static/ │ │ ├── deeppavlov.css │ │ └── my_blocks.css │ ├── _templates/ │ │ └── footer.html │ ├── apiref/ │ │ ├── core/ │ │ │ ├── commands.rst │ │ │ ├── common.rst │ │ │ ├── data.rst │ │ │ ├── models.rst │ │ │ └── trainers.rst │ │ ├── core.rst │ │ ├── dataset_iterators.rst │ │ ├── dataset_readers.rst │ │ ├── metrics.rst │ │ ├── models/ │ │ │ ├── api_requester.rst │ │ │ ├── classifiers.rst │ │ │ ├── doc_retrieval.rst │ │ │ ├── embedders.rst │ │ │ ├── entity_extraction.rst │ │ │ ├── kbqa.rst │ │ │ ├── preprocessors.rst │ │ │ ├── relation_extraction.rst │ │ │ ├── sklearn.rst │ │ │ ├── spelling_correction.rst │ │ │ ├── tokenizers.rst │ │ │ ├── torch_bert.rst │ │ │ └── vectorizers.rst │ │ ├── models.rst │ │ └── vocabs.rst │ ├── conf.py │ ├── devguides/ │ │ ├── contribution_guide.rst │ │ └── registry.rst │ ├── features/ │ │ ├── hypersearch.rst │ │ ├── models/ │ │ │ ├── KBQA.ipynb │ │ │ ├── NER.ipynb │ │ │ ├── ODQA.ipynb │ │ │ ├── SQuAD.ipynb │ │ │ ├── bert.rst │ │ │ ├── classification.ipynb │ │ │ ├── entity_extraction.ipynb │ │ │ ├── few_shot_classification.ipynb │ │ 
│ ├── morpho_tagger.ipynb │ │ │ ├── multitask_bert.rst │ │ │ ├── neural_ranking.ipynb │ │ │ ├── popularity_ranking.rst │ │ │ ├── relation_extraction.ipynb │ │ │ ├── spelling_correction.ipynb │ │ │ ├── superglue.rst │ │ │ ├── syntax_parser.ipynb │ │ │ └── tfidf_ranking.ipynb │ │ ├── overview.rst │ │ └── pretrained_vectors.rst │ ├── index.rst │ ├── integrations/ │ │ ├── aws_ec2.rst │ │ ├── rest_api.rst │ │ ├── settings.rst │ │ └── socket_api.rst │ ├── internships/ │ │ └── internships.rst │ └── intro/ │ ├── configuration.rst │ ├── installation.rst │ ├── overview.rst │ ├── python.ipynb │ └── quick_start.rst ├── requirements.txt ├── setup.py ├── tests/ │ ├── __init__.py │ ├── test_configs/ │ │ └── doc_retrieval/ │ │ ├── en_ranker_pop_wiki_test.json │ │ ├── en_ranker_tfidf_wiki_test.json │ │ └── ru_ranker_tfidf_wiki_test.json │ └── test_quick_start.py └── utils/ ├── Docker/ │ ├── Dockerfile │ ├── README.md │ ├── cmd.sh │ └── docker-compose.yml ├── __init__.py └── prepare/ ├── __init__.py ├── hashes.py ├── optimize_ipynb.py ├── registry.py └── upload.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Report on a bug you encountered title: '' labels: bug assignees: '' --- Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first. Please enter all the information below, otherwise your issue may be closed without a warning. 
**DeepPavlov version** (you can look it up by running `pip show deeppavlov`): **Python version**: **Operating system** (ubuntu linux, windows, ...): **Issue**: **Content or a name of a configuration file**: ``` ``` **Command that led to error**: ``` ``` **Error (including full traceback)**: ``` ``` ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: Ask a question url: https://forum.deeppavlov.ai/ about: If you have a different question, please ask it in the forum https://forum.deeppavlov.ai ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.md ================================================ --- name: Feature request about: Suggest a feature to improve the DeepPavlov library title: '' labels: enhancement assignees: '' --- Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first. **What problem are we trying to solve?**: ``` ``` **How can we solve it?**: ``` ``` **Are there other issues that block this solution?**: ``` ``` ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ #IDEA .idea/ #Atom IDE .ftpconfig #vscode IDE .vscode # Vim *.vim *.vimrc #GIT .git/ #Default usr dir download/ #project test /test/ .pytest_cache # project data /data/ # local dockerfiles /Dockerfile /entrypoint.sh /.dockerignore ================================================ FILE: .readthedocs.yml ================================================ # .readthedocs.yml version: 2 build: os: "ubuntu-20.04" tools: python: "3.10" formats: [] python: install: - method: pip path: . extra_requirements: - docs ================================================ FILE: CNAME ================================================ deeppavlov.ai ================================================ FILE: Jenkinsfile ================================================ node('cuda-module') { timestamps { try { stage('Clean') { sh "rm -rf .[^.] 
.??* *" } stage('Checkout') { checkout scm } stage('Setup') { env.TFHUB_CACHE_DIR="tfhub_cache" sh """ EPOCH=\$(date +%s) docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG build """ } stage('Tests') { sh """ docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py36 py37 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 py311 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0 """ currentBuild.result = 'SUCCESS' } } catch(e) { currentBuild.result = 'FAILURE' throw e } finally { sh """ docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG rm -f docker network rm \$(echo $BUILD_TAG | awk '{print tolower(\$0)}')_default """ emailext to: "\${DEFAULT_RECIPIENTS}", subject: "${env.JOB_NAME} - Build # ${currentBuild.number} - ${currentBuild.result}!", body: '${BRANCH_NAME} - ${BUILD_URL}', attachLog: true } } } ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2018 Neural Systems and Deep Learning Laboratory Moscow Institute of Physics and Technology Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include README.MD include LICENSE include requirements.txt include deeppavlov/requirements/*.txt recursive-include deeppavlov *.json recursive-include deeppavlov *.md ================================================ FILE: README.md ================================================ # DeepPavlov 1.0 [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) ![Python 3.6, 3.7, 3.8, 3.9, 3.10, 3.11](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-green.svg) [![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov) [![Static Badge](https://img.shields.io/badge/DeepPavlov%20Community-blue)](https://forum.deeppavlov.ai/) [![Static Badge](https://img.shields.io/badge/DeepPavlov%20Demo-blue)](https://demo.deeppavlov.ai/) DeepPavlov 1.0 is an open-source NLP framework built on [PyTorch](https://pytorch.org/) and [transformers](https://github.com/huggingface/transformers). DeepPavlov 1.0 is created for modular and configuration-driven development of state-of-the-art NLP models and supports a wide range of NLP model applications. DeepPavlov 1.0 is designed for practitioners with limited knowledge of NLP/ML. 
## Quick Links |name|Description| |--|--| | ⭐️ [*Demo*](https://demo.deeppavlov.ai/)|Check out our NLP models in the online demo| | 📚 [*Documentation*](http://docs.deeppavlov.ai/)|How to use DeepPavlov 1.0 and its features| | 🚀 [*Model List*](http://docs.deeppavlov.ai/en/master/features/overview.html)|Find the NLP model you need in the list of available models| | 🪐 [*Contribution Guide*](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html)|Please read the contribution guidelines before making a contribution| | 🎛 [*Issues*](https://github.com/deeppavlov/DeepPavlov/issues)|If you have an issue with DeepPavlov, please let us know| | ⏩ [*Forum*](https://forum.deeppavlov.ai/)|Please let us know if you have a problem with DeepPavlov| | 📦 [*Blogs*](https://medium.com/deeppavlov)|Read about our current development| | 🦙 [Extended colab tutorials](https://github.com/deeppavlov/dp_tutorials)|Check out the code tutorials for our models| | 🌌 [*Docker Hub*](https://hub.docker.com/u/deeppavlov/)|Check out the Docker images for rapid deployment| | 👩‍🏫 [*Feedback*](https://forms.gle/i64fowQmiVhMMC7f9)|Please leave us your feedback to make DeepPavlov better| ## Installation 0. DeepPavlov supports `Linux`, `Windows 10+` (through WSL/WSL2), `MacOS` (Big Sur+) platforms, `Python 3.6`, `3.7`, `3.8`, `3.9`, `3.10` and `3.11`. Depending on the model used, you may need from 4 to 16 GB RAM. 1. Create and activate a virtual environment: * `Linux` ``` python -m venv env source ./env/bin/activate ``` 2. Install the package inside the environment: ``` pip install deeppavlov ``` ## QuickStart There are many great pre-trained NLP models in DeepPavlov. Each model is defined by its config file.
A list of models is available on [the doc page](http://docs.deeppavlov.ai/en/master/features/overview.html) or in `deeppavlov.configs` (Python): ```python from deeppavlov import configs ``` Once you have decided on a model (+ config file), there are two ways to train, evaluate and infer it: * via [Command line interface (CLI)](#command-line-interface-cli) and * via [Python](#python). #### GPU requirements By default, DeepPavlov installs model requirements from PyPI. The PyTorch build from PyPI may not support your device's CUDA capability. To run supported DeepPavlov models on GPU you should have [CUDA](https://developer.nvidia.com/cuda-toolkit) compatible with your GPU and the [PyTorch version](deeppavlov/requirements/pytorch.txt) required by DeepPavlov models. See [docs](https://docs.deeppavlov.ai/en/master/intro/quick_start.html#using-gpu) for details. GPU with Pascal or newer architecture and 4+ GB VRAM is recommended. ### Command line interface (CLI) To get predictions from a model interactively through CLI, run ```bash python -m deeppavlov interact <config_path> [-d] [-i] ``` * `-d` downloads required data - pretrained model files and embeddings (optional). * `-i` installs model requirements (optional). You can train it in the same simple way: ```bash python -m deeppavlov train <config_path> [-d] [-i] ``` The dataset will be downloaded regardless of whether the `-d` flag is given. To train on your own data you need to modify the dataset reader path as described in the [train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config). The data format is specified in the corresponding model doc page.
There are even more actions you can perform with configs: ```bash python -m deeppavlov <action> <config_path> [-d] [-i] ``` * `<action>` can be * `install` to install model requirements (same as `-i`), * `download` to download model's data (same as `-d`), * `train` to train the model on the data specified in the config file, * `evaluate` to calculate metrics on the same dataset, * `interact` to interact via CLI, * `riseapi` to run a REST API server (see [doc](http://docs.deeppavlov.ai/en/master/integrations/rest_api.html)), * `predict` to get prediction for samples from *stdin* or from `<file_path>` if `-f <file_path>` is specified. * `<config_path>` specifies path (or name) of model's config file * `-d` downloads required data * `-i` installs model requirements ### Python To get predictions from a model interactively through Python, run ```python from deeppavlov import build_model model = build_model(<config_path>, install=True, download=True) # get predictions for 'input_text1', 'input_text2' model(['input_text1', 'input_text2']) ``` where * `install=True` installs model requirements (optional), * `download=True` downloads required data from web - pretrained model files and embeddings (optional), * `<config_path>` is model name (e.g. `'ner_ontonotes_bert_mult'`), path to the chosen model's config file (e.g. `"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json"`), or `deeppavlov.configs` attribute (e.g. `deeppavlov.configs.ner.ner_ontonotes_bert_mult` without quotation marks). You can train it in the same simple way: ```python from deeppavlov import train_model model = train_model(<config_path>, install=True, download=True) ``` To train on your own data you need to modify the dataset reader path as described in the [train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config). The data format is specified in the corresponding model doc page. 
You can also calculate metrics on the dataset specified in your config file: ```python from deeppavlov import evaluate_model model = evaluate_model(<config_path>, install=True, download=True) ``` DeepPavlov also [allows](https://docs.deeppavlov.ai/en/master/intro/python.html) you to build a model from components for inference using Python. ## License DeepPavlov is Apache 2.0-licensed. ## Citation ``` @inproceedings{savkin-etal-2024-deeppavlov, title = "DeepPavlov 1.0: Your Gateway to Advanced NLP Models Backed by Transformers and Transfer Learning", author = "Savkin Maksim and Voznyuk Anastasia and Ignatov Fedor and Korzanova Anna and Karpov Dmitry and Popov Alexander and Konovalov Vasily", editor = "Hernandez Farias and Delia Irazu and Hope Tom and Li Manling", booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", month = nov, year = "2024", address = "Miami, Florida, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.emnlp-demo.47", pages = "465--474", abstract = "We present DeepPavlov 1.0, an open-source framework for using Natural Language Processing (NLP) models by leveraging transfer learning techniques. DeepPavlov 1.0 is created for modular and configuration-driven development of state-of-the-art NLP models and supports a wide range of NLP model applications. DeepPavlov 1.0 is designed for practitioners with limited knowledge of NLP/ML. DeepPavlov is based on PyTorch and supports HuggingFace transformers. DeepPavlov is publicly released under the Apache 2.0 license and provides access to an online demo.", } ``` ================================================ FILE: _config.yml ================================================ theme: jekyll-theme-leap-day google_analytics: UA-139843736-5 include: - _static ================================================ FILE: _layouts/default.html ================================================ {% seo %}

{{ site.title | default: site.github.repository_name }}

{{ site.description | default: site.github.project_tagline }}

{{ content }}
# TODO: make better
def train_model(config: [str, Path, dict], install: bool = False,
                download: bool = False, recursive: bool = False) -> Chainer:
    """Train the pipeline described by ``config`` and return it loaded from the trained state.

    Args:
        config: model configuration, given as a string, a ``Path`` or a dict.
        install: passed through to ``train_evaluate_model_from_config``.
        download: passed through to ``train_evaluate_model_from_config``.
        recursive: passed through to ``train_evaluate_model_from_config``.

    Returns:
        The result of ``build_model(config, load_trained=True)``.
    """
    setup_options = {'install': install, 'download': download, 'recursive': recursive}
    # The return value of the training call is not used; the model is rebuilt afterwards.
    train_evaluate_model_from_config(config, **setup_options)
    trained_model = build_model(config, load_trained=True)
    return trained_model


def evaluate_model(config: [str, Path, dict], install: bool = False,
                   download: bool = False, recursive: bool = False) -> dict:
    """Run ``train_evaluate_model_from_config`` for ``config`` with training switched off.

    Args:
        config: model configuration, given as a string, a ``Path`` or a dict.
        install: passed through to ``train_evaluate_model_from_config``.
        download: passed through to ``train_evaluate_model_from_config``.
        recursive: passed through to ``train_evaluate_model_from_config``.

    Returns:
        Whatever ``train_evaluate_model_from_config`` returns when called with ``to_train=False``.
    """
    evaluation_report = train_evaluate_model_from_config(
        config,
        to_train=False,
        install=install,
        download=download,
        recursive=recursive,
    )
    return evaluation_report
from pathlib import Path
from typing import Iterator, Dict, Union, Iterable


class Struct:
    """Immutable attribute- and item-style view over a nested dict.

    Every key of ``tree`` becomes an attribute of the instance (dots in a key are
    replaced with underscores first); nested dicts are wrapped into nested ``Struct``
    objects, any other value is stored unchanged.
    """

    def __init__(self, tree: Dict[str, Union[dict, Path]]) -> None:
        """Populate the instance from ``tree``.

        Args:
            tree: mapping from names to either nested mappings or leaf values.
        """
        keys = set()
        for key, value in tree.items():
            # '.' cannot appear in an attribute name accessed with dot syntax
            key = key.replace('.', '_')
            keys.add(key)
            setattr(self, key, Struct(value) if isinstance(value, dict) else value)
        self._keys = frozenset(keys)

    def __iter__(self) -> Iterator[str]:
        """Iterate over the (normalized) key names."""
        return iter(self._keys)

    def __len__(self) -> int:
        """Return the number of keys on this level."""
        return len(self._keys)

    def keys(self) -> frozenset:
        """Return the frozen set of key names of this level.

        This is a regular method rather than a per-instance lambda so that instances
        hold no closure over themselves and can be pickled.
        """
        return self._keys

    def _asdict(self, *, to_string: bool = False) -> dict:
        """Convert the structure back into plain nested dicts.

        Args:
            to_string: if ``True``, every leaf value is converted with ``str``.

        Returns:
            A new dict; nested ``Struct`` objects are converted recursively.
        """
        res = {}
        for key in self._keys:
            value = getattr(self, key)
            if isinstance(value, Struct):
                value = value._asdict(to_string=to_string)
            elif to_string:
                value = str(value)
            res[key] = value
        return res

    def __getitem__(self, key: str) -> Union[dict, Path]:
        """Return the value stored under ``key``; nested levels are returned as dicts.

        Raises:
            KeyError: if ``key`` is not one of this level's keys.
        """
        if key not in self._keys:
            raise KeyError(key)
        item = getattr(self, key)
        if isinstance(item, Struct):
            item = item._asdict()
        return item

    def __dir__(self) -> Iterable:
        """Expose only the key names to ``dir()``."""
        return self._keys

    def _ipython_key_completions_(self) -> Iterable:
        """Return the key names (used by IPython for ``obj[<tab>`` completion)."""
        return self._keys

    def __str__(self) -> str:
        return str(self._asdict(to_string=True))

    def __repr__(self) -> str:
        return f'Struct({self._asdict()!r})'

    def _repr_pretty_(self, p, cycle):
        """method that defines ``Struct``'s pretty printing rules for iPython

        Args:
            p (IPython.lib.pretty.RepresentationPrinter): pretty printer object
            cycle (bool): is ``True`` if pretty detected a cycle
        """
        if cycle:
            p.text('Struct(...)')
        else:
            with p.group(7, 'Struct(', ')'):
                p.pretty(self._asdict())


def _build_configs_tree() -> Struct:
    """Collect every ``*.json`` file below this module's directory into a ``Struct``.

    Each directory level becomes a nested level and each file is stored under its
    stem (file name without the ``.json`` suffix) with its path as the value.
    """
    root = Path(__file__).resolve().parent

    tree = {}

    for config in root.glob('**/*.json'):
        leaf = tree
        for part in config.relative_to(root).parent.parts:
            # descend into the sub-dict for this directory, creating it on first use
            leaf = leaf.setdefault(part, {})
        leaf[config.stem] = config

    return Struct(tree)


configs = _build_configs_tree()
"class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased" } } } ================================================ FILE: deeppavlov/configs/classifiers/few_shot_roberta.json ================================================ { "chainer": { "in": ["texts", "dataset"], "in_y": ["y_true"], "pipe": [ { "class_name": "dnnc_pair_generator", "in": ["texts", "dataset"], "out": ["x", "x_support", "x_populated", "y_support"], "bidirectional": true }, { "class_name": "torch_transformers_preprocessor", "in": ["x_populated", "x_support"], "out": ["bert_features"], "vocab_file": "{BASE_MODEL}", "do_lower_case": true, "max_seq_length": 128 }, { "class_name": "torch_transformers_classifier", "main": true, "in": ["bert_features"], "out": ["simmilarity_scores"], "n_classes": 2, "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "is_binary": "{BINARY_CLASSIFICATION}" }, { "class_name": "dnnc_proba2labels", "is_binary": "{BINARY_CLASSIFICATION}", "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"], "out": ["y_pred"], "confidence_threshold": 0.0 } ], "out": ["y_pred"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10", "BINARY_CLASSIFICATION": true, "BASE_MODEL": "roberta-base" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_cola_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": 
{ "class_name": "huggingface_dataset_iterator", "features": "sentence", "label": "label", "seed": 42 }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 64, "in": ["x"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 32, "metrics": ["matthews_correlation"], "validation_patience": 10, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "cola", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/glue/glue_cola_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } 
================================================ FILE: deeppavlov/configs/classifiers/glue/glue_mnli_cased_bert_torch.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "glue", "name": "mnli", "train": "train", "valid": "validation_matched", "test": "test_matched" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["hypothesis", "premise"], "label": "label", "seed": 42 }, "chainer": { "in": ["hypothesis", "premise"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128, "in": ["hypothesis", "premise"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 64, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { 
"variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_mnli_torch_cased_bert", "BASE_MODEL": "bert-base-cased" } } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_mnli_mm_cased_bert_torch.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "glue", "name": "mnli", "train": "train", "valid": "validation_mismatched", "test": "test_mismatched" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["hypothesis", "premise"], "label": "label", "seed": 42 }, "chainer": { "in": ["hypothesis", "premise"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128, "in": ["hypothesis", "premise"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 64, "metrics": ["accuracy"], "validation_patience": 10, 
"val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_mnli_mm_torch_cased_bert", "BASE_MODEL": "bert-base-cased" } } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation_matched", "test": "test_matched" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["hypothesis", "premise"], "label": "label", "seed": 42 }, "chainer": { "in": ["hypothesis", "premise"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128, "in": ["hypothesis", "premise"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 1e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], 
"class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "mnli", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_mnli.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_mrpc_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, 
"pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 1e-06 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "epochs": 2, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "mrpc", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/glue/glue_mrpc_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_qnli_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["question", "sentence"], "label": "label", "seed": 42 }, "chainer": { "in": ["question", "sentence"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128, "in": ["question", "sentence"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": 
"{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 16, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "qnli", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/glue/glue_qnli_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_qqp_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["question1", "question2"], "label": "label", 
"use_label_name": false, "seed": 42 }, "chainer": { "in": ["question1", "question2"], "in_y": ["y_ids"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128, "in": ["question1", "question2"], "out": ["bert_features"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": 2, "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": 2, "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true } ], "out": ["y_pred_ids"] }, "train": { "batch_size": 16, "metrics": [ "f1", "accuracy" ], "validation_patience": 10, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "qqp", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/glue/glue_qqp_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_rte_cased_bert_torch.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "glue", "name": "rte", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": 
"huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 32, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_rte_torch_cased_bert", "BASE_MODEL": "bert-base-cased" } } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json 
================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 1e-06 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "epochs": 2, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large-mnli", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", 
"COMPETITION": "glue", "TASK": "rte", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_rte.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_sst2_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": "sentence", "label": "label", "seed": 42 }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "bert-base-cased", "do_lower_case": false, "max_seq_length": 64, "in": ["x"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "bert-base-cased", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 128, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, 
"evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "sst2", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/glue/glue_sst2_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_stsb_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "use_label_name": false, "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 64, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "class_name": "torch_transformers_classifier", "n_classes": 1, "return_probas": false, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y"], "out": ["y_pred"] } ], "out": ["y_pred"] }, "train": { "batch_size": 32, "metrics": [ "pearson_correlation", "spearman_correlation" ], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": 
"{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "stsb", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/glue/glue_stsb_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 192, "truncation": "longest_first", "padding": "longest", "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 1e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], 
"class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 8, "metrics": ["accuracy"], "epochs": 1, "val_every_n_batches": 250, "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "glue", "TASK": "wnli", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_wnli_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/insults_kaggle_bert.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "x": "Comment", "y": "Class", "data_path": "{DOWNLOADS_PATH}/insults_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42 }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": true, "max_seq_length": 64, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": [ "y" ], "out": [ "y_ids" ] }, { "in": [ "y_ids" ], "out": [ "y_onehot" ], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", 
"optimizer": "AdamW", "optimizer_parameters": { "lr": 1e-05 }, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "y_ids" ], "out": [ "y_pred_probas" ] }, { "in": [ "y_pred_probas" ], "out": [ "y_pred_ids" ], "class_name": "proba2labels", "max_proba": true }, { "in": [ "y_pred_ids" ], "out": [ "y_pred_labels" ], "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] }, "accuracy", "f1_macro" ], "validation_patience": 5, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ "train", "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "TRANSFORMER": "bert-base-uncased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_torch_bert" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz", "subdir": "{DOWNLOADS_PATH}" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz", "subdir": "{MODELS_PATH}/classifiers" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json ================================================ { "dataset_reader": { "class_name": "paraphraser_reader", "data_path": "{DOWNLOADS_PATH}/paraphraser_data", "do_lower_case": false }, "dataset_iterator": { "class_name": "siamese_iterator", "seed": 243, "len_valid": 500 }, "chainer": { "in": ["text_a", "text_b"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": ["text_a", "text_b"], "out": ["bert_features"] }, { "class_name": "torch_transformers_classifier", "n_classes": 2, "return_probas": 
false, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "attention_probs_keep_prob": 0.11, "hidden_keep_prob": 1.0, "optimizer": "AdamW", "optimizer_parameters": { "lr": 1.89e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 1.5, "in": [ "bert_features" ], "in_y": [ "y" ], "out": [ "predictions" ] } ], "out": ["predictions"] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ "f1", "accuracy" ], "validation_patience": 7, "val_every_n_batches": 50, "log_every_n_batches": 50, "evaluation_targets": [ "train", "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/", "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L.tar.gz", "subdir": "{MODELS_PATH}" }, { "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip", "subdir": "{DOWNLOADS_PATH}/paraphraser_data" }, { "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip", "subdir": "{DOWNLOADS_PATH}/paraphraser_data" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json ================================================ { "dataset_reader": { "class_name": "paraphraser_reader", "data_path": "{DOWNLOADS_PATH}/paraphraser_data", "do_lower_case": false }, "dataset_iterator": { "class_name": "siamese_iterator", "seed": 243, "len_valid": 500 }, "chainer": { "in": ["text_a", "text_b"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": ["text_a", "text_b"], "out": ["bert_features"] }, 
{ "class_name": "torch_transformers_classifier", "n_classes": 2, "return_probas": false, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "attention_probs_keep_prob": 0.0, "hidden_keep_prob": 0.67, "optimizer": "AdamW", "optimizer_parameters": { "lr": 7.22e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 1.5, "in": [ "bert_features" ], "in_y": [ "y" ], "out": [ "predictions" ] } ], "out": ["predictions"] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ "f1", "accuracy" ], "validation_patience": 7, "val_every_n_batches": 50, "log_every_n_batches": 50, "evaluation_targets": [ "train", "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/", "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L.tar.gz", "subdir": "{MODELS_PATH}" }, { "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip", "subdir": "{DOWNLOADS_PATH}/paraphraser_data" }, { "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip", "subdir": "{DOWNLOADS_PATH}/paraphraser_data" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/paraphraser_rubert.json ================================================ { "dataset_reader": { "class_name": "paraphraser_reader", "data_path": "{DOWNLOADS_PATH}/paraphraser_data", "do_lower_case": false }, "dataset_iterator": { "class_name": "siamese_iterator", "seed": 243, "len_valid": 500 }, "chainer": { "in": ["text_a", "text_b"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, 
"max_seq_length": 64, "in": ["text_a", "text_b"], "out": ["bert_features"] }, { "class_name": "torch_transformers_classifier", "n_classes": 2, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y"], "out": ["predictions"] } ], "out": ["predictions"] }, "train": { "batch_size": 64, "pytest_max_batches": 2, "train_metrics": ["f1", "acc"], "metrics": ["f1", "acc"], "validation_patience": 7, "val_every_n_batches": 50, "log_every_n_batches": 50, "evaluation_targets": ["valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/paraphraser_rubert_torch", "TRANSFORMER": "DeepPavlov/rubert-base-cased" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip", "subdir": "{DOWNLOADS_PATH}/paraphraser_data" }, { "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip", "subdir": "{DOWNLOADS_PATH}/paraphraser_data" }, { "url": "http://files.deeppavlov.ai/v1/classifiers/paraphraser_rubert/paraphraser_rubert_v1.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/query_pr.json ================================================ { "dataset_reader": { "class_name": "sq_reader", "data_path": "{DOWNLOADS_PATH}/query_prediction/query_prediction_eng.pickle" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42 }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": ["x"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": 
"simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 1e-05}, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ "f1_macro", "accuracy", { "name": "roc_auc", "inputs": ["y_onehot", "y_pred_probas"] } ], "validation_patience": 10, "val_every_n_batches": 100, "log_every_n_batches": 100, "show_examples": false, "evaluation_targets": ["train", "valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "TRANSFORMER": "haisongzhang/roberta-tiny-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/query_prediction_eng" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/wikidata/query_prediction_eng.tar.gz", "subdir": "{MODELS_PATH}/classifiers/query_prediction_eng" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/query_prediction_eng.pickle", "subdir": "{DOWNLOADS_PATH}/query_prediction" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/rusentiment_bert.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", 
"x": "text", "y": "label", "data_path": "{DOWNLOADS_PATH}/rusentiment/", "train": "rusentiment_random_posts.csv", "test": "rusentiment_test.csv" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42, "split_seed": 23, "field_to_split": "train", "split_fields": [ "train", "valid" ], "split_proportions": [ 0.9, 0.1 ] }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, { "in": "y_ids", "out": "y_onehot", "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer_parameters": {"lr": 1e-05}, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "y_onehot" ], "out": [ "y_pred_probas" ] }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": true }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { "batch_size": 64, "epochs": 100, "metrics": [ "f1_weighted", "f1_macro", "accuracy", { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] } ], "show_examples": false, "pytest_max_batches": 2, "validation_patience": 5, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "evaluation_targets": [ "train", "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", 
"MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_bert_torch", "TRANSFORMER": "bert-base-multilingual-cased" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/classifiers/rusentiment_bert/rusentiment_bert_torch.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/rusentiment_convers_bert.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "x": "text", "y": "label", "data_path": "{DOWNLOADS_PATH}/rusentiment/", "train": "rusentiment_random_posts.csv", "test": "rusentiment_test.csv" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42, "split_seed": 23, "field_to_split": "train", "split_fields": [ "train", "valid" ], "split_proportions": [ 0.9, 0.1 ] }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, { "in": "y_ids", "out": "y_onehot", "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer_parameters": {"lr": 1e-05}, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "y_onehot" ], "out": [ "y_pred_probas" ] }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": true }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { 
"batch_size": 64, "epochs": 100, "metrics": [ "f1_weighted", "f1_macro", "accuracy", { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] } ], "show_examples": false, "pytest_max_batches": 2, "validation_patience": 5, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "evaluation_targets": [ "train", "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_bert_torch", "TRANSFORMER": "DeepPavlov/rubert-base-cased-conversational" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/classifiers/rusentiment_convers_bert/rusentiment_convers_bert_torch.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "x": "text", "y": "label", "data_path": "{DOWNLOADS_PATH}/rusentiment/", "train": "rusentiment_random_posts.csv", "test": "rusentiment_test.csv" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42, "split_seed": 23, "field_to_split": "train", "split_fields": [ "train", "valid" ], "split_proportions": [ 0.9, 0.1 ] }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": true, "max_seq_length": 64, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, { "in": "y_ids", "out": "y_onehot", "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": 
"#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "attention_probs_keep_prob": 0.78, "hidden_keep_prob": 0.89, "optimizer": "AdamW", "optimizer_parameters": { "lr": 7.22e-05 }, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 1.5, "in": [ "bert_features" ], "in_y": [ "y_ids" ], "out": [ "y_pred_probas" ] }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": true }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ "f1_weighted", "f1_macro", "accuracy", { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] } ], "validation_patience": 5, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ "train", "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/", "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L.tar.gz", "subdir": "{MODELS_PATH}/classifiers/" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "x": "text", "y": "label", "data_path": "{DOWNLOADS_PATH}/rusentiment/", "train": "rusentiment_random_posts.csv", "test": "rusentiment_test.csv" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42, "split_seed": 23, "field_to_split": "train", 
"split_fields": [ "train", "valid" ], "split_proportions": [ 0.9, 0.1 ] }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": true, "max_seq_length": 64, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, { "in": "y_ids", "out": "y_onehot", "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "attention_probs_keep_prob": 0.78, "hidden_keep_prob": 0, "optimizer": "AdamW", "optimizer_parameters": { "lr": 4.56e-05 }, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 1.5, "in": [ "bert_features" ], "in_y": [ "y_ids" ], "out": [ "y_pred_probas" ] }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": true }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ "f1_weighted", "f1_macro", "accuracy", { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] } ], "validation_patience": 5, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ "train", "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/", "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L" }, "download": [ { "url": 
"http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L.tar.gz", "subdir": "{MODELS_PATH}/classifiers/" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/sentiment_sst_conv_bert.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "x": "text", "y": "fine_grained_label", "data_path": "{DOWNLOADS_PATH}/stanfordSentimentTreebank", "train": "train_fine_grained.csv", "valid": "valid_fine_grained.csv", "test": "test_fine_grained.csv" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42 }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, { "in": "y_ids", "out": "y_onehot", "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer_parameters": {"lr": 1e-05}, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "y_onehot" ], "out": [ "y_pred_probas" ] }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": true }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ "accuracy", { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] }, "f1_macro" ], "validation_patience": 5, "val_every_n_epochs": 1, 
"log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ "train", "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_sst_bert_torch", "TRANSFORMER": "DeepPavlov/bert-base-cased-conversational" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/stanfordSentimentTreebank.zip", "subdir": "{DOWNLOADS_PATH}" }, { "url": "http://files.deeppavlov.ai/v1/classifiers/sentiment_sst_bert/sentiment_sst_bert_torch.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/sentiment_twitter.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42 }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, { "in": "x", "out": "x_tok", "id": "my_tokenizer", "class_name": "nltk_tokenizer", "tokenizer": "wordpunct_tokenize" }, { "in": "x_tok", "out": "x_emb", "id": "my_embedder", "class_name": "fasttext", "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", "pad_zero": true }, { "in": "y_ids", "out": "y_onehot", "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "in": [ "x_emb" ], "in_y": [ "y_ids" ], "out": [ "y_pred_probas" ], "main": true, "class_name": "torch_text_classification_model", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "embedding_size": "#my_embedder.dim", "n_classes": 
"#classes_vocab.len", "kernel_sizes_cnn": [ 3, 5, 7 ], "filters_cnn": 256, "dropout_rate": 0.5, "dense_size": 64, "optimizer": "SGD", "optimizer_parameters": { "lr": 0.0001, "momentum": 0.9, "weight_decay": 0.0001 } }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": true }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" } ], "out": [ "y_pred_labels" ] }, "train": { "epochs": 100, "batch_size": 128, "metrics": [ "accuracy", "f1_macro", { "name": "roc_auc", "inputs": ["y_onehot", "y_pred_probas"] } ], "validation_patience": 5, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ "train", "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_twitter_torch" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz", "subdir": "{DOWNLOADS_PATH}" }, { "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" }, { "url": "http://files.deeppavlov.ai/v1/classifiers/sentiment_twitter/sentiment_twitter_torch.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/superglue/superglue_boolq_roberta_mnli.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "dev_percentage": 50 }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["question", "passage"], "label": "label", "seed": 42 }, "chainer": { "in": ["question", "passage"], "in_y": ["y"], "pipe": [ { 
"class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["question", "passage"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "is_binary": "{BINARY_CLASSIFICATION}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.1 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "is_binary": "{BINARY_CLASSIFICATION}", "confidence_threshold": 0.5 }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 24, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large-mnli", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "super_glue", "TASK": "boolq", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}", "BINARY_CLASSIFICATION": true }, "download": [ { "url": "http://files.deeppavlov.ai/v1/superglue/superglue_boolq_roberta_mnli.tar.gz", "subdir": "{MODEL_PATH}" } ] } 
} ================================================ FILE: deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["contexts", "choices"], "label": "label", "seed": 42 }, "chainer": { "in": ["contexts_list", "choices_list"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_multiplechoice_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 64, "in": ["contexts_list", "choices_list"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_multiplechoice", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 16, "metrics": ["accuracy"], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, 
"pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "super_glue", "TASK": "copa", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_copa_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "downsample_ratio": [1.8, 1.8, 1], "do_index_correction": false }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["idx", "query", "passage", "entities", "num_examples"], "label": "label", "seed": 42, "use_label_name": false }, "chainer": { "in": ["idx", "query", "passage", "entities", "num_examples"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 320, "in": ["query", "passage"], "out": ["bert_features"] }, { "class_name": "torch_transformers_classifier", "n_classes": 2, "return_probas": true, "is_binary": "{BINARY_CLASSIFICATION}", "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.1 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y"], "out": ["y_pred_probas"] }, { "class_name": "proba2labels", "in": ["y_pred_probas"], "out": ["y_pred_ids"], "is_binary": "{BINARY_CLASSIFICATION}", "max_proba": true }, { "class_name": "torch_record_postprocessor", "is_binary": 
"{BINARY_CLASSIFICATION}", "in": ["idx", "y", "y_pred_probas", "entities", "num_examples"], "out": ["record_examples"] } ], "out": ["y_pred_probas"] }, "train": { "batch_size": 24, "train_metrics": [ { "name": "accuracy", "inputs": ["y", "y_pred_ids"] } ], "metrics": [ { "name": "record_em_score", "inputs": ["record_examples"] }, { "name": "record_f1_score", "inputs": ["record_examples"] }, { "name": "accuracy", "inputs": ["y", "y_pred_ids"] } ], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "class_name": "torch_trainer", "evaluation_targets": ["valid"], "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "roberta-large", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "super_glue", "TASK": "record", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}", "BINARY_CLASSIFICATION": false }, "download": [ { "url": "http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_record_roberta.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/superglue/superglue_wic_bert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test" }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": 
"{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 16, "metrics": ["accuracy"], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "val_every_n_batches": 1000, "show_examples": false, "evaluation_targets": ["valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "bert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "super_glue", "TASK": "wic", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/superglue/superglue_wic_bert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/classifiers/topics_distilbert_base_uncased.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "class_sep": ";", "x": "text", "y": "topic", "data_path": "{DOWNLOADS_PATH}/dp_topics_downsampled_data/", "train" : "train.csv", "valid" : "valid.csv" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42 }, 
"chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": true, "max_seq_length": 128, "in": [ "x" ], "out": [ "bert_features" ] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": [ "y" ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": [ "y" ], "out": [ "y_ids" ] }, { "in": [ "y_ids" ], "out": [ "y_onehot" ], "class_name": "one_hotter", "id": "my_one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "multilabel": true, "optimizer": "AdamW", "optimizer_parameters": { "lr": 1e-05 }, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "y_onehot" ], "out": [ "y_pred_probas" ] }, { "in": "y_pred_probas", "out": "y_pred_ids", "class_name": "proba2labels", "max_proba": false, "confidence_threshold": 0.5 }, { "in": "y_pred_ids", "out": "y_pred_labels", "ref": "classes_vocab" }, { "ref": "my_one_hotter", "in": "y_pred_ids", "out": "y_pred_onehot" } ], "out": [ "y_pred_labels" ] }, "train": { "epochs": 100, "batch_size": 64, "metrics": [ { "name": "f1_macro", "inputs": [ "y_onehot", "y_pred_onehot" ] }, { "name": "f1_weighted", "inputs": [ "y_onehot", "y_pred_onehot" ] }, { "name": "accuracy", "inputs": [ "y", "y_pred_labels" ] }, { "name": "roc_auc", "inputs": [ "y_onehot", "y_pred_probas" ] } ], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "log_every_n_batches": 100, "show_examples": false, "evaluation_targets": [ "train", "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/logs", "class_name": "torch_trainer" }, "metadata": { "variables": { "TRANSFORMER": "distilbert-base-uncased", "ROOT_PATH": "~/.deeppavlov", 
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/topic_distilbert_base_v0" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/dp_topics_downsampled_dataset_v0.tar.gz", "subdir": "{DOWNLOADS_PATH}" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/topic_distilbert_base_v0.tar.gz", "subdir": "{MODELS_PATH}/classifiers" } ] } } ================================================ FILE: deeppavlov/configs/doc_retrieval/en_ranker_pop_wiki.json ================================================ { "dataset_reader": { "class_name": "odqa_reader", "data_path": "{DOWNLOADS_PATH}/odqa/enwiki", "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db", "dataset_format": "wiki" }, "dataset_iterator": { "class_name": "sqlite_iterator", "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" }, "chainer": { "in": ["docs"], "in_y": ["doc_ids", "doc_nums"], "out": ["pop_doc_ids"], "pipe": [ { "class_name": "hashing_tfidf_vectorizer", "id": "vectorizer", "fit_on": ["docs", "doc_ids", "doc_nums"], "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", "tokenizer": { "class_name": "stream_spacy_tokenizer", "lemmas": true, "lowercase": true, "filter_stopwords": true, "ngram_range": [1, 3] } }, { "class_name": "tfidf_ranker", "top_n": 100, "in": ["docs"], "out": ["tfidf_doc_ids", "tfidf_doc_scores"], "vectorizer": "#vectorizer" }, { "class_name": "pop_ranker", "pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json", "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib", "top_n": 100, "in": ["tfidf_doc_ids", "tfidf_doc_scores"], "out": ["pop_doc_ids", "pop_doc_scores"] } ] }, "train": { "batch_size": 10000, "evaluation_targets": [], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": 
"{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz", "subdir": "{MODELS_PATH}/odqa" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib", "subdir": "{MODELS_PATH}/odqa" } ] } } ================================================ FILE: deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json ================================================ { "dataset_reader": { "class_name": "odqa_reader", "data_path": "{DOWNLOADS_PATH}/odqa/enwiki", "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db", "dataset_format": "wiki" }, "dataset_iterator": { "class_name": "sqlite_iterator", "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" }, "chainer": { "in": ["docs"], "in_y": ["doc_ids", "doc_nums"], "out": ["tfidf_doc_ids"], "pipe": [ { "class_name": "hashing_tfidf_vectorizer", "id": "vectorizer", "fit_on": ["docs", "doc_ids", "doc_nums"], "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", "tokenizer": { "class_name": "stream_spacy_tokenizer", "lemmas": true, "lowercase": true, "filter_stopwords": true, "ngram_range": [1, 3] } }, { "class_name": "tfidf_ranker", "top_n": 100, "in": ["docs"], "out": ["tfidf_doc_ids", "tfidf_doc_scores"], "vectorizer": "#vectorizer" } ] }, "train": { "batch_size": 10000, "evaluation_targets": [], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" }, { "url": 
"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz", "subdir": "{MODELS_PATH}/odqa" } ] } } ================================================ FILE: deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json ================================================ { "dataset_reader": { "class_name": "odqa_reader", "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki", "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db", "dataset_format": "wiki" }, "dataset_iterator": { "class_name": "sqlite_iterator", "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db" }, "chainer": { "in": ["docs"], "in_y": ["doc_ids", "doc_nums"], "out": ["tfidf_doc_ids"], "pipe": [ { "class_name": "hashing_tfidf_vectorizer", "id": "vectorizer", "fit_on": ["docs", "doc_ids", "doc_nums"], "save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz", "load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz", "tokenizer": { "class_name": "stream_spacy_tokenizer", "spacy_model": "ru_core_news_sm", "lemmas": true, "lowercase": true, "filter_stopwords": true, "ngram_range": [1, 3] } }, { "class_name": "tfidf_ranker", "top_n": 100, "in": ["docs"], "out": ["tfidf_doc_ids", "tfidf_doc_scores"], "vectorizer": "#vectorizer" } ] }, "train": { "batch_size": 10000, "evaluation_targets": [], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_par_page_compr.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_tfidf_matrix_compr.tar.gz", "subdir": "{MODELS_PATH}/odqa" } ] } } ================================================ FILE: deeppavlov/configs/embedder/bert_embedder.json ================================================ { "chainer": { "in": ["texts"], "pipe": [ { "class_name": 
"transformers_bert_preprocessor", "vocab_file": "{BERT_PATH}/vocab.txt", "do_lower_case": false, "max_seq_length": 512, "in": ["texts"], "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] }, { "class_name": "transformers_bert_embedder", "bert_config_path": "{BERT_PATH}/bert_config.json", "load_path": "{BERT_PATH}", "truncate": true, "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] } ], "out": ["tokens", "word_emb", "subword_tokens", "subword_emb", "max_emb", "mean_emb", "pooler_output"] }, "train": {}, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt" }, "labels": {}, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz", "subdir": "{DOWNLOADS_PATH}/bert_models" } ] } } ================================================ FILE: deeppavlov/configs/embedder/bert_sentence_embedder.json ================================================ { "chainer": { "in": ["texts"], "pipe": [ { "class_name": "transformers_bert_preprocessor", "vocab_file": "{BERT_PATH}/vocab.txt", "do_lower_case": false, "max_seq_length": 512, "in": ["texts"], "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] }, { "class_name": "transformers_bert_embedder", "bert_config_path": "{BERT_PATH}/config.json", "load_path": "{BERT_PATH}", "truncate": false, "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] } ], "out": ["max_emb", "mean_emb", "pooler_output"] }, "train": {}, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt_v1" }, 
"labels": {}, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz", "subdir": "{DOWNLOADS_PATH}/bert_models" } ] } } ================================================ FILE: deeppavlov/configs/entity_extraction/entity_detection_en.json ================================================ { "chainer": { "in": ["x"], "pipe": [ { "class_name": "ner_chunker", "batch_size": 16, "max_seq_len" : 300, "vocab_file": "{TRANSFORMER}", "in": ["x"], "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"] }, { "thres_proba": 0.6, "o_tag": "O", "tags_file": "{NER_PATH}/tag.dict", "class_name": "entity_detection_parser", "id": "edp" }, { "class_name": "ner_chunk_model", "ner": { "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json", "overwrite": { "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"] } }, "ner_parser": "#edp", "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] } ], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", "TRANSFORMER": "bert-base-cased", "NER_PATH": "{MODELS_PATH}/ner_ontonotes_bert_torch_crf" } } } ================================================ FILE: deeppavlov/configs/entity_extraction/entity_detection_ru.json ================================================ { "chainer": { "in": ["x"], "pipe": [ { "class_name": "ner_chunker", "batch_size": 16, "max_seq_len" : 300, "vocab_file": "{TRANSFORMER}", "in": ["x"], "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"] }, { "thres_proba": 0.05, "o_tag": "O", "tags_file": "{NER_PATH}/tag.dict", "class_name": "entity_detection_parser", 
"id": "edp" }, { "class_name": "ner_chunk_model", "ner": {"config_path": "{CONFIGS_PATH}/ner/ner_rus_bert_probas.json"}, "ner_parser": "#edp", "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] } ], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "NER_PATH": "{MODELS_PATH}/wiki_ner_rus_bert" } } } ================================================ FILE: deeppavlov/configs/entity_extraction/entity_extraction_en.json ================================================ { "chainer": { "in": ["x"], "pipe": [ { "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_en.json", "in": ["x"], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] }, { "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_en.json", "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] } ], "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"] }, "metadata": { "variables": { "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" } } } ================================================ FILE: deeppavlov/configs/entity_extraction/entity_extraction_ru.json ================================================ { "chainer": { "in": ["x"], "pipe": [ { "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_ru.json", "in": ["x"], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] }, { "config_path": 
"{CONFIGS_PATH}/entity_extraction/entity_linking_ru.json", "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] } ], "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"] }, "metadata": { "variables": { "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" } } } ================================================ FILE: deeppavlov/configs/entity_extraction/entity_linking_en.json ================================================ { "chainer": { "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], "pipe": [ { "class_name": "torch_transformers_entity_ranker_infer", "id": "entity_descr_ranking", "pretrained_bert": "{TRANSFORMER}", "encoder_weights_path": "{MODELS_PATH}/entity_linking_eng/encoder.pth.tar", "bilinear_weights_path": "{MODELS_PATH}/entity_linking_eng/bilinear.pth.tar", "special_token_id": 30522, "emb_size": 512, "block_size": 8 }, { "class_name": "entity_linker", "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"], "load_path": "{DOWNLOADS_PATH}/entity_linking_eng", "entities_database_filename": "el_eng_v2.db", "entity_ranker": "#entity_descr_ranking", "rank_in_runtime": true, "num_entities_for_bert_ranking": 20, "include_mention": false, "num_entities_to_return": 3, "lemmatize": true, "use_descriptions": true, "use_connections": true, "use_tags": true, "full_paragraph": true, "return_confidences": true, "lang": "en" } ], "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "prajjwal1/bert-small" }, "download": [ { "url": 
"http://files.deeppavlov.ai/kbqa/downloads/el_db_eng_v2.tar.gz", "subdir": "{DOWNLOADS_PATH}/entity_linking_eng" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_eng.tar.gz", "subdir": "{MODELS_PATH}/entity_linking_eng" } ] } } ================================================ FILE: deeppavlov/configs/entity_extraction/entity_linking_ru.json ================================================ { "chainer": { "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], "pipe": [ { "class_name": "torch_transformers_entity_ranker_infer", "id": "entity_descr_ranking", "pretrained_bert": "{TRANSFORMER}", "encoder_weights_path": "{MODELS_PATH}/entity_linking_rus/encoder.pth.tar", "bilinear_weights_path": "{MODELS_PATH}/entity_linking_rus/bilinear.pth.tar", "special_token_id": 30522, "emb_size": 264, "block_size": 6 }, { "class_name": "entity_linker", "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"], "load_path": "{DOWNLOADS_PATH}/entity_linking_rus", "entities_database_filename": "el_rus_v2.db", "words_dict_filename": "{DOWNLOADS_PATH}/entity_linking_rus/words_dict.pickle", "ngrams_matrix_filename": "{DOWNLOADS_PATH}/entity_linking_rus/ngrams_matrix.npz", "entity_ranker": "#entity_descr_ranking", "rank_in_runtime": true, "num_entities_for_bert_ranking": 30, "use_gpu": false, "include_mention": false, "num_entities_to_return": 3, "lemmatize": true, "use_descriptions": true, "use_connections": true, "use_tags": true, "kb_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_lite.hdt", "prefixes": {"entity": ["http://we"], "rels": {"direct": "http://wpd", "no_type": "http://wp", "statement": "http://wps", "qualifier": "http://wpq" } }, "full_paragraph": true, "return_confidences": true, "lang": "ru" } ], "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] }, "metadata": { "variables": { 
"ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational-v1" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/downloads/el_files_rus_v2.tar.gz", "subdir": "{DOWNLOADS_PATH}/entity_linking_rus" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_rus.tar.gz", "subdir": "{MODELS_PATH}/entity_linking_rus" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_lite.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata" } ] } } ================================================ FILE: deeppavlov/configs/faq/fasttext_logreg.json ================================================ { "dataset_reader": { "class_name": "basic_classification_reader", "format": "json", "orient": "split", "x": "text", "y": "category", "data_path": "{DOWNLOADS_PATH}/massive/{LANGUAGE}", "train": "train.json", "valid": "dev.json", "test": "test.json" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42, "shuffle": true, "shot": 5 }, "chainer": { "in": ["text"], "in_y": ["category"], "pipe": [ { "class_name": "stream_spacy_tokenizer", "in": ["text"], "id": "my_tokenizer", "lemmas": false, "out": "token_lemmas", "spacy_model": "{SPACY_MODEL}" }, { "ref": "my_tokenizer", "in": ["token_lemmas"], "out": ["text_lem"] }, { "class_name": "fasttext", "in": ["token_lemmas"], "load_path": "{DOWNLOADS_PATH}/embeddings/fasttext/{LANGUAGE}.bin", "mean": true, "out": ["text_vector"] }, { "id": "answers_vocab", "class_name": "simple_vocab", "fit_on": "category", "save_path": "{MODEL_PATH}/cat_answers.dict", "load_path": "{MODEL_PATH}/cat_answers.dict", "in": ["category"], "out": ["y_ids"] }, { "in": ["text_vector"], "fit_on": ["text_vector", "y_ids"], "out": ["y_pred_proba"], "class_name": "sklearn_component", "main": true, "save_path": "{MODEL_PATH}/model.pkl", "load_path": "{MODEL_PATH}/model.pkl", "model_class": 
"sklearn.linear_model:LogisticRegression", "infer_method": "predict_proba", "C": 10, "penalty": "l2" }, { "in": ["y_pred_proba"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_category"], "ref": "answers_vocab" } ], "out": ["y_pred_category"] }, "train": { "evaluation_targets": ["train", "valid", "test"], "class_name": "fit_trainer", "metrics": [ { "name": "accuracy", "inputs": ["category", "y_pred_category"] } ] }, "metadata": { "variables": { "LANGUAGE": "en", "ROOT_PATH": "~/.deeppavlov", "SPACY_MODEL": "en_core_web_sm", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODEL_PATH": "{ROOT_PATH}/models/faq/{LANGUAGE}/fasttext_logreg" }, "download": [ { "url": "http://files.deeppavlov.ai/embeddings/fasttext/{LANGUAGE}.bin", "subdir": "{DOWNLOADS_PATH}/embeddings/fasttext" }, { "url": "http://files.deeppavlov.ai/datasets/massive-{LANGUAGE}.tar.gz", "subdir": "{DOWNLOADS_PATH}/massive/{LANGUAGE}" }, { "url": "http://files.deeppavlov.ai/faq/fasttext_logreg_{LANGUAGE}.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/kbqa/kbqa_cq_en.json ================================================ { "dataset_reader": { "class_name": "lcquad_reader", "question_types": ["statement_property", "right-subgraph", "simple question left", "simple question right", "left-subgraph", "rank"], "num_samples": 100, "data_path": "{DOWNLOADS_PATH}/lcquad/lcquad2.json" }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["gold_answer_ids", "gold_answer_labels", "gold_query"], "pipe": [ { "class_name": "question_sign_checker", "in": ["x"], "out": ["x_punct"] }, { "config_path": "{CONFIGS_PATH}/classifiers/query_pr.json", "in": ["x_punct"], "out": ["template_type"] }, { "class_name": "query_formatter", "query_info": {"unk_var": "?answer", "mid_var": "?ent"}, "in": ["gold_query"], "out": ["f_gold_query"] }, { "config_path": 
"{CONFIGS_PATH}/entity_extraction/entity_detection_en.json", "overwrite": { "chainer.pipe.1.make_tags_from_probas": true, "chainer.pipe.2.ner": { "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json", "overwrite": { "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"], "chainer.pipe.2.use_crf": false, "metadata.variables.TRANSFORMER": "distilbert-base-cased", "metadata.variables.MODEL_PATH": "{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0" } }, "metadata.variables.NER_PATH": "{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0" }, "in": ["x_punct", "template_type"], "out": ["entity_type_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] }, { "class_name": "entity_type_split", "in": ["entity_type_substr", "tags"], "out": ["entity_substr", "entity_tags", "type_substr"] }, { "class_name": "answer_types_extractor", "lang": "@en", "types_filename": "{DOWNLOADS_PATH}/wikidata_eng/types_labels_dict_en.pickle", "types_sets_filename": "{DOWNLOADS_PATH}/wikidata_eng/answer_types.pickle", "in": ["x_punct", "entity_substr", "tags"], "out": ["answer_types", "f_entity_substr", "f_tags"] }, { "class_name": "entity_linker", "load_path": "{DOWNLOADS_PATH}/entity_linking_eng", "entities_database_filename": "el_db_lcquad2.db", "num_entities_to_return": 7, "lemmatize": true, "use_descriptions": false, "use_connections": false, "use_tags": true, "alias_coef": 1.0, "prefixes": {"entity": ["http://we"], "rels": {"direct": "http://wpd", "no_type": "http://wp", "statement": "http://wps", "qualifier": "http://wpq" } }, "return_confidences": true, "lang": "en", "id": "entity_linker" }, { "class_name": "wiki_parser", "id": "wiki_p", "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt", "lang": "@en" }, { "class_name": "template_matcher", "id": "template_m", "num_processors": 16, "load_path": "{DOWNLOADS_PATH}/wikidata_eng", "templates_filename": "templates_eng.json" }, { "class_name": 
"rel_ranking_infer", "id": "rel_r_inf", "ranker": {"config_path": "{CONFIGS_PATH}/ranking/rel_ranking_roberta_en.json", "overwrite": {"chainer.out": ["y_pred_probas"]} }, "wiki_parser": "#wiki_p", "batch_size": 32, "rank_answers": true, "load_path": "{DOWNLOADS_PATH}/wikidata_eng", "rel_q2name_filename": "wiki_dict_properties_eng.pickle" }, { "class_name": "query_generator", "id": "query_g", "entity_linker": "#entity_linker", "template_matcher": "#template_m", "rel_ranker": "#rel_r_inf", "wiki_parser": "#wiki_p", "load_path": "{DOWNLOADS_PATH}/wikidata", "rels_in_ranking_queries_fname": "rels_in_ranking_queries.json", "sparql_queries_filename": "{DOWNLOADS_PATH}/wikidata/sparql_queries_eng.json", "entities_to_leave": 5, "rels_to_leave": 10, "return_answers": false, "map_query_str_to_kb": [["P0", "http://wd"], ["P00", "http://wl"], ["wd:", "http://we/"], ["wdt:", "http://wpd/"], [" p:", " http://wp/"], ["ps:", "http://wps/"], ["pq:", "http://wpq/"]], "kb_prefixes": {"entity": "wd:E", "rel": "wdt:R", "type": "wd:T", "type_rel": "wdt:P", "type_rels": ["P31", "P279"]}, "gold_query_info": {"unk_var": "?answer", "mid_var": "?ent"}, "in": ["x_punct", "x_punct", "template_type", "entity_substr", "type_substr", "entity_tags", "probas", "answer_types"], "out": ["cand_answers", "template_answers"] }, { "class_name": "rel_ranking_infer", "ranker": {"config_path": "{CONFIGS_PATH}/ranking/path_ranking_nll_roberta_en.json"}, "wiki_parser": "#wiki_p", "batch_size": 32, "nll_path_ranking": true, "return_elements": ["answer_ids", "queries"], "rank_answers": true, "load_path": "{DOWNLOADS_PATH}/wikidata_eng", "rel_q2name_filename": "wiki_dict_properties_eng.pickle", "in": ["x_punct", "template_type", "cand_answers", "entity_substr", "template_answers"], "out": ["answers", "answer_ids", "query"] } ], "out": ["answers", "answer_ids", "query"] }, "train": { "evaluation_targets": ["test"], "batch_size": 1, "metrics": [ { "name": "kbqa_accuracy", "inputs": ["x", "answers", "answer_ids", 
"query", "gold_answer_labels", "gold_answer_ids", "f_gold_query"] } ], "class_name": "nn_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/datasets/lcquad2.tar.gz", "subdir": "{DOWNLOADS_PATH}/lcquad" }, { "url": "http://files.deeppavlov.ai/kbqa/models/entity_type_detection_distilbert_lcquad2.0.tar.gz", "subdir": "{MODELS_PATH}/entity_type_detection_distilbert_lcquad2.0" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/queries_and_rels_lcquad2_v2.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata" }, { "url": "http://files.deeppavlov.ai/kbqa/downloads/el_db_lcquad2.tar.gz", "subdir": "{DOWNLOADS_PATH}/entity_linking_eng" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_full.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/kbqa_files_en.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata_eng" } ] } } ================================================ FILE: deeppavlov/configs/kbqa/kbqa_cq_ru.json ================================================ { "dataset_reader": { "class_name": "rubq_reader", "version": "2.0", "question_types": ["all"], "num_samples": 100, "data_path": "{DOWNLOADS_PATH}/rubq/rubq2.0.json" }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["gold_answer_ids", "gold_answer_labels", "gold_query"], "pipe": [ { "class_name": "question_sign_checker", "delete_brackets": true, "in": ["x"], "out": ["x_punct"] }, { "class_name": "query_formatter", "query_info": {"unk_var": "?answer", "mid_var": "?ent"}, "in": ["gold_query"], "out": ["f_gold_query"] }, { "class_name": "ner_chunker", "batch_size": 16, "max_seq_len" : 300, "vocab_file": "distilbert-base-multilingual-cased", "in": ["x_punct"], "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", 
"chunk_sentences"] }, { "thres_proba": 0.05, "o_tag": "O", "tags_file": "{NER_PATH}/tag.dict", "class_name": "entity_detection_parser", "ignored_tags": ["DATE", "CARDINAL", "ORDINAL", "QUANTITY", "PERCENT", "NORP"], "lang": "ru", "id": "edp" }, { "thres_proba": 0.05, "o_tag": "O", "tags_file": "{NER_PATH2}/tag.dict", "class_name": "entity_detection_parser", "ignored_tags": ["T"], "lang": "ru", "id": "edp2" }, { "class_name": "ner_chunk_model", "ner": { "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert_mult.json", "overwrite": { "chainer.pipe.2.use_crf": false, "metadata.variables.TRANSFORMER": "distilbert-base-multilingual-cased", "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"], "metadata.variables.MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult" } }, "ner_parser": "#edp", "ner2": { "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert_mult.json", "overwrite": { "chainer.pipe.2.use_crf": false, "metadata.variables.TRANSFORMER": "DeepPavlov/distilrubert-small-cased-conversational", "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"], "metadata.variables.MODEL_PATH": "{MODELS_PATH}/entity_detection_rubq" } }, "ner_parser2": "#edp2", "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"], "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] }, { "class_name": "answer_types_extractor", "lang": "@ru", "types_filename": "{DOWNLOADS_PATH}/wikidata_rus/types_labels_dict_ru.pickle", "types_sets_filename": "{DOWNLOADS_PATH}/wikidata_rus/answer_types.pickle", "in": ["x_punct", "entity_substr", "tags"], "out": ["answer_types", "f_entity_substr", "f_tags"] }, { "class_name": "entity_linker", "load_path": "{DOWNLOADS_PATH}/entity_linking_rus", "entities_database_filename": "el_db_rus.db", "words_dict_filename": "{DOWNLOADS_PATH}/entity_linking_rus/words_dict.pickle", "ngrams_matrix_filename": 
"{DOWNLOADS_PATH}/entity_linking_rus/ngrams_matrix.npz", "include_mention": false, "num_entities_to_return": 7, "lemmatize": true, "use_descriptions": false, "use_connections": true, "use_tags": true, "kb_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt", "prefixes": {"entity": ["http://we"], "rels": {"direct": "http://wpd", "no_type": "http://wp", "statement": "http://wps", "qualifier": "http://wpq" } }, "return_confidences": true, "lang": "ru", "id": "entity_linker" }, { "class_name": "wiki_parser", "id": "wiki_p", "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_full.hdt", "max_comb_num": 40000, "lang": "@ru" }, { "class_name": "slovnet_syntax_parser", "load_path": "{MODELS_PATH}/slovnet_syntax_parser", "navec_filename": "{MODELS_PATH}/slovnet_syntax_parser/navec_news_v1_1B_250K_300d_100q.tar", "syntax_parser_filename": "{MODELS_PATH}/slovnet_syntax_parser/slovnet_syntax_news_v1.tar", "tree_patterns_filename": "{MODELS_PATH}/slovnet_syntax_parser/tree_patterns.json", "id": "slovnet_parser" }, { "class_name": "ru_adj_to_noun", "freq_dict_filename": "{DOWNLOADS_PATH}/wikidata_rus/freqrnc2011.csv", "id": "adj2noun" }, { "class_name": "tree_to_sparql", "sparql_queries_filename": "{DOWNLOADS_PATH}/wikidata/sparql_queries_rus.json", "adj_to_noun": "#adj2noun", "syntax_parser": "#slovnet_parser", "kb_prefixes": {"entity": "wd:E", "rel": "wdt:R", "type": "wd:T", "type_rel": "wdt:P", "type_rels": ["P31", "P279"]}, "in": ["x_punct", "entity_substr", "tags", "entity_offsets", "entity_positions", "probas"], "out": ["x_sanitized", "query_nums", "s_entity_substr", "s_tags", "s_probas", "entities_to_link", "s_types_substr"] }, { "class_name": "template_matcher", "id": "template_m", "num_processors": 8, "load_path": "{DOWNLOADS_PATH}/wikidata_rus", "templates_filename": "templates_rus.json" }, { "class_name": "rel_ranking_infer", "id": "rel_r_inf", "ranker": {"config_path": "{CONFIGS_PATH}/ranking/rel_ranking_nll_bert_ru.json"}, "wiki_parser": "#wiki_p", 
"batch_size": 32, "nll_rel_ranking": true, "return_elements": ["answer_ids", "queries"], "load_path": "{DOWNLOADS_PATH}/wikidata_rus", "rank": false, "rel_thres": -4.0, "type_rels": ["P31", "P279"], "rel_q2name_filename": "wiki_dict_properties_full_rus.pickle" }, { "class_name": "query_generator", "id": "query_g", "entity_linker": "#entity_linker", "template_matcher": "#template_m", "rel_ranker": "#rel_r_inf", "wiki_parser": "#wiki_p", "load_path": "{DOWNLOADS_PATH}/wikidata", "rels_in_ranking_queries_fname": "rels_in_ranking_queries.json", "sparql_queries_filename": "{DOWNLOADS_PATH}/wikidata/sparql_queries_rus.json", "entities_to_leave": 9, "rels_to_leave": 10, "max_comb_num": 1000, "map_query_str_to_kb": [["P0", "http://wd"], ["P00", "http://wl"], ["wd:", "http://we/"], ["wdt:", "http://wpd/"], [" p:", " http://wp/"], ["ps:", "http://wps/"], ["pq:", "http://wpq/"]], "kb_prefixes": {"entity": "wd:E", "rel": "wdt:R", "type": "wd:T", "type_rel": "wdt:P", "type_rels": ["P31", "P279"]}, "gold_query_info": {"unk_var": "?answer", "mid_var": "?ent"}, "syntax_structure_known": true, "in": ["x_punct", "x_sanitized", "query_nums", "s_entity_substr", "s_types_substr", "s_tags", "s_probas", "answer_types", "entities_to_link"], "out": ["answers", "answer_ids", "query"] } ], "out": ["answers", "answer_ids", "query"] }, "train": { "evaluation_targets": ["test"], "batch_size": 1, "metrics": [ { "name": "kbqa_accuracy", "inputs": ["x", "answers", "answer_ids", "query", "gold_answer_labels", "gold_answer_ids", "f_gold_query"] } ], "class_name": "nn_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", "NER_PATH": "{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult", "NER_PATH2": "{MODELS_PATH}/entity_detection_rubq" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/rubq2.0.tar.gz", "subdir": "{DOWNLOADS_PATH}/rubq" }, { "url": 
"http://files.deeppavlov.ai/kbqa/downloads/el_files_rus.tar.gz", "subdir": "{DOWNLOADS_PATH}/entity_linking_rus" }, { "url": "http://files.deeppavlov.ai/kbqa/models/ner_ontonotes_torch_distilbert_mult.tar.gz", "subdir": "{MODELS_PATH}/ner_ontonotes_torch_distilbert_mult" }, { "url": "http://files.deeppavlov.ai/kbqa/models/entity_detection_rubq.tar.gz", "subdir": "{MODELS_PATH}/entity_detection_rubq" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/queries_and_rels_rus_v2.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/kbqa_files_ru.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata_rus" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/syntax_parser/slovnet_syntax_parser_v2.tar.gz", "subdir": "{MODELS_PATH}/slovnet_syntax_parser" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_full.tar.gz", "subdir": "{DOWNLOADS_PATH}/wikidata" } ] } } ================================================ FILE: deeppavlov/configs/kbqa/wiki_parser.json ================================================ { "chainer": { "in": ["parser_info", "query"], "pipe": [ { "class_name": "wiki_parser", "in": ["parser_info", "query"], "out": ["wiki_parser_output"], "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_compr.pickle", "file_format": "pickle", "lang": "@en" } ], "out": ["wiki_parser_output"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_compr.pickle", "subdir": "{DOWNLOADS_PATH}/wikidata" } ] } } ================================================ FILE: deeppavlov/configs/morpho_syntax_parser/morpho_ru_syntagrus_bert.json ================================================ { "dataset_reader": { "class_name": "morphotagger_dataset_reader", "data_path": "{DOWNLOADS_PATH}/UD2.3_source", "language": "ru_syntagrus", 
"data_types": ["train", "dev", "test"] }, "dataset_iterator": { "class_name": "morphotagger_dataset_iterator" }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "in": ["x"], "class_name": "lazy_tokenizer", "out": ["x_words"] }, { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x_words"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "min_freq": 3, "fit_on": ["y"], "in": ["y"], "out": ["y_ind"], "special_tokens": ["PAD", "BEGIN", "END"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict" }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": false, "encoder_layer_ids": [-6, -5, -4, -3, -2, -1], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [0.9, 0.999], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 10, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] }, { "in": ["x_words"], "out": ["y_lemmas"], "model": "ru_core_news_sm", "class_name": "spacy_lemmatizer" }, { "in": ["x_words", "y_pred", "y_lemmas"], "out": ["y_prettified"], "id": "prettifier", "class_name": "lemmatized_output_prettifier" } ], "out": ["y_prettified"] }, "train": { "epochs": 10, "batch_size": 32, "metrics": [ { "name": "per_token_accuracy", "inputs": ["y", "y_pred"] }, { "name": "accuracy", "inputs": ["y", "y_pred"] } ], 
"validation_patience": 15, "val_every_n_epochs": 1, "val_every_n_batches": 300, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": ["valid", "test"], "class_name": "nn_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": "{MODELS_PATH}/morpho_ru_syntagrus_torch_bert" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.3/ru_syntagrus.tar.gz", "subdir": "{DOWNLOADS_PATH}/UD2.3_source/ru_syntagrus" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/syntax_parsing/morpho_ru_syntagrus_torch_bert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/morpho_syntax_parser/ru_syntagrus_joint_parsing.json ================================================ { "chainer": { "in": ["x_words"], "pipe": [ { "id": "main", "class_name": "joint_tagger_parser", "tagger": { "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/morpho_ru_syntagrus_bert.json", "overwrite": {"chainer.pipe.6.return_string": false} }, "parser": { "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/syntax_ru_syntagrus_bert.json", "overwrite": {"chainer.pipe.6.return_string": false} }, "in": ["x_words"], "out": ["y_parsed"] } ], "out": ["y_parsed"] }, "metadata": { "variables": { "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" } } } ================================================ FILE: deeppavlov/configs/morpho_syntax_parser/syntax_ru_syntagrus_bert.json ================================================ { "dataset_reader": { "class_name": "morphotagger_dataset_reader", "data_path": "{DOWNLOADS_PATH}/UD2.3_source", "language": "ru_syntagrus", "data_types": ["train", "dev", "test"], "read_syntax": true }, "dataset_iterator": { "class_name": "morphotagger_dataset_iterator" }, "chainer": { "in": ["x"], "in_y": ["y_tags", 
"y_heads", "y_deps"], "pipe": [ { "in": ["x"], "class_name": "lazy_tokenizer", "out": ["x_words"] }, { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x_words"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "dep_vocab", "class_name": "simple_vocab", "min_freq": 3, "fit_on": ["y_deps"], "in": ["y_deps"], "out": ["y_deps_indexes"], "special_tokens": ["PAD"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/deps.dict", "load_path": "{MODEL_PATH}/deps.dict" }, { "class_name": "torch_transformers_syntax_parser", "n_deps": "#dep_vocab.len", "state_size": 384, "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "return_probas": true, "encoder_layer_ids": [6, 7, 8, 9, 10, 11], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [0.9, 0.999], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "use_birnn": true, "learning_rate_drop_patience": 10, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_heads", "y_deps_indexes"], "out": ["y_predicted_heads_probs", "y_predicted_deps_indexes"] }, { "class_name": "chu_liu_edmonds_transformer", "in": ["y_predicted_heads_probs"], "out": ["y_predicted_heads"] }, { "ref": "dep_vocab", "in": ["y_predicted_deps_indexes"], "out": ["y_predicted_deps"] }, { "in": ["x_words", "y_predicted_heads", "y_predicted_deps"], "out": ["y_prettified"], "id": "dependency_output_prettifier", "class_name": "dependency_output_prettifier" } ], "out": ["y_prettified"] }, "train": { "epochs": 10, "batch_size": 32, "metrics": [ { "name": "multitask_token_accuracy", "alias": "LAS", "inputs": ["y_deps", "y_heads", 
"y_predicted_deps", "y_predicted_heads"] }, { "name": "per_token_accuracy", "alias": "UAS", "inputs": ["y_heads", "y_predicted_heads"] } ], "validation_patience": 15, "val_every_n_batches": 300, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": ["valid", "test"], "class_name": "nn_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": "{MODELS_PATH}/syntax_parsing/rus_6layers" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.3/ru_syntagrus.tar.gz", "subdir": "{DOWNLOADS_PATH}/UD2.3_source/ru_syntagrus" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/syntax_parsing/rus_6layers.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/multitask/mt_glue.json ================================================ { "dataset_reader": { "class_name": "multitask_reader", "task_defaults": { "class_name": "huggingface_dataset_reader", "path": "glue", "train": "train", "valid": "validation" }, "tasks": { "cola": {"name": "cola"}, "sst2": {"name": "sst2"}, "qqp": {"name": "qqp"}, "mrpc": {"name": "mrpc"}, "rte": {"name": "rte"}, "mnli": { "name": "mnli", "valid": "validation_matched" }, "qnli": {"name": "qnli"}, "stsb": {"name": "stsb"} } }, "dataset_iterator": { "class_name": "multitask_iterator", "num_train_epochs": "{NUM_TRAIN_EPOCHS}", "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}", "seed": 42, "task_defaults": { "class_name": "huggingface_dataset_iterator", "label": "label", "use_label_name": false, "seed": 42 }, "tasks": { "cola": { "features": ["sentence"] }, "sst2": { "features": ["sentence"] }, "qqp": { "features": ["question1", "question2"] }, "mrpc": { "features": ["sentence1", "sentence2"] }, "rte": { "features": ["sentence1", "sentence2"] }, "mnli": { 
"features": ["premise", "hypothesis"] }, "qnli": { "features": ["question", "sentence"] }, "stsb": { "features": ["sentence1", "sentence2"] } } }, "chainer": { "in": ["x_cola", "x_sst2", "x_qqp", "x_mrpc", "x_rte", "x_mnli", "x_qnli", "x_stsb"], "in_y": ["y_cola", "y_sst2", "y_qqp", "y_mrpc", "y_rte", "y_mnli", "y_qnli", "y_stsb" ], "pipe": [ { "class_name": "multitask_pipeline_preprocessor", "possible_keys_to_extract": [0, 1], "preprocessor": "TorchTransformersPreprocessor", "vocab_file": "{BACKBONE}", "max_seq_length": 128, "do_lower_case": true, "n_task": 8, "in": ["x_cola", "x_sst2", "x_qqp", "x_mrpc", "x_rte", "x_mnli", "x_qnli", "x_stsb"], "out": [ "bert_features_cola", "bert_features_sst2", "bert_features_qqp", "bert_features_mrpc", "bert_features_rte", "bert_features_mnli", "bert_features_qnli", "bert_features_stsb" ] }, { "id": "multitask_transformer", "class_name": "multitask_transformer", "optimizer_parameters": {"lr": 2e-5}, "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}", "learning_rate_drop_patience": 2, "learning_rate_drop_div": 2.0, "return_probas": true, "backbone_model": "{BACKBONE}", "save_path": "{MODEL_PATH}", "load_path": "{MODEL_PATH}", "tasks": { "cola": { "type": "classification", "options": 2 }, "sst2": { "type": "classification", "options": 2 }, "qqp": { "type": "classification", "options": 2 }, "mrpc": { "type": "classification", "options": 2 }, "rte": { "type": "classification", "options": 2 }, "mnli": { "type": "classification", "options": 3 }, "qnli": { "type": "classification", "options": 2 }, "stsb": { "type": "regression", "options": 1 } }, "in": [ "bert_features_cola", "bert_features_sst2", "bert_features_qqp", "bert_features_mrpc", "bert_features_rte", "bert_features_mnli", "bert_features_qnli", "bert_features_stsb" ], "in_y": ["y_cola", "y_sst2", "y_qqp", "y_mrpc", "y_rte", "y_mnli", "y_qnli", "y_stsb"], "out": [ "y_cola_pred_probas", "y_sst2_pred_probas", "y_qqp_pred_probas", "y_mrpc_pred_probas", "y_rte_pred_probas", 
"y_mnli_pred_probas", "y_qnli_pred_probas", "y_stsb_pred" ] }, { "in": [ "y_cola_pred_probas", "y_sst2_pred_probas", "y_qqp_pred_probas", "y_mrpc_pred_probas", "y_rte_pred_probas", "y_mnli_pred_probas", "y_qnli_pred_probas" ], "out": [ "y_cola_pred_ids", "y_sst2_pred_ids", "y_qqp_pred_ids", "y_mrpc_pred_ids", "y_rte_pred_ids", "y_mnli_pred_ids", "y_qnli_pred_ids" ], "class_name": "proba2labels", "max_proba": true } ], "out": [ "y_cola_pred_probas", "y_sst2_pred_probas", "y_qqp_pred_probas", "y_mrpc_pred_probas", "y_rte_pred_probas", "y_mnli_pred_probas", "y_qnli_pred_probas", "y_stsb_pred", "y_cola_pred_ids", "y_sst2_pred_ids", "y_qqp_pred_ids", "y_mrpc_pred_ids", "y_rte_pred_ids", "y_mnli_pred_ids", "y_qnli_pred_ids", "y_stsb_pred" ] }, "train": { "epochs": "{NUM_TRAIN_EPOCHS}", "batch_size": 32, "metrics": [ { "name": "multitask_accuracy", "inputs": [ "y_rte", "y_mnli", "y_qnli", "y_mrpc", "y_cola", "y_sst2", "y_qqp", "y_rte_pred_ids", "y_mnli_pred_ids", "y_qnli_pred_ids", "y_mrpc_pred_ids", "y_cola_pred_ids", "y_sst2_pred_ids", "y_qqp_pred_ids" ] }, { "name": "accuracy", "alias": "accuracy_mrpc", "inputs": ["y_mrpc", "y_mrpc_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_rte", "inputs": ["y_rte", "y_rte_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_mnli", "inputs": ["y_mnli", "y_mnli_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_qnli", "inputs": ["y_qnli", "y_qnli_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_sst", "inputs": ["y_sst2", "y_sst2_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_cola", "inputs": ["y_cola", "y_cola_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_qqp", "inputs": ["y_qqp", "y_qqp_pred_ids"] }, { "name": "pearson_correlation", "alias": "pearson_correlation_stsb", "inputs": ["y_stsb", "y_stsb_pred"] }, { "name": "spearman_correlation", "alias": "spearman_correlation_stsb", "inputs": ["y_stsb", "y_stsb_pred"] } ], "validation_patience": 3, "log_every_n_epochs": 1, "show_examples": false, 
"evaluation_targets": ["valid"], "class_name": "torch_trainer", "pytest_max_batches": 2 }, "metadata": { "variables": { "BACKBONE": "bert-base-uncased", "MODELS_PATH": "~/.deeppavlov/models/glue", "MODEL_PATH": "{MODELS_PATH}/8task", "NUM_TRAIN_EPOCHS": 5, "GRADIENT_ACC_STEPS": 1 }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/multitask/glue.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/multitask/multitask_example.json ================================================ { "dataset_reader": { "class_name": "multitask_reader", "task_defaults": { "class_name": "huggingface_dataset_reader", "path": "glue", "train": "train", "valid": "validation", "test": "test" }, "tasks": { "cola": {"name": "cola"}, "rte": {"name": "rte"}, "stsb": {"name": "stsb"}, "copa": { "path": "super_glue", "name": "copa" }, "conll": { "class_name": "conll2003_reader", "use_task_defaults": false, "data_path": "{DOWNLOADS_PATH}/conll2003/", "dataset_name": "conll2003", "provide_pos": false } } }, "dataset_iterator": { "class_name": "multitask_iterator", "num_train_epochs": "{NUM_TRAIN_EPOCHS}", "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}", "seed": 42, "task_defaults": { "class_name": "huggingface_dataset_iterator", "label": "label", "use_label_name": false, "seed": 42 }, "tasks": { "cola": { "features": ["sentence"] }, "rte": { "features": ["sentence1", "sentence2"] }, "stsb": { "features": ["sentence1", "sentence2"] }, "copa": { "features": ["contexts", "choices"] }, "conll": { "class_name": "basic_classification_iterator", "seed": 42, "use_task_defaults": false } } }, "chainer": { "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"], "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_conll"], "pipe": [ { "class_name": "multitask_pipeline_preprocessor", "possible_keys_to_extract": [0, 1], "preprocessors": [ "TorchTransformersPreprocessor", "TorchTransformersPreprocessor", 
"TorchTransformersPreprocessor", "TorchTransformersMultiplechoicePreprocessor", "TorchTransformersNerPreprocessor" ], "do_lower_case": true, "n_task": 5, "vocab_file": "{BACKBONE}", "max_seq_length": 200, "max_subword_length": 15, "token_masking_prob": 0.0, "return_features": true, "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"], "out": [ "bert_features_cola", "bert_features_rte", "bert_features_stsb", "bert_features_copa", "bert_features_conll" ] }, { "id": "vocab_conll", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODELS_PATH}/tag.dict", "load_path": "{MODELS_PATH}/tag.dict", "fit_on": ["y_conll"], "in": ["y_conll"], "out": ["y_ids_conll"] }, { "id": "multitask_transformer", "class_name": "multitask_transformer", "optimizer_parameters": {"lr": 2e-5}, "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}", "learning_rate_drop_patience": 2, "learning_rate_drop_div": 2.0, "return_probas": true, "backbone_model": "{BACKBONE}", "save_path": "{MODEL_PATH}", "load_path": "{MODEL_PATH}", "tasks": { "cola": { "type": "classification", "options": 2 }, "rte": { "type": "classification", "options": 2 }, "stsb": { "type": "regression", "options": 1 }, "copa": { "type": "multiple_choice", "options": 2 }, "conll": { "type": "sequence_labeling", "options": "#vocab_conll.len" } }, "in": [ "bert_features_cola", "bert_features_rte", "bert_features_stsb", "bert_features_copa", "bert_features_conll" ], "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_ids_conll"], "out": [ "y_cola_pred_probas", "y_rte_pred_probas", "y_stsb_pred", "y_copa_pred_probas", "y_conll_pred_ids" ] }, { "in": ["y_cola_pred_probas"], "out": ["y_cola_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_rte_pred_probas"], "out": ["y_rte_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_copa_pred_probas"], "out": ["y_copa_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_conll_pred_ids"], 
"out": ["y_conll_pred_labels"], "ref": "vocab_conll" } ], "out": ["y_cola_pred_ids", "y_rte_pred_ids", "y_stsb_pred", "y_copa_pred_ids", "y_conll_pred_labels"] }, "train": { "epochs": "{NUM_TRAIN_EPOCHS}", "batch_size": 32, "metrics": [ { "name": "multitask_accuracy", "inputs": ["y_rte", "y_cola", "y_copa", "y_rte_pred_ids", "y_cola_pred_ids", "y_copa_pred_ids"] }, { "name": "ner_f1", "inputs": ["y_conll", "y_conll_pred_labels"] }, { "name": "ner_token_f1", "inputs": ["y_conll", "y_conll_pred_labels"] }, { "name": "accuracy", "alias": "accuracy_cola", "inputs": ["y_cola", "y_cola_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_rte", "inputs": ["y_rte", "y_rte_pred_ids"] }, { "name": "accuracy", "alias": "accuracy_copa", "inputs": ["y_copa", "y_copa_pred_ids"] }, { "name": "pearson_correlation", "alias": "pearson_stsb", "inputs": ["y_stsb", "y_stsb_pred"] }, { "name": "spearman_correlation", "alias": "spearman_stsb", "inputs": ["y_stsb", "y_stsb_pred"] } ], "validation_patience": 3, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["valid"], "class_name": "torch_trainer", "pytest_max_batches": 2 }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "MODELS_PATH": "{ROOT_PATH}/models/multitask_example", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "BACKBONE": "distilbert-base-uncased", "MODEL_PATH": "{MODELS_PATH}/{BACKBONE}", "NUM_TRAIN_EPOCHS": 5, "GRADIENT_ACC_STEPS": 1 }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/multitask/multitask_example.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_bert_base.json ================================================ { "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{BASE_MODEL}", "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": 
"tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] } ], "out": ["x_tokens", "y_pred"] }, "metadata": { "variables": { "BASE_MODEL": "bert-base-multilingual-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/ner/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_case_agnostic_mdistilbert.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/conll2003/", "dataset_name": "conll2003", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": 
"#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": true, "encoder_layer_ids": [-1], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [0.9, 0.999], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 20, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] } ], "out": ["x_tokens", "y_pred"] }, "train": { "epochs": 50, "batch_size": 8, "metrics": [ { "name": "ner_f1", "inputs": ["y", "y_pred"] }, { "name": "ner_token_f1", "inputs": ["y", "y_pred"] } ], "validation_patience": 100, "val_every_n_batches": 50, "log_every_n_batches": 50, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": ["test", "valid"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "~/.deeppavlov/downloads", "MODELS_PATH": "~/.deeppavlov/models", "TRANSFORMER": "distilbert-base-multilingual-cased", "MODEL_PATH": "{MODELS_PATH}/ner/ner_case_agnostic_mdistilbert" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_case_agnostic_mdistilbert.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_collection3_bert.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/collection3/", "dataset_name": "collection3", "provide_pos": false, "provide_chunk": false, "iobes": true }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", 
"vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": [ "x" ], "out": [ "x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets" ] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": [ "O" ], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], "in": [ "y" ], "out": [ "y_ind" ] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "encoder_layer_ids": [ -1 ], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", "startofword_markers" ], "in_y": [ "y_ind" ], "out": [ "y_pred_ind", "probas" ] }, { "ref": "tag_vocab", "in": [ "y_pred_ind" ], "out": [ "y_pred" ] } ], "out": [ "x_tokens", "y_pred" ] }, "train": { "epochs": 30, "batch_size": 10, "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": [ "y", "y_pred" ] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": [ "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_coll3_torch" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_bert_coll3_torch.tar.gz", "subdir": 
"{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_conll2003_bert.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/conll2003/", "dataset_name": "conll2003", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": [ "x" ], "out": [ "x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets" ] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": [ "O" ], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], "in": [ "y" ], "out": [ "y_ind" ] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": true, "encoder_layer_ids": [ -1 ], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", "startofword_markers" ], "in_y": [ "y_ind" ], "out": [ "y_pred_ind", "probas" ] }, { "ref": "tag_vocab", "in": [ "y_pred_ind" ], "out": [ "y_pred" ] } ], "out": [ "x_tokens", "y_pred" ] }, "train": { "epochs": 30, "batch_size": 16, "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": [ "y", "y_pred" ] } ], "validation_patience": 100, "val_every_n_batches": 20, 
"log_every_n_batches": 20, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": [ "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "bert-base-cased", "MODEL_PATH": "{MODELS_PATH}/ner_conll2003_torch_bert_crf" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_conll2003_bert_torch_crf.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_conll2003_deberta_crf.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/conll2003/", "dataset_name": "conll2003", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": [ "x" ], "out": [ "x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets" ] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": [ "O" ], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], "in": [ "y" ], "out": [ "y_ind" ] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": true, "encoder_layer_ids": [ -1 ], "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", "startofword_markers" ], "in_y": [ "y_ind" ], "out": [ "y_pred_ind", "probas" ] }, { "ref": "tag_vocab", "in": [ "y_pred_ind" ], "out": [ "y_pred" ] } ], 
"out": [ "x_tokens", "y_pred" ] }, "train": { "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": [ "y", "y_pred" ] } ], "evaluation_targets": [ "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "microsoft/deberta-v3-base", "MODEL_PATH": "{MODELS_PATH}/ner_conll2003_deberta_crf" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_conll2003_deberta_crf.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_ontonotes_bert.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/ontonotes/", "dataset_name": "ontonotes", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": true, "encoder_layer_ids": [-1], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [0.9, 0.999], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, 
"learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] } ], "out": ["x_tokens", "y_pred"] }, "train": { "epochs": 30, "batch_size": 60, "metrics": [ { "name": "ner_f1", "inputs": ["y", "y_pred"] }, { "name": "ner_token_f1", "inputs": ["y", "y_pred"] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": ["valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "bert-base-cased", "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_bert_torch_crf" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_torch_crf.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_ontonotes_bert_mult.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/ontonotes/", "dataset_name": "ontonotes", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": 
"{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": true, "encoder_layer_ids": [-1], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [0.9, 0.999], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] } ], "out": ["x_tokens", "y_pred"] }, "train": { "epochs": 30, "batch_size": 10, "metrics": [ { "name": "ner_f1", "inputs": ["y", "y_pred"] }, { "name": "ner_token_f1", "inputs": ["y", "y_pred"] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": ["valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "bert-base-multilingual-cased", "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_torch_bert_mult_crf" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch_crf.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_ontonotes_deberta_crf.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/ontonotes/", "dataset_name": "ontonotes", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": 
["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "use_crf": true, "encoder_layer_ids": [-1], "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] } ], "out": ["x_tokens", "y_pred"] }, "train": { "metrics": [ { "name": "ner_f1", "inputs": ["y", "y_pred"] }, { "name": "ner_token_f1", "inputs": ["y", "y_pred"] } ], "evaluation_targets": ["valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "microsoft/deberta-v3-base", "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_deberta_crf" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_ontonotes_deberta_crf.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_rus_bert.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/total_rus/", "dataset_name": "collection_rus", "provide_pos": false }, "dataset_iterator": { "class_name": 
"data_learning_iterator" }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": [ "x" ], "out": [ "x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets" ] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": [ "O" ], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], "in": [ "y" ], "out": [ "y_ind" ] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "encoder_layer_ids": [ -1 ], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", "startofword_markers" ], "in_y": [ "y_ind" ], "out": [ "y_pred_ind", "probas" ] }, { "ref": "tag_vocab", "in": [ "y_pred_ind" ], "out": [ "y_pred" ] } ], "out": [ "x_tokens", "y_pred" ] }, "train": { "epochs": 30, "batch_size": 10, "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": [ "y", "y_pred" ] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": [ "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": 
"{MODELS_PATH}/ner_rus_bert_torch" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_rus_bert_probas.json ================================================ { "dataset_reader": { "class_name": "sq_reader", "data_path": "{DOWNLOADS_PATH}/wiki_ner_rus/wikipedia_dataset.pickle" }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "encoder_layer_ids": [-1], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 1e-06, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] } ], "out": ["x_tokens", "tokens_offsets", "y_pred", "probas"] }, "train": { "epochs": 30, "batch_size": 10, "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": 
[ "y", "y_pred" ] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": [ "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": "{MODELS_PATH}/wiki_ner_rus_bert" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/rus_dream_entity_detection/wiki_ner_rus_bert.tar.gz", "subdir": "{MODELS_PATH}/wiki_ner_rus_bert" }, { "url": "http://files.deeppavlov.ai/datasets/wiki_ner_rus/wiki_ner_rus_dataset.tar.gz", "subdir": "{DOWNLOADS_PATH}/wiki_ner_rus" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/total_rus/", "dataset_name": "collection_rus", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": [ "x" ], "out": [ "x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets" ] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": [ "O" ], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], "in": [ "y" ], "out": [ "y_ind" ] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.11, "hidden_keep_prob": 0.11, "encoder_layer_ids": [ -1 
], "optimizer": "AdamW", "optimizer_parameters": { "lr": 5.45e-05, "weight_decay": 1e-06, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", "startofword_markers" ], "in_y": [ "y_ind" ], "out": [ "y_pred_ind", "probas" ] }, { "ref": "tag_vocab", "in": [ "y_pred_ind" ], "out": [ "y_pred" ] } ], "out": [ "x_tokens", "y_pred" ] }, "train": { "epochs": 30, "batch_size": 10, "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": [ "y", "y_pred" ] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "evaluation_targets": [ "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/", "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_2L", "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/total_rus/", "dataset_name": "collection_rus", "provide_pos": false }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, 
"token_masking_prob": 0.0, "in": [ "x" ], "out": [ "x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets" ] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": [ "O" ], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], "in": [ "y" ], "out": [ "y_ind" ] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.44, "hidden_keep_prob": 0.89, "encoder_layer_ids": [ -1 ], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2.78e-05, "weight_decay": 1e-06, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", "startofword_markers" ], "in_y": [ "y_ind" ], "out": [ "y_pred_ind", "probas" ] }, { "ref": "tag_vocab", "in": [ "y_pred_ind" ], "out": [ "y_pred" ] } ], "out": [ "x_tokens", "y_pred" ] }, "train": { "epochs": 30, "batch_size": 10, "metrics": [ { "name": "ner_f1", "inputs": [ "y", "y_pred" ] }, { "name": "ner_token_f1", "inputs": [ "y", "y_pred" ] } ], "validation_patience": 100, "val_every_n_batches": 20, "log_every_n_batches": 20, "show_examples": false, "evaluation_targets": [ "valid", "test" ], "tensorboard_log_dir": "{MODEL_PATH}/", "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_6L", "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L.tar.gz", "subdir": "{MODELS_PATH}" } ] } } 
================================================ FILE: deeppavlov/configs/odqa/en_odqa_infer_wiki.json ================================================ { "chainer": { "in": ["question_raw"], "out": ["answer", "answer_score", "answer_place"], "pipe": [ { "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_tfidf_wiki.json", "in": ["question_raw"], "out": ["tfidf_doc_ids"] }, { "class_name": "bpr", "load_path": "{MODELS_PATH}/bpr/eng", "query_encoder_file": "query_encoder_en.pth.tar", "bpr_index": "bpr_finetuned_nq_adv.idx", "pretrained_model": "bert-base-uncased", "top_n": 100, "in": ["question_raw"], "out": ["bpr_doc_ids"] }, { "class_name": "concat_lists", "in": ["tfidf_doc_ids", "bpr_doc_ids"], "out": ["doc_ids"] }, { "class_name": "wiki_sqlite_vocab", "in": ["doc_ids"], "out": ["doc_text"], "join_docs": false, "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" }, { "class_name": "string_multiplier", "in": ["question_raw", "doc_text"], "out":["questions"] }, { "class_name": "logit_ranker", "batch_size": 64, "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"}, "sort_noans": true, "in": ["doc_text", "questions"], "out": ["answer", "answer_score", "answer_place"] } ] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz", "subdir": "{MODELS_PATH}/bpr/eng" } ] } } ================================================ FILE: deeppavlov/configs/odqa/en_odqa_pop_infer_wiki.json ================================================ { "chainer": { "in": ["question_raw"], "out": ["answer", "answer_score", "answer_place"], "pipe": [ { "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_pop_wiki.json", "in": ["question_raw"], "out": ["tfidf_doc_ids"] }, { "class_name": "bpr", "load_path": 
"{MODELS_PATH}/bpr/eng", "query_encoder_file": "query_encoder_en.pth.tar", "bpr_index": "bpr_finetuned_nq_adv.idx", "pretrained_model": "bert-base-uncased", "top_n": 100, "in": ["question_raw"], "out": ["bpr_doc_ids"] }, { "class_name": "concat_lists", "in": ["tfidf_doc_ids", "bpr_doc_ids"], "out": ["doc_ids"] }, { "class_name": "wiki_sqlite_vocab", "in": ["doc_ids"], "out": ["doc_text"], "join_docs": false, "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" }, { "class_name": "string_multiplier", "in": ["question_raw", "doc_text"], "out":["questions"] }, { "class_name": "logit_ranker", "batch_size": 64, "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"}, "sort_noans": true, "in": ["doc_text", "questions"], "out": ["answer", "answer_score", "answer_place"] } ] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz", "subdir": "{MODELS_PATH}/bpr/eng" } ] } } ================================================ FILE: deeppavlov/configs/odqa/ru_odqa_infer_wiki.json ================================================ { "chainer": { "in": ["question_raw"], "out": ["best_answer"], "pipe": [ { "config_path": "{CONFIGS_PATH}/doc_retrieval/ru_ranker_tfidf_wiki.json", "in": ["question_raw"], "out": ["tfidf_doc_ids"] }, { "class_name": "wiki_sqlite_vocab", "in": ["tfidf_doc_ids"], "out": ["tfidf_doc_text"], "join_docs": false, "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db" }, { "class_name": "string_multiplier", "in": ["question_raw", "tfidf_doc_text"], "out":["questions"] }, { "class_name": "logit_ranker", "batch_size": 64, "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_multisberquad_bert.json"}, "sort_noans": true, "in": ["tfidf_doc_text", "questions"], "out": 
["best_answer", "best_answer_score"] } ] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" }, "download": [ ] } } ================================================ FILE: deeppavlov/configs/ranking/path_ranking_nll_roberta_en.json ================================================ { "chainer": { "in": ["question", "rels"], "pipe": [ { "class_name": "path_ranking_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "additional_special_tokens": ["<one_rel>", "</one_rel>", "<double>", "</double>", "<first_rel>", "<mid>", "</second_rel>"], "max_seq_length": 96, "in": ["question", "rels"], "out": ["bert_features"] }, { "class_name": "torch_transformers_nll_ranker", "in": ["bert_features"], "out": ["model_output"], "return_probas": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "encoder_save_path": "{MODEL_PATH}/encoder", "linear_save_path": "{MODEL_PATH}/linear", "pretrained_bert": "{TRANSFORMER}", "learning_rate_drop_patience": 5, "learning_rate_drop_div": 1.5, "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6} } ], "out": ["model_output"] }, "metadata": { "variables": { "TRANSFORMER": "haisongzhang/roberta-tiny-cased", "MODEL_PATH": "~/.deeppavlov/models/classifiers/path_ranking_nll_roberta_lcquad2" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/models/path_ranking_nll_roberta_lcquad2.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ranking/ranking_ubuntu_v2_torch_bert_uncased.json ================================================ { "dataset_reader": { "class_name": "ubuntu_v2_reader", "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data" }, "dataset_iterator": { "class_name": "siamese_iterator", "seed": 243 }, "chainer": { "in": [ "x" ], "in_y": [ "y" ], "pipe": [ { "class_name": "torch_bert_ranker_preprocessor", "vocab_file": "bert-base-uncased", "do_lower_case":
true, "max_seq_length": 128, "in": [ "x" ], "out": [ "bert_features" ] }, { "class_name": "torch_bert_ranker", "pretrained_bert": "bert-base-uncased", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-5, "weight_decay": 1e-2, "betas": [ 0.9, 0.999 ], "eps": 1e-6 }, "clip_norm": 1.0, "in": [ "bert_features" ], "in_y": [ "y" ], "out": [ "predictions" ] } ], "out": [ "predictions" ] }, "train": { "batch_size": 32, "pytest_max_batches": 2, "train_metrics": [], "metrics": [ "r@1", "r@2", "r@5" ], "validation_patience": 1, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "evaluation_targets": [ "valid", "test" ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_torch_bert_model" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz", "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_torch_bert_model_v2.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ranking/rel_ranking_nll_bert_ru.json ================================================ { "chainer": { "in": ["question", "rels"], "pipe": [ { "class_name": "path_ranking_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 96, "in": ["question", "rels"], "out": ["bert_features"] }, { "class_name": "torch_transformers_nll_ranker", "in": ["bert_features"], "out": ["model_output"], "return_probas": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "encoder_save_path": "{MODEL_PATH}/encoder", "linear_save_path": "{MODEL_PATH}/linear", "pretrained_bert": "{TRANSFORMER}", "learning_rate_drop_patience": 4, "learning_rate_drop_div": 1.5, "optimizer_parameters": 
{"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6} } ], "out": ["model_output"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_nll_bert_ru" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/models/rel_ranking_nll_bert_ru.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/ranking/rel_ranking_roberta_en.json ================================================ { "dataset_reader": { "class_name": "sq_reader", "data_path": "{DOWNLOADS_PATH}/rel_ranking_eng/lcquad_one_rel_ranking.json" }, "dataset_iterator": { "class_name": "basic_classification_iterator", "seed": 42 }, "chainer": { "in": ["question", "rel_list"], "in_y": ["y"], "pipe": [ { "class_name": "rel_ranking_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 64, "in": ["question", "rel_list"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 1e-05}, "learning_rate_drop_patience": 5, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": 
["y_pred_labels"] }, "train": { "epochs": 3, "batch_size": 30, "metrics": [ "accuracy", "f1_macro" ], "validation_patience": 10, "val_every_n_batches": 100, "log_every_n_batches": 100, "show_examples": false, "evaluation_targets": ["valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "haisongzhang/roberta-tiny-cased", "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_roberta_en" }, "download": [ { "url": "http://files.deeppavlov.ai/kbqa/models/rel_ranking_roberta_en.tar.gz", "subdir": "{MODEL_PATH}" }, { "url": "http://files.deeppavlov.ai/kbqa/wikidata/lcquad_rel_ranking.pickle", "subdir": "{DOWNLOADS_PATH}/rel_ranking_eng" } ] } } ================================================ FILE: deeppavlov/configs/regressors/translation_ranker.json ================================================ { "metadata": { "variables": { "BASE_MODEL": "cointegrated/LaBSE-en-ru", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/ranker_labse", "SEED": 42 }, "download": [ { "url": "http://files.deeppavlov.ai/v1/tmp/translation_ranker.tar.gz", "subdir": "{MODELS_PATH}" } ] }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": [ "source", "hypothesis" ], "label": "agg_score", "seed": "{SEED}", "use_label_name": false }, "chainer": { "in": [ "source", "hypothesis" ], "in_y": [ "score" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": [ "source", "hypothesis" ], "out": [ "bert_features" ] }, { "class_name": "torch_transformers_classifier", "n_classes": 1, "return_probas": false, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", 
"optimizer_parameters": { "lr": 2e-06, "weight_decay": 0.1 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "score" ], "out": [ "pred_score" ] } ], "out": [ "pred_score" ] }, "train": { "batch_size": 32, "metrics": [ { "name": "mean_squared_error", "inputs": [ "score", "pred_score" ] } ], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "class_name": "torch_trainer", "evaluation_targets": [ "train", "valid" ], "metric_optimization": "minimize", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 } } ================================================ FILE: deeppavlov/configs/relation_extraction/re_docred.json ================================================ { "dataset_reader": { "class_name": "docred_reader", "data_path": "{DOWNLOADS_PATH}/docred/", "rel2id_path": "{DOWNLOADS_PATH}/docred/meta/rel2id.json", "rel_info_path": "{DOWNLOADS_PATH}/docred/rel_info.json", "valid_test_data_size": 150 }, "dataset_iterator": { "class_name": "basic_classification_iterator" }, "chainer": { "in": ["tokens", "entity_pos", "entity_tags"], "in_y": ["y_ids"], "pipe": [ { "in": ["tokens", "entity_pos", "entity_tags"], "out": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags", "nf_samples"], "class_name": "re_preprocessor", "vocab_file": "bert-base-cased", "default_tag": "PER" }, { "class_name": "re_classifier", "in": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags"], "in_y": ["y_ids"], "out": ["model_output"], "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer_parameters": {"lr": 5e-5, "weight_decay": 0.01, "eps": 1e-6}, "n_classes": 97, "num_ner_tags": 6, "pretrained_bert": "bert-base-cased", "return_probas": true }, { "class_name": "re_postprocessor", "rel2id_path": "{DOWNLOADS_PATH}/docred/meta/rel2id.json", "rel2label_path": "{DOWNLOADS_PATH}/docred/rel_info.json", "in": 
["model_output", "nf_samples"], "out": ["wikidata_relation_id", "relation_name"] } ], "out": ["wikidata_relation_id", "relation_name"] }, "train": { "epochs": 50, "batch_size": 30, "log_every_n_batches": 100, "train_metrics": ["f1_weighted", "acc"], "evaluation_targets": ["valid", "train"], "metrics": ["f1_weighted", "acc"], "validation_patience": 50, "val_every_n_batches": 200, "show_examples": false, "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/re_docred" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/docred.tar.gz", "subdir": "{DOWNLOADS_PATH}/docred" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/re_docred_model_v1.tar.gz", "subdir": "{MODELS_PATH}/re_docred" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/rel2label.json", "subdir": "{DOWNLOADS_PATH}/docred" } ] } } ================================================ FILE: deeppavlov/configs/relation_extraction/re_rured.json ================================================ { "dataset_reader": { "class_name": "rured_reader", "data_path": "{DOWNLOADS_PATH}/rured/" }, "dataset_iterator": { "class_name": "basic_classification_iterator" }, "chainer": { "in": ["tokens", "entity_pos", "entity_tags"], "in_y": ["y_ids"], "pipe": [ { "in": ["tokens", "entity_pos", "entity_tags"], "out": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags", "nf_samples"], "class_name": "re_preprocessor", "ner_tags": ["WORK_OF_ART", "NORP", "GROUP", "LAW", "NATIONALITY", "EVENT", "DATE", "CURRENCY", "GPE", "QUANTITY", "FAMILY", "ORDINAL", "RELIGION", "CITY", "MONEY", "AGE", "LOCATION", "PERCENT", "BOROUGH", "STREET", "PERSON", "REGION", "COUNTRY", "PROFESSION", "ORGANIZATION", "FAC", "CARDINAL", "PRODUCT", "TIME"], "max_seq_length": 512, "vocab_file": "{TRANSFORMER}", 
"default_tag": "PERSON" }, { "class_name": "re_classifier", "in": ["input_ids", "attention_mask", "upd_entity_pos", "upd_entity_tags"], "in_y": ["y_ids"], "out": ["model_output"], "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer_parameters": {"lr": 5e-5, "weight_decay": 0.01, "eps": 1e-6}, "n_classes": 30, "num_ner_tags": 29, "pretrained_bert": "{TRANSFORMER}" }, { "class_name": "re_postprocessor", "rel2id_path": "{DOWNLOADS_PATH}/rured/rel2id.json", "rel2label_path": "{DOWNLOADS_PATH}/rured/rel2label.json", "in": ["model_output", "nf_samples"], "out": ["wikidata_relation_id", "relation_name"] } ], "out": ["wikidata_relation_id", "relation_name"] }, "train": { "epochs": 50, "batch_size": 16, "train_metrics": ["acc"], "metrics": ["acc"], "validation_patience": 50, "val_every_n_batches": 100, "log_every_n_batches": 100, "evaluation_targets": ["valid", "train"], "show_examples": false, "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "DeepPavlov/rubert-base-cased", "MODEL_PATH": "{MODELS_PATH}/re_rured" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/rured.tar.gz", "subdir": "{DOWNLOADS_PATH}/rured" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_extraction/re_rured_model_v1.tar.gz", "subdir": "{MODELS_PATH}/re_rured" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_danetqa_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/DaNetQA", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", 
"features": ["question", "passage"], "label": "label", "seed": 42 }, "chainer": { "in": ["question", "passage"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 512, "in": ["question", "passage"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "is_binary": "{BINARY_CLASSIFICATION}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "is_binary": "{BINARY_CLASSIFICATION}", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "BINARY_CLASSIFICATION": false, "TASK": "danetqa", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": 
"http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_danetqa_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_lidirus_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/LiDiRus", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "is_binary": "{BINARY_CLASSIFICATION}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05, "weight_decay": 0.1}, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "is_binary": "{BINARY_CLASSIFICATION}", "confidence_threshold": 0.5 }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { 
"batch_size": 16, "metrics": ["matthews_correlation"], "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["test"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "BINARY_CLASSIFICATION": false, "TASK": "lidirus", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/terra/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_terra_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_muserc_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/MuSeRC", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["context", "answer", "idx"], "label": "label", "seed": 42 }, "chainer": { "in": ["context", "answer", "idx"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 512, "in": ["context", "answer"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": 
"torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "is_binary": "{BINARY_CLASSIFICATION}", "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "is_binary": "{BINARY_CLASSIFICATION}", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 8, "metrics": ["roc_auc","f1"], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "BINARY_CLASSIFICATION": false, "TASK": "muserc", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_muserc_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_parus_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/PARus", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["contexts", "choices"], "label": 
"label", "seed": 42 }, "chainer": { "in": ["contexts_list", "choices_list"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_multiplechoice_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["contexts_list", "choices_list"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_multiplechoice", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 4e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "validation_patience": 10, "epochs": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "TASK": "parus", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_parus_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } 
================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_rcb_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RCB", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["premise", "hypothesis"], "label": "label", "seed": 42 }, "chainer": { "in": ["premise", "hypothesis"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["premise", "hypothesis"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 4e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy", "f1_macro"], "validation_patience": 10, "epochs": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": 
"{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "TASK": "rcb", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rcb_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_rucos_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RuCoS", "ignore_verifications": true, "downsample_ratio": [1.8, 1.8, 1], "do_index_correction": false }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["idx", "query", "passage", "entities", "num_examples"], "label": "label", "use_label_name": false }, "chainer": { "in": ["idx", "query", "passage", "entities", "num_examples"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 320, "in": ["query", "passage"], "out": ["bert_features"] }, { "class_name": "torch_transformers_classifier", "n_classes": 2, "return_probas": true, "is_binary": "{BINARY_CLASSIFICATION}", "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y"], "out": ["y_pred_probas"] }, { "class_name": "proba2labels", "in": ["y_pred_probas"], "out": ["y_pred_ids"], "is_binary": "{BINARY_CLASSIFICATION}", "max_proba": true }, { "class_name": 
"torch_record_postprocessor", "is_binary": "{BINARY_CLASSIFICATION}", "in": ["idx", "y", "y_pred_probas", "entities", "num_examples"], "out": ["record_examples"] } ], "out": ["y_pred_probas"] }, "train": { "batch_size": 12, "train_metrics": [ { "name": "accuracy", "inputs": ["y", "y_pred_ids"] } ], "metrics": [ { "name": "record_em_score", "inputs": ["record_examples"] }, { "name": "record_f1_score", "inputs": ["record_examples"] }, { "name": "accuracy", "inputs": ["y", "y_pred_ids"] } ], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "class_name": "torch_trainer", "evaluation_targets": ["valid"], "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "BINARY_CLASSIFICATION": false, "TASK": "rucos", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rucos_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_russe_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RUSSE", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["sentence1", "sentence2"], "label": "label", "seed": 42 }, "chainer": { "in": ["sentence1", "sentence2"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", 
"do_lower_case": false, "max_seq_length": 256, "in": ["sentence1", "sentence2"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "val_every_n_batches": 1000, "show_examples": false, "evaluation_targets": ["valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "TASK": "russe", "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_russe_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_rwsd_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", 
"path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/RWSD", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["text", "answer"], "label": "label", "seed": 42 }, "chainer": { "in": ["text", "answer"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": true, "max_seq_length": 256, "in": ["text", "answer"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "Adam", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, "metrics": ["accuracy"], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "COMPETITION": "russian_super_glue", "TASK": "rwsd", 
"MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_rwsd_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/russian_super_glue/russian_superglue_terra_rubert.json ================================================ { "dataset_reader": { "class_name": "huggingface_dataset_reader", "path": "{COMPETITION}", "name": "{TASK}", "train": "train", "valid": "validation", "test": "test", "data_url": "http://files.deeppavlov.ai/datasets/russian_super_glue/TERRa", "ignore_verifications": true }, "dataset_iterator": { "class_name": "huggingface_dataset_iterator", "features": ["premise", "hypothesis"], "label": "label", "seed": 42 }, "chainer": { "in": ["premise", "hypothesis"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_preprocessor", "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 256, "in": ["premise", "hypothesis"], "out": ["bert_features"] }, { "id": "classes_vocab", "class_name": "simple_vocab", "fit_on": ["y"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", "in": ["y"], "out": ["y_ids"] }, { "in": ["y_ids"], "out": ["y_onehot"], "class_name": "one_hotter", "depth": "#classes_vocab.len", "single_vector": true }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "is_binary": "{BINARY_CLASSIFICATION}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": {"lr": 2e-05}, "in": ["bert_features"], "in_y": ["y_ids"], "out": ["y_pred_probas"] }, { "in": ["y_pred_probas"], "out": ["y_pred_ids"], "class_name": "proba2labels", "max_proba": true }, { "in": ["y_pred_ids"], "out": ["y_pred_labels"], "ref": "classes_vocab" } ], "out": ["y_pred_labels"] }, "train": { "batch_size": 4, 
"metrics": ["accuracy"], "epochs": 10, "validation_patience": 10, "val_every_n_epochs": 1, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": ["train", "valid"], "class_name": "torch_trainer", "tensorboard_log_dir": "{MODEL_PATH}/", "pytest_max_batches": 2, "pytest_batch_size": 2 }, "metadata": { "variables": { "BASE_MODEL": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "COMPETITION": "russian_super_glue", "TASK": "terra", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "BINARY_CLASSIFICATION": false, "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/russian_super_glue/russian_superglue_terra_rubert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/sentence_segmentation/sentseg_dailydialog_bert.json ================================================ { "dataset_reader": { "class_name": "conll2003_reader", "data_path": "{DOWNLOADS_PATH}/dailydialog/", "dataset_name": "dailydialog" }, "dataset_iterator": { "class_name": "data_learning_iterator" }, "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "torch_transformers_ner_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": true, "max_seq_length": 512, "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] }, { "id": "tag_vocab", "class_name": "simple_vocab", "unk_token": ["O"], "pad_with_zeros": true, "save_path": "{MODEL_PATH}/tag.dict", "load_path": "{MODEL_PATH}/tag.dict", "fit_on": ["y"], "in": ["y"], "out": ["y_ind"] }, { "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "encoder_layer_ids": [-1], "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, 
"weight_decay": 1e-06, "betas": [0.9, 0.999], "eps": 1e-06 }, "clip_norm": 1.0, "min_learning_rate": 1e-07, "learning_rate_drop_patience": 6, "learning_rate_drop_div": 1.5, "load_before_drop": true, "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind", "probas"] }, { "ref": "tag_vocab", "in": ["y_pred_ind"], "out": ["y_pred"] }, { "in": ["x_tokens", "y_pred"], "out": "punctuated_sents", "class_name": "sentseg_restore_sent" } ], "out": ["x_tokens", "punctuated_sents"] }, "train": { "epochs": 30, "batch_size": 30, "metrics": [ { "name": "ner_f1", "inputs": ["y", "y_pred"] }, { "name": "ner_token_f1", "inputs": ["y", "y_pred"] } ], "validation_patience": 20, "val_every_n_batches": 100, "log_every_n_batches": 100, "show_examples": false, "pytest_max_batches": 2, "pytest_batch_size": 8, "evaluation_targets": ["valid", "test"], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "TRANSFORMER": "bert-base-uncased", "MODEL_PATH": "{MODELS_PATH}/sentseg_dailydialog_bert" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/sentseg_dailydialog_bert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json ================================================ { "dataset_reader": { "class_name": "typos_wikipedia_reader", "data_path": "{DOWNLOADS_PATH}" }, "dataset_iterator": { "class_name": "typos_iterator", "test_ratio": 0.05 }, "chainer":{ "in": ["x"], "in_y": ["y"], "pipe": [ { "class_name": "str_lower", "id": "lower", "in": ["x"], "out": ["x_lower"] }, { "class_name": "nltk_moses_tokenizer", "id": "tokenizer", "in": ["x_lower"], "out": ["x_tokens"] }, { "ref": "tokenizer", "in": ["y"], "out": ["y_tokens"] }, { 
"fit_on": ["x_tokens", "y_tokens"], "in": ["x_tokens"], "out": ["tokens_candidates"], "class_name": "spelling_error_model", "window": 1, "candidates_count": 4, "dictionary": { "class_name": "wikitionary_100K_vocab", "data_dir": "{DOWNLOADS_PATH}/vocabs" }, "save_path": "{MODELS_PATH}/error_model/error_model.tsv" }, { "class_name": "kenlm_elector", "in": ["tokens_candidates"], "out": ["y_predicted_tokens"], "load_path": "{DOWNLOADS_PATH}/language_models/en_wiki_no_punkt.arpa.binary" }, { "ref": "tokenizer", "in": ["y_predicted_tokens"], "out": ["y_predicted"] } ], "out": ["y_predicted"] }, "train": { "evaluation_targets": ["test"], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz", "subdir": "{MODELS_PATH}" }, { "url": "http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz", "subdir": "{DOWNLOADS_PATH}/language_models" }, { "url": "http://files.deeppavlov.ai/datasets/wiktionary/wikipedia_100K_vocab.tar.gz", "subdir": "{DOWNLOADS_PATH}/vocabs" } ] } } ================================================ FILE: deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json ================================================ { "chainer":{ "in": ["x"], "pipe": [ { "class_name": "str_lower", "id": "lower", "in": ["x"], "out": ["x_lower"] }, { "class_name": "nltk_moses_tokenizer", "id": "tokenizer", "in": ["x_lower"], "out": ["x_tokens"] }, { "id": "vocab", "class_name": "simple_vocab", "save_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict", "load_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict" }, { "in": ["x_tokens"], "out": ["tokens_candidates"], "class_name": "spelling_levenshtein", "words": "#vocab.keys()" }, { "class_name": "kenlm_elector", "in": ["tokens_candidates"], "out": ["y_predicted_tokens"], "load_path": 
"{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary" }, { "ref": "tokenizer", "in": ["y_predicted_tokens"], "out": ["y_predicted"] } ], "out": ["y_predicted"] }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz", "subdir": "{DOWNLOADS_PATH}/vocabs" }, { "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz", "subdir": "{DOWNLOADS_PATH}/language_models" } ] } } ================================================ FILE: deeppavlov/configs/squad/qa_multisberquad_bert.json ================================================ { "dataset_reader": { "class_name": "multi_squad_dataset_reader", "dataset": "MultiSQuADRuRetrClean", "url": "http://files.deeppavlov.ai/datasets/multi_squad_ru_retr_clean.tar.gz", "data_path": "{DOWNLOADS_PATH}/multi_squad_ru_retr_clean/" }, "dataset_iterator": { "class_name": "multi_squad_retr_iterator", "seed": 1337, "shuffle": false, "with_answer_rate": 0.666 }, "chainer": { "in": ["context_raw", "question_raw"], "in_y": ["ans_raw", "ans_raw_start"], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": "{LOWERCASE}", "max_seq_length": 384, "in": ["question_raw", "context_raw"], "out": ["bert_features", "subtokens", "split_context"] }, { "class_name": "squad_bert_mapping", "do_lower_case": "{LOWERCASE}", "in": ["split_context", "bert_features", "subtokens"], "out": ["subtok2chars", "char2subtoks"] }, { "class_name": "squad_bert_ans_preprocessor", "do_lower_case": "{LOWERCASE}", "in": ["ans_raw", "ans_raw_start", "char2subtoks"], "out": ["ans", "ans_start", "ans_end"] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", 
"optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.01, "betas": [0.9, 0.999], "eps": 1e-06 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "in_y": ["ans_start", "ans_end"], "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"] }, { "class_name": "squad_bert_ans_postprocessor", "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"], "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] } ], "out": ["ans_predicted", "ans_start_predicted", "scores"] }, "train": { "show_examples": false, "evaluation_targets": ["valid"], "log_every_n_batches": 250, "val_every_n_batches": 500, "batch_size": 20, "valid_batch_size": 64, "validation_patience": 10, "metrics": [ { "name": "squad_v1_f1", "inputs": ["ans", "ans_predicted"] }, { "name": "squad_v1_em", "inputs": ["ans", "ans_predicted"] }, { "name": "squad_v2_f1", "inputs": ["ans", "ans_predicted"] }, { "name": "squad_v2_em", "inputs": ["ans", "ans_predicted"] } ], "class_name": "torch_trainer" }, "metadata": { "variables": { "LOWERCASE": false, "TRANSFORMER": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/multi_squad_ru_torch_bert_retr_noans/{TRANSFORMER}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/squad/multi_squad_ru_torch_bert_retr_noans.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/squad/qa_nq_psgcls_bert.json ================================================ { "chainer": { "in": ["context_raw", "question_raw"], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": "{LOWERCASE}", "max_seq_length": 384, "in": ["question_raw", "context_raw"], "out": ["bert_features", "subtokens", "split_context"] }, { "class_name": 
"squad_bert_mapping", "do_lower_case": "{LOWERCASE}", "in": ["split_context", "bert_features", "subtokens"], "out": ["subtok2chars", "char2subtoks"] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "torch_seed": 1, "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.01, "betas": [0.9, 0.999], "eps": 1e-06 }, "random_seed": 1, "psg_cls": true, "learning_rate_drop_patience": 2, "learning_rate_drop_div": 2.0, "in": ["bert_features"], "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"] }, { "class_name": "squad_bert_ans_postprocessor", "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"], "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] } ], "out": ["ans_predicted", "ans_start_predicted", "scores"] }, "metadata": { "variables": { "LOWERCASE": true, "TRANSFORMER": "bert-base-uncased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/passage_reader_classifier_eng" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/nq_psgcls_bert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/squad/qa_squad2_bert.json ================================================ { "dataset_reader": { "class_name": "squad_dataset_reader", "dataset": "SQuAD2.0", "data_path": "{DOWNLOADS_PATH}/squad2/" }, "dataset_iterator": { "class_name": "squad_iterator", "seed": 1337, "shuffle": true }, "chainer": { "in": [ "context_raw", "question_raw" ], "in_y": [ "ans_raw", "ans_raw_start" ], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": "{LOWERCASE}", "max_seq_length": 384, "in": [ "question_raw", "context_raw" ], "out": [ "bert_features", 
"subtokens", "split_context" ] }, { "class_name": "squad_bert_mapping", "do_lower_case": "{LOWERCASE}", "in": [ "split_context", "bert_features", "subtokens" ], "out": [ "subtok2chars", "char2subtoks" ] }, { "class_name": "squad_bert_ans_preprocessor", "do_lower_case": "{LOWERCASE}", "in": [ "ans_raw", "ans_raw_start", "char2subtoks" ], "out": [ "ans", "ans_start", "ans_end" ] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "torch_seed": 1, "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.01, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "random_seed": 1, "learning_rate_drop_patience": 2, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "ans_start", "ans_end" ], "out": [ "ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds" ] }, { "class_name": "squad_bert_ans_postprocessor", "in": [ "ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds" ], "out": [ "ans_predicted", "ans_start_predicted", "ans_end_predicted" ] } ], "out": [ "ans_predicted", "ans_start_predicted", "scores" ] }, "train": { "show_examples": false, "evaluation_targets": [ "valid" ], "log_every_n_batches": 50, "val_every_n_batches": 500, "batch_size": 20, "valid_batch_size": 32, "pytest_max_batches": 2, "pytest_batch_size": 5, "validation_patience": 10, "metrics": [ { "name": "squad_v1_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v1_em", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v2_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v2_em", "inputs": [ "ans", "ans_predicted" ] } ], "class_name": "torch_trainer" }, "metadata": { "variables": { "LOWERCASE": false, "TRANSFORMER": "bert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH":
"{MODELS_PATH}/squad2_bert" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/squad/squad2_bert.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/squad/squad_bert.json ================================================ { "dataset_reader": { "class_name": "squad_dataset_reader", "data_path": "{DOWNLOADS_PATH}/squad/" }, "dataset_iterator": { "class_name": "squad_iterator", "seed": 1337, "shuffle": true }, "chainer": { "in": ["context_raw", "question_raw"], "in_y": ["ans_raw", "ans_raw_start"], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": "{LOWERCASE}", "max_seq_length": 384, "in": ["question_raw", "context_raw"], "out": ["bert_features", "subtokens", "split_context"] }, { "class_name": "squad_bert_mapping", "do_lower_case": "{LOWERCASE}", "in": ["split_context", "bert_features", "subtokens"], "out": ["subtok2chars", "char2subtoks"] }, { "class_name": "squad_bert_ans_preprocessor", "do_lower_case": "{LOWERCASE}", "in": ["ans_raw", "ans_raw_start", "char2subtoks"], "out": ["ans", "ans_start", "ans_end"] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.01, "betas": [0.9, 0.999], "eps": 1e-06 }, "learning_rate_drop_patience": 2, "learning_rate_drop_div": 2.0, "batch_size": 10, "in": ["bert_features"], "in_y": ["ans_start", "ans_end"], "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"] }, { "class_name": "squad_bert_ans_postprocessor", "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"], "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] } ], "out": ["ans_predicted", "ans_start_predicted", "scores"] }, "train": { "show_examples": false, 
"evaluation_targets": ["valid"], "log_every_n_batches": 250, "val_every_n_batches": 500, "batch_size": 10, "pytest_max_batches": 2, "pytest_batch_size": 5, "validation_patience": 10, "metrics": [ { "name": "squad_v1_f1", "inputs": ["ans", "ans_predicted"] }, { "name": "squad_v1_em", "inputs": ["ans", "ans_predicted"] }, { "name": "squad_v2_f1", "inputs": ["ans", "ans_predicted"] }, { "name": "squad_v2_em", "inputs": ["ans", "ans_predicted"] } ], "class_name": "torch_trainer" }, "metadata": { "variables": { "LOWERCASE": false, "TRANSFORMER": "bert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/squad_torch_bert/cased/{TRANSFORMER}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/squad/squad_torch_bert_cased.tar.gz", "subdir": "{MODEL_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/squad/squad_ru_bert.json ================================================ { "dataset_reader": { "class_name": "squad_dataset_reader", "dataset": "SberSQuADClean", "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz", "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/" }, "dataset_iterator": { "class_name": "squad_iterator", "seed": 1337, "shuffle": true }, "chainer": { "in": [ "context_raw", "question_raw" ], "in_y": [ "ans_raw", "ans_raw_start" ], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": "{LOWERCASE}", "max_seq_length": 384, "in": [ "question_raw", "context_raw" ], "out": [ "bert_features", "subtokens", "split_context" ] }, { "class_name": "squad_bert_mapping", "do_lower_case": "{LOWERCASE}", "in": [ "split_context", "bert_features", "subtokens" ], "out": [ "subtok2chars", "char2subtoks" ] }, { "class_name": "squad_bert_ans_preprocessor", "do_lower_case": "{LOWERCASE}", "in": [ "ans_raw", "ans_raw_start", "char2subtoks" ], "out": [ "ans", 
"ans_start", "ans_end" ] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "optimizer": "AdamW", "optimizer_parameters": { "lr": 2e-05, "weight_decay": 0.01, "betas": [ 0.9, 0.999 ], "eps": 1e-06 }, "learning_rate_drop_patience": 3, "learning_rate_drop_div": 2.0, "in": [ "bert_features" ], "in_y": [ "ans_start", "ans_end" ], "out": [ "ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds" ] }, { "class_name": "squad_bert_ans_postprocessor", "in": [ "ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds" ], "out": [ "ans_predicted", "ans_start_predicted", "ans_end_predicted" ] } ], "out": [ "ans_predicted", "ans_start_predicted", "scores" ] }, "train": { "show_examples": false, "evaluation_targets": [ "valid" ], "log_every_n_batches": 250, "val_every_n_batches": 500, "batch_size": 10, "validation_patience": 10, "metrics": [ { "name": "squad_v1_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v1_em", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v2_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v2_em", "inputs": [ "ans", "ans_predicted" ] } ], "class_name": "torch_trainer" }, "metadata": { "variables": { "LOWERCASE": false, "TRANSFORMER": "DeepPavlov/rubert-base-cased", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/squad_ru_torch_bert/{TRANSFORMER}" }, "download": [ { "url": "http://files.deeppavlov.ai/v1/squad/squad_ru_torch_bert.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json ================================================ { "dataset_reader": { "class_name": "squad_dataset_reader", "dataset": "SberSQuADClean", "url": 
"http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz", "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/" }, "dataset_iterator": { "class_name": "squad_iterator", "seed": 1337, "shuffle": true }, "chainer": { "in": [ "context_raw", "question_raw" ], "in_y": [ "ans_raw", "ans_raw_start" ], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "add_token_type_ids": true, "vocab_file": "{TRANSFORMER}", "do_lower_case": "{lowercase}", "max_seq_length": 384, "in": [ "question_raw", "context_raw" ], "out": [ "bert_features", "subtokens", "split_context" ] }, { "class_name": "squad_bert_mapping", "do_lower_case": "{lowercase}", "in": [ "split_context", "bert_features", "subtokens" ], "out": [ "subtok2chars", "char2subtoks" ] }, { "class_name": "squad_bert_ans_preprocessor", "do_lower_case": "{lowercase}", "in": [ "ans_raw", "ans_raw_start", "char2subtoks" ], "out": [ "ans", "ans_start", "ans_end" ] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "attention_probs_keep_prob": 0.11, "hidden_keep_prob": 0.33, "optimizer": "AdamW", "optimizer_parameters": { "lr": 9e-05 }, "learning_rate_drop_patience": 2, "learning_rate_drop_div": 1.5, "in": [ "bert_features" ], "in_y": [ "ans_start", "ans_end" ], "out": [ "ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds" ] }, { "class_name": "squad_bert_ans_postprocessor", "in": [ "ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds" ], "out": [ "ans_predicted", "ans_start_predicted", "ans_end_predicted" ] } ], "out": [ "ans_predicted", "ans_start_predicted", "scores" ] }, "train": { "show_examples": false, "evaluation_targets": [ "valid" ], "log_every_n_batches": 250, "val_every_n_batches": 500, "batch_size": 10, "validation_patience": 10, "metrics": [ { "name": "squad_v2_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v2_em", "inputs": [ 
"ans", "ans_predicted" ] }, { "name": "squad_v1_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v1_em", "inputs": [ "ans", "ans_predicted" ] } ], "tensorboard_log_dir": "{MODEL_PATH}/logs", "class_name": "torch_trainer" }, "metadata": { "variables": { "lowercase": false, "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json ================================================ { "dataset_reader": { "class_name": "squad_dataset_reader", "dataset": "SberSQuADClean", "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz", "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/" }, "dataset_iterator": { "class_name": "squad_iterator", "seed": 1337, "shuffle": true }, "chainer": { "in": [ "context_raw", "question_raw" ], "in_y": [ "ans_raw", "ans_raw_start" ], "pipe": [ { "class_name": "torch_squad_transformers_preprocessor", "add_token_type_ids": true, "vocab_file": "{TRANSFORMER}", "do_lower_case": "{lowercase}", "max_seq_length": 384, "in": [ "question_raw", "context_raw" ], "out": [ "bert_features", "subtokens", "split_context" ] }, { "class_name": "squad_bert_mapping", "do_lower_case": "{lowercase}", "in": [ "split_context", "bert_features", "subtokens" ], "out": [ "subtok2chars", "char2subtoks" ] }, { "class_name": "squad_bert_ans_preprocessor", "do_lower_case": "{lowercase}", "in": [ "ans_raw", "ans_raw_start", "char2subtoks" ], "out": [ "ans", "ans_start", "ans_end" ] }, { "class_name": "torch_transformers_squad", "pretrained_bert": "{TRANSFORMER}", "save_path": "{MODEL_PATH}/model", "load_path": 
"{MODEL_PATH}/model", "attention_probs_keep_prob": 0.0, "hidden_keep_prob": 0.33, "optimizer": "AdamW", "optimizer_parameters": { "lr": 3.67e-5 }, "learning_rate_drop_patience": 2, "learning_rate_drop_div": 1.5, "in": [ "bert_features" ], "in_y": [ "ans_start", "ans_end" ], "out": [ "ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds" ] }, { "class_name": "squad_bert_ans_postprocessor", "in": [ "ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds" ], "out": [ "ans_predicted", "ans_start_predicted", "ans_end_predicted" ] } ], "out": [ "ans_predicted", "ans_start_predicted", "scores" ] }, "train": { "show_examples": false, "evaluation_targets": [ "valid" ], "log_every_n_batches": 250, "val_every_n_batches": 500, "batch_size": 10, "validation_patience": 10, "metrics": [ { "name": "squad_v2_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v2_em", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v1_f1", "inputs": [ "ans", "ans_predicted" ] }, { "name": "squad_v1_em", "inputs": [ "ans", "ans_predicted" ] } ], "tensorboard_log_dir": "{MODEL_PATH}/logs", "class_name": "torch_trainer" }, "metadata": { "variables": { "lowercase": false, "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational", "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L" }, "download": [ { "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz", "subdir": "{MODELS_PATH}" } ] } } ================================================ FILE: deeppavlov/core/__init__.py ================================================ ================================================ FILE: deeppavlov/core/commands/__init__.py ================================================ ================================================ FILE: deeppavlov/core/commands/infer.py 
================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import sys from itertools import islice from logging import getLogger from pathlib import Path from typing import Optional, Union from deeppavlov.core.commands.utils import import_packages, parse_config from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.params import from_params from deeppavlov.core.data.utils import jsonify_data from deeppavlov.download import deep_download from deeppavlov.utils.pip_wrapper import install_from_config log = getLogger(__name__) def build_model(config: Union[str, Path, dict], mode: str = 'infer', load_trained: bool = False, install: bool = False, download: bool = False) -> Chainer: """Build and return the model described in corresponding configuration file.""" config = parse_config(config) if install: install_from_config(config) if download: deep_download(config) import_packages(config.get('metadata', {}).get('imports', [])) model_config = config['chainer'] model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y')) for component_config in model_config['pipe']: if load_trained and ('fit_on' in component_config or 'in_y' in component_config): try: component_config['load_path'] = component_config['save_path'] except KeyError: log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed' 
.format(component_config.get('class_name', component_config.get('ref', 'UNKNOWN')))) component = from_params(component_config, mode=mode) if 'id' in component_config: model._components_dict[component_config['id']] = component if 'in' in component_config: c_in = component_config['in'] c_out = component_config['out'] in_y = component_config.get('in_y', None) main = component_config.get('main', False) model.append(component, c_in, c_out, in_y, main) return model def interact_model(config: Union[str, Path, dict]) -> None: """Start interaction with the model described in corresponding configuration file.""" model = build_model(config) while True: args = [] for in_x in model.in_x: args.append((input('{}::'.format(in_x)),)) # check for exit command if args[-1][0] in {'exit', 'stop', 'quit', 'q'}: return pred = model(*args) if len(model.out_params) > 1: pred = zip(*pred) print('>>', *pred) def predict_on_stream(config: Union[str, Path, dict], batch_size: Optional[int] = None, file_path: Optional[str] = None) -> None: """Make a prediction with the component described in corresponding configuration file.""" batch_size = batch_size or 1 if file_path is None or file_path == '-': if sys.stdin.isatty(): raise RuntimeError('To process data from terminal please use interact mode') f = sys.stdin else: f = open(file_path, encoding='utf8') model: Chainer = build_model(config) args_count = len(model.in_x) while True: batch = list((l.strip() for l in islice(f, batch_size * args_count))) if not batch: break args = [] for i in range(args_count): args.append(batch[i::args_count]) res = model(*args) if len(model.out_params) == 1: res = [res] for res in zip(*res): res = json.dumps(jsonify_data(res), ensure_ascii=False) print(res, flush=True) if f is not sys.stdin: f.close() ================================================ FILE: deeppavlov/core/commands/train.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under 
the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from logging import getLogger from pathlib import Path from typing import Dict, Union, Optional, Iterable from deeppavlov.core.commands.utils import expand_path, import_packages, parse_config from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.params import resolve from deeppavlov.core.common.registry import get_model from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator from deeppavlov.core.data.data_learning_iterator import DataLearningIterator from deeppavlov.core.data.utils import get_all_elems_from_json from deeppavlov.download import deep_download from deeppavlov.utils.pip_wrapper import install_from_config log = getLogger(__name__) def read_data_by_config(config: dict): """Read data by dataset_reader from specified config.""" dataset_config = config.get('dataset', None) if dataset_config: config.pop('dataset') ds_type = dataset_config['type'] if ds_type == 'classification': reader = {'class_name': 'basic_classification_reader'} iterator = {'class_name': 'basic_classification_iterator'} config['dataset_reader'] = {**dataset_config, **reader} config['dataset_iterator'] = {**dataset_config, **iterator} else: raise Exception("Unsupported dataset type: {}".format(ds_type)) try: reader_config = dict(config['dataset_reader']) except KeyError: raise ConfigError("No dataset reader is provided in the JSON config.") reader = get_model(reader_config.pop('class_name'))() data_path = 
reader_config.get('data_path') if isinstance(data_path, list): reader_config['data_path'] = [expand_path(path) for path in data_path] elif data_path is not None: reader_config['data_path'] = expand_path(data_path) return reader.read(**reader_config) def get_iterator_from_config(config: dict, data: dict): """Create iterator (from config) for specified data.""" iterator_config = {k: resolve(v) for k, v in config['dataset_iterator'].items()} iterator: Union[DataLearningIterator, DataFittingIterator] = get_model(iterator_config.pop('class_name'))( **iterator_config, data=data) return iterator def train_evaluate_model_from_config(config: Union[str, Path, dict], iterator: Union[DataLearningIterator, DataFittingIterator] = None, *, to_train: bool = True, evaluation_targets: Optional[Iterable[str]] = None, install: bool = False, download: bool = False, start_epoch_num: Optional[int] = None, recursive: bool = False) -> Dict[str, Dict[str, float]]: """Make training and evaluation of the model described in corresponding configuration file.""" config = parse_config(config) if install: install_from_config(config) if download: deep_download(config) if to_train and recursive: for subconfig in get_all_elems_from_json(config['chainer'], 'config_path'): log.info(f'Training "{subconfig}"') train_evaluate_model_from_config(subconfig, download=False, recursive=True) import_packages(config.get('metadata', {}).get('imports', [])) if iterator is None: try: data = read_data_by_config(config) # TODO: check class objects, not strings is_mtl = config['dataset_reader']['class_name'] == 'multitask_reader' if config.get('train', {}).get('val_every_n_epochs') and not data.get('valid') and not is_mtl: error_message = 'The value "val_every_n_epochs" is set in the config but no validation data is provided' raise AttributeError(error_message) except ConfigError as e: to_train = False log.warning(f'Skipping training. 
{e.message}') else: iterator = get_iterator_from_config(config, data) if 'train' not in config: log.warning('Train config is missing. Populating with default values') train_config = config.get('train', {}) if start_epoch_num is not None: train_config['start_epoch_num'] = start_epoch_num trainer_class = get_model(train_config.pop('class_name', 'torch_trainer')) trainer = trainer_class(config['chainer'], **train_config) if to_train: trainer.train(iterator) res = {} if iterator is not None: res = trainer.evaluate(iterator, evaluation_targets) trainer.get_chainer().destroy() res = {k: v['metrics'] for k, v in res.items()} return res ================================================ FILE: deeppavlov/core/commands/utils.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# noinspection PyShadowingBuiltins
_T = TypeVar('_T', str, float, bool, list, dict)


def _parse_config_property(item: _T, variables: Dict[str, Union[str, Path, float, bool, int, None]],
                           variables_exact: Dict[str, Union[str, Path, float, bool, int, None]]) -> _T:
    """Recursively substitute config variables inside ``item``.

    Args:
        item: A config value; lists and dicts are processed element by element.
        variables: Variable name -> value, used for ``str.format`` substitution.
        variables_exact: ``'{NAME}'`` -> value. A string that is exactly one
            placeholder is replaced by the value itself, so the result may be
            of any type.

    Returns:
        A new structure with substituted strings; values that are neither
        str, list nor dict are returned unchanged.
    """
    if isinstance(item, str):
        if item in variables_exact:
            return variables_exact[item]
        return item.format(**variables)
    if isinstance(item, list):
        return [_parse_config_property(element, variables, variables_exact) for element in item]
    if isinstance(item, dict):
        return {k: _parse_config_property(v, variables, variables_exact) for k, v in item.items()}
    return item


def _get_variables_from_config(config: Union[str, Path, dict]):
    """Collect the variables declared in ``metadata.variables`` of a config.

    ``DEEPPAVLOV_PATH`` is always defined (environment variable
    ``DP_DEEPPAVLOV_PATH`` or the directory three levels above this file).
    Each declared variable may be overridden by an environment variable named
    ``DP_<NAME>`` and may refer to variables declared before it.

    Args:
        config: Config dict, or a config name/path that is read from disk.

    Returns:
        Tuple ``(variables, variables_exact)``: name -> value and
        ``'{name}'`` -> value.
    """
    if isinstance(config, (str, Path)):
        config = read_json(find_config(config))

    variables = {
        'DEEPPAVLOV_PATH': os.getenv('DP_DEEPPAVLOV_PATH', Path(__file__).parent.parent.parent)
    }
    variables_exact = {f'{{{k}}}': v for k, v in variables.items()}
    for name, value in config.get('metadata', {}).get('variables', {}).items():
        env_name = f'DP_{name}'
        if env_name in os.environ:
            value = os.getenv(env_name)
        # Only strings can contain placeholders. Checking the type first also keeps
        # unhashable values (lists, dicts) away from the dict membership test,
        # which would raise TypeError.
        if isinstance(value, str):
            if value in variables_exact:
                value = variables_exact[value]
            else:
                value = value.format(**variables)
        variables[name] = value
        variables_exact[f'{{{name}}}'] = value

    return variables, variables_exact


def _update_requirements(config: dict) -> dict:
    """
    Generates requirements for DeepPavlov model and adds them as ``metadata.requirements`` field to the returned dict.

    Searches for the ``class_name`` keys in the passed config at all nesting levels. For each found component,
    function looks for dependencies in the requirements registry. Found dependencies are added to the returned copy of
    the config as ``metadata.requirements``. If the config already has ``metadata.requirements``, the existing one is
    complemented by the found requirements.

    Args:
        config: DeepPavlov model config

    Returns:
        config copy with updated ``metadata.requirements`` field according to the config components.
    """
    components = get_all_elems_from_json(config, 'class_name')
    components = {inverted_registry.get(component, component) for component in components}
    requirements_registry_path = Path(__file__).parents[1] / 'common' / 'requirements_registry.json'
    requirements_registry = read_json(requirements_registry_path)
    requirements = []
    for component in components:
        requirements.extend(requirements_registry.get(component, []))
    requirements.extend(config.get('metadata', {}).get('requirements', []))
    response = deepcopy(config)
    response['metadata'] = response.get('metadata', {})
    # Duplicates are removed through a set, so the resulting order is not defined.
    response['metadata']['requirements'] = list(set(requirements))
    return response


def _overwrite(data: Any, value: Any, nested_keys: list) -> None:
    """Changes ``data`` nested key value to ``value`` using ``nested_keys`` as nested keys list.

    ``nested_keys`` itself is left untouched.

    Example:
        >>> x = {'a': [None, {'b': 2}]}
        >>> _overwrite(x, 42, ['a', 1, 'b'])
        >>> x
        {'a': [None, {'b': 42}]}

    Raises:
        IndexError: ``nested_keys`` is empty.
    """
    if not nested_keys:
        raise IndexError('nested_keys must contain at least one key')
    *parent_keys, last_key = nested_keys
    target = data
    for key in parent_keys:
        target = target[key]
    target[last_key] = value


def parse_config(config: Union[str, Path, dict], overwrite: Optional[dict] = None) -> dict:
    """Apply metadata.variables values to placeholders inside config and update nested configs using overwrite parameter

    Args:
        config: Config to parse.
        overwrite: If not None - key-value pairs of nested keys and values to overwrite config.
            For {'chainer.pipe.0.class_name': 'simple_vocab'} it will update config
            config['chainer']['pipe'][0]['class_name'] = 'simple_vocab'.

    Returns:
        A new config with requirements updated and variables substituted. A dict passed
        as ``config`` is not modified.
    """
    if isinstance(config, (str, Path)):
        config = read_json(find_config(config))
    elif overwrite:
        # Apply the overrides to a private copy so the caller's dict stays as it was.
        config = deepcopy(config)

    if overwrite is not None:
        for key, value in overwrite.items():
            # Purely numeric path segments address list positions.
            items = [int(item) if item.isdigit() else item for item in key.split('.')]
            _overwrite(config, value, items)

    updated_config = _update_requirements(config)

    variables, variables_exact = _get_variables_from_config(updated_config)

    return _parse_config_property(updated_config, variables, variables_exact)


def expand_path(path: Union[str, Path]) -> Path:
    """Convert relative paths to absolute with resolving user directory."""
    with_home = Path(path).expanduser()
    return with_home.resolve()


def import_packages(packages: list) -> None:
    """Import packages from list to execute their code."""
    for package_name in packages:
        __import__(package_name)


def parse_value_with_config(value: Union[str, Path], config: Union[str, Path, dict]) -> Any:
    """Fill the variables in `value` with variables values from `config`.

    `value` should be a string. If `value` is a string of only variable, `value` will be replaced with
    variable's value from config (the variable's value could be anything then).
    """
    variables, variables_exact = _get_variables_from_config(config)
    return _parse_config_property(str(value), variables, variables_exact)
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ALIASES = { 'kbqa_cq': 'kbqa_cq_en', 'kbqa_cq_online': 'kbqa_cq_en', 'kbqa_cq_rus': 'kbqa_cq_ru', 'multi_squad_noans': 'qa_squad2_bert', 'multi_squad_noans_infer': 'qa_squad2_bert', 'multi_squad_retr_noans': 'qa_squad2_bert', 'ner_collection3_m1': 'ner_collection3_bert', 'ner_conll2003': 'ner_conll2003_bert', 'ner_conll2003_torch_bert': 'ner_conll2003_bert', 'ner_dstc2': 'ner_conll2003_bert', 'ner_few_shot_ru': 'ner_rus_bert', 'ner_few_shot_ru_simulate': 'ner_rus_bert', 'ner_ontonotes': 'ner_ontonotes_bert', 'ner_ontonotes_bert_emb': 'ner_ontonotes_bert', 'ner_ontonotes_bert_mult_torch': 'ner_ontonotes_bert_mult', 'ner_ontonotes_bert_torch': 'ner_ontonotes_bert', 'ner_rus': 'ner_rus_bert', 'paraphraser_bert': 'paraphraser_rubert', 'ru_odqa_infer_wiki_rubert': 'ru_odqa_infer_wiki', 'sentseg_dailydialog': 'sentseg_dailydialog_bert', 'squad': 'squad_bert', 'squad_bert_infer': 'squad_bert', 'squad_bert_multilingual_freezed_emb': 'squad_bert', 'squad_ru': 'squad_ru_bert', 'squad_ru_bert_infer': 'squad_ru_bert', 'squad_ru_convers_distilrubert_2L_infer': 'squad_ru_convers_distilrubert_2L', 'squad_ru_convers_distilrubert_6L_infer': 'squad_ru_convers_distilrubert_6L', 'squad_ru_rubert': 'squad_ru_bert', 'squad_ru_rubert_infer': 'squad_ru_bert', 'squad_torch_bert': 'squad_bert', 'squad_torch_bert_infer': 'squad_bert' } ================================================ FILE: deeppavlov/core/common/base.py ================================================ # Copyright 2021 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the 
"License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from types import FunctionType from typing import List, Optional, Union from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.models.component import Component class Element: """DeepPavlov model pipeline element.""" def __init__(self, component: Union[Component, FunctionType], x: Optional[Union[str, list]] = None, out: Optional[Union[str, list]] = None, y: Optional[Union[str, list]] = None, main: bool = False) -> None: """ Args: component: Pipeline component object. x: Names of the component inference inputs. Output from other pipeline elements with such names will be fed to the input of this component. out: Names of the component inference outputs. Component outputs can be fed to other pipeline elements using this names. y: Names of additional inputs (targets) for component training and evaluation. main: Set True if this is the main component. Main component is trained during model training process. """ self.component = component self.x = x self.y = y self.out = out self.main = main class Model(Chainer): """Builds a component pipeline to train and infer models.""" def __init__(self, x: Optional[Union[str, list]] = None, out: Optional[Union[str, list]] = None, y: Optional[Union[str, list]] = None, pipe: Optional[List[Element]] = None) -> None: """ Args: x: Names of pipeline inference inputs. out: Names of pipeline inference outputs. y: Names of additional inputs (targets) for pipeline training and evaluation. pipe: List of pipeline elements. 
""" super().__init__(in_x=x, out_params=out, in_y=y) if pipe is not None: for element in pipe: self.append(element.component, element.x, element.out, element.y, element.main) ================================================ FILE: deeppavlov/core/common/chainer.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pickle from itertools import islice from logging import getLogger from types import FunctionType from typing import Union, Tuple, List, Optional, Hashable, Reversible from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.models.component import Component from deeppavlov.core.models.nn_model import NNModel from deeppavlov.core.models.serializable import Serializable log = getLogger(__name__) class Chainer(Component): """ Builds a component pipeline from heterogeneous components (Rule-based/ML/DL). It allows to train and infer models in a pipeline as a whole. 
def __init__(self, in_x: Union[str, list] = None, out_params: Union[str, list] = None,
             in_y: Union[str, list] = None, *args, **kwargs) -> None:
    """Create an empty pipeline.

    Args:
        in_x: names of inputs for inference; a single name may be given as a
            string. Defaults to ``['x']``.
        out_params: names of inference outputs; defaults to the input names.
        in_y: names of additional inputs for training and evaluation;
            defaults to ``['y']``.
        *args: accepted and ignored.
        **kwargs: accepted and ignored.
    """

    def as_name_list(names):
        # A bare string stands for a one-element list of names.
        return [names] if isinstance(names, str) else names

    self.pipe: List[Tuple[Tuple[List[str], List[str]], List[str], Component]] = []
    self.train_pipe = []

    x_names = as_name_list(in_x)
    y_names = as_name_list(in_y)
    output_names = as_name_list(out_params)

    self.in_x = x_names or ['x']
    self.in_y = y_names or ['y']
    self.out_params = output_names or self.in_x

    # Names available in memory: inputs only for inference, inputs plus targets for training.
    self.forward_map = set(self.in_x)
    self.train_map = self.forward_map.union(self.in_y)

    self._components_dict = {}

    self.main = None
def append(self, component: Union[Component, FunctionType], in_x: [str, list, dict] = None,
           out_params: [str, list] = None, in_y: [str, list, dict] = None, main: bool = False):
    """Add ``component`` to the pipeline.

    Args:
        component: the component (or function) to call.
        in_x: names of memory variables passed to the component. A dict maps
            keyword argument names to memory variable names. Defaults to the
            chainer inputs.
        out_params: names under which the component outputs are stored;
            defaults to the input names.
        in_y: names of additional (target) variables. When given, the
            component is treated as the main one and the chainer gets
            ``train_on_batch`` and ``process_event`` bound to it.
        main: mark the component as the main one.

    Raises:
        ConfigError: ``in_x`` and ``in_y`` are not both dicts or both lists,
            or the requested inputs are not available in the training memory.
    """
    if isinstance(in_x, str):
        in_x = [in_x]
    if isinstance(in_y, str):
        in_y = [in_y]
    if isinstance(out_params, str):
        out_params = [out_params]
    in_x = in_x or self.in_x

    if isinstance(in_x, dict):
        # Keys are keyword names for the component call, values are memory names.
        x_keys, in_x = zip(*in_x.items())
    else:
        x_keys = []
    out_params = out_params or in_x
    if in_y is not None:
        if isinstance(in_y, dict):
            y_keys, in_y = zip(*in_y.items())
        else:
            y_keys = []

        # Validate before concatenating: a tuple of dict keys cannot be added to the
        # empty list used for positional inputs, so checking afterwards could never
        # report the configuration error.
        if bool(x_keys) != bool(y_keys):
            raise ConfigError('`in` and `in_y` for a component have to both be lists or dicts')
        keys = x_keys + y_keys

        component: NNModel
        main = True
        assert self.train_map.issuperset(in_x + in_y), ('Arguments {} are expected but only {} are set'
                                                        .format(in_x + in_y, self.train_map))
        # A nested chainer replays every component appended so far to produce the
        # inputs of the trainable component.
        preprocessor = Chainer(self.in_x, in_x + in_y, self.in_y)
        for (t_in_x_keys, t_in_x), t_out, t_component in self.train_pipe:
            if t_in_x_keys:
                t_in_x = dict(zip(t_in_x_keys, t_in_x))
            preprocessor.append(t_component, t_in_x, t_out)

        def train_on_batch(*args, **kwargs):
            preprocessed = preprocessor.compute(*args, **kwargs)
            if len(in_x + in_y) == 1:
                # ``compute`` unwraps a single target; wrap it back for unpacking.
                preprocessed = [preprocessed]
            if keys:
                return component.train_on_batch(**dict(zip(keys, preprocessed)))
            else:
                return component.train_on_batch(*preprocessed)

        self.train_on_batch = train_on_batch
        self.process_event = component.process_event
    if main:
        self.main = component
    # The component joins the inference pipe only when its inputs can be computed
    # without the training-only variables.
    if self.forward_map.issuperset(in_x):
        self.pipe.append(((x_keys, in_x), out_params, component))
        self.forward_map = self.forward_map.union(out_params)

    if self.train_map.issuperset(in_x):
        self.train_pipe.append(((x_keys, in_x), out_params, component))
        self.train_map = self.train_map.union(out_params)
    else:
        raise ConfigError('Arguments {} are expected but only {} are set'.format(in_x, self.train_map))
def batched_call(self, *args: Reversible, batch_size: int = 16) -> Union[list, Tuple[list, ...]]:
    """
    Split the inputs into mini-batches, run :meth:`__call__` on each one and join the results.

    Args:
        args: input data, each element of the data corresponds to a single model inputs sequence.
        batch_size: the size of a batch.

    Returns:
        the model output as if the data was passed to the :meth:`__call__` method.
    """
    streams = [iter(arg) for arg in args]
    collected = [[] for _ in self.out_params]
    single_output = len(self.out_params) == 1

    batch = [list(islice(stream, batch_size)) for stream in streams]
    # Stop as soon as every input stream yields an empty slice.
    while any(batch):
        outputs = self.__call__(*batch)
        if single_output:
            # A single output is returned bare; wrap it so it pairs with its bucket.
            outputs = [outputs]
        for bucket, chunk in zip(collected, outputs):
            bucket.extend(chunk)
        batch = [list(islice(stream, batch_size)) for stream in streams]

    return collected[0] if single_output else collected
self.train_pipe.clear() if hasattr(self, 'pipe'): self.pipe.clear() super().destroy() ================================================ FILE: deeppavlov/core/common/cross_validation.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import shutil from collections import OrderedDict from logging import getLogger from pathlib import Path import numpy as np from sklearn.model_selection import KFold from deeppavlov.core.commands.train import train_evaluate_model_from_config, get_iterator_from_config, \ read_data_by_config from deeppavlov.core.commands.utils import expand_path, parse_config from deeppavlov.core.common.params_search import ParamsSearch SAVE_PATH_ELEMENT_NAME = 'save_path' TEMP_DIR_FOR_CV = 'cv_tmp' log = getLogger(__name__) def change_savepath_for_model(config): params_helper = ParamsSearch() dirs_for_saved_models = set() for p in params_helper.find_model_path(config, SAVE_PATH_ELEMENT_NAME): p.append(SAVE_PATH_ELEMENT_NAME) save_path = Path(params_helper.get_value_from_config(config, p)) new_save_path = save_path.parent / TEMP_DIR_FOR_CV / save_path.name dirs_for_saved_models.add(expand_path(new_save_path.parent)) params_helper.insert_value_or_dict_into_config(config, p, str(new_save_path)) return config, dirs_for_saved_models def delete_dir_for_saved_models(dirs_for_saved_models): for new_save_dir in dirs_for_saved_models: shutil.rmtree(str(new_save_dir)) def 
def generate_train_valid(data, n_folds=5, is_loo=False):
    """Yield train/valid/test splits built from the joined train and valid parts of ``data``.

    Args:
        data: dict with ``'train'``, ``'valid'`` and ``'test'`` entries; the
            train and valid entries are concatenated with ``+``.
        n_folds: number of folds for K-fold cross validation.
        is_loo: use leave-one-out (each sample is the validation set once)
            instead of shuffled K-fold.

    Yields:
        dict with ``'train'``, ``'valid'`` and ``'test'`` keys; ``'test'`` is
        always the original ``data['test']`` object.
    """
    samples = data['train'] + data['valid']

    if not is_loo:
        # for Cross Validation
        splitter = KFold(n_splits=n_folds, shuffle=True)
        for train_index, valid_index in splitter.split(samples):
            yield {
                'train': [samples[idx] for idx in train_index],
                'valid': [samples[idx] for idx in valid_index],
                'test': data['test']
            }
        return

    # for Leave One Out
    for held_out in range(len(samples)):
        remaining = samples.copy()
        fold = {'train': remaining, 'test': data['test']}
        fold['valid'] = [remaining.pop(held_out)]
        yield fold
class ConfigError(Exception):
    """Any configuration error.

    The message is available as the ``message`` attribute and as the only
    element of ``args``; ``str()`` of the error is the ``repr`` of the message.
    """

    def __init__(self, message):
        # Hand the message to ``Exception`` as well: with empty ``args`` the error
        # cannot be re-created from its pickled form, because ``__init__`` would be
        # called without the required ``message`` argument.
        super(ConfigError, self).__init__(message)
        self.message = message

    def __str__(self):
        return repr(self.message)
def find_config(pipeline_config_path: Union[str, Path]) -> Path:
    """Resolve a config name, alias or path to the path of a config file.

    Deprecated aliases are replaced by their current names (with a warning).
    When the result is not an existing file, the bundled ``configs`` directory
    is searched recursively for ``<name>.json``.

    Args:
        pipeline_config_path: config name, alias or path, as ``str`` or ``Path``.

    Returns:
        Path of the first matching bundled config, or the (possibly
        alias-resolved) input converted to ``Path`` when nothing matched.
    """
    # Work on the string form: the alias lookup and ``str.endswith`` below need a
    # ``str``, while callers are allowed to pass a ``Path``.
    config_name = str(pipeline_config_path)
    if config_name in ALIASES:
        new_config_name = ALIASES[config_name]
        log.warning(DEPRECATOIN_MSG.format(config_name, new_config_name))
        config_name = new_config_name
    if not Path(config_name).is_file():
        configs = [c for c in Path(__file__).parent.parent.parent.glob(f'configs/**/{config_name}.json')
                   if str(c.with_suffix('')).endswith(config_name)]  # a simple way to not allow * and ?
        if configs:
            log.debug(f"Interpreting '{config_name}' as '{configs[0]}'")
            return Path(configs[0])
    return Path(config_name)


def read_json(fpath: Union[str, Path]) -> dict:
    """Load a UTF-8 JSON file; objects are returned as ``OrderedDict`` instances."""
    with open(fpath, encoding='utf8') as fin:
        return json.load(fin, object_pairs_hook=OrderedDict)


def save_json(data: dict, fpath: Union[str, Path]) -> None:
    """Write ``data`` as indented UTF-8 JSON without escaping non-ASCII characters."""
    with open(fpath, 'w', encoding='utf8') as fout:
        json.dump(data, fout, ensure_ascii=False, indent=2)


def save_pickle(data: dict, fpath: Union[str, Path]) -> None:
    """Pickle ``data`` to ``fpath`` using protocol 4."""
    with open(fpath, 'wb') as fout:
        pickle.dump(data, fout, protocol=4)


def load_pickle(fpath: Union[str, Path]) -> Any:
    """Load a pickled object from ``fpath``.

    NOTE: unpickling can execute arbitrary code, so only files from trusted
    sources should be loaded.
    """
    with open(fpath, 'rb') as fin:
        return pickle.load(fin)


def save_jsonl(data: Iterable[dict], fpath: Union[str, Path]) -> None:
    """Write every item of ``data`` as one JSON document per line (UTF-8)."""
    # Non-ASCII characters are written unescaped, so the encoding must not be left
    # to the platform default.
    with open(fpath, 'w', encoding='utf8') as f:
        for item in data:
            f.write(f"{json.dumps(item, ensure_ascii=False)}\n")
class ProbeFilter(logging.Filter):
    """Logging filter that drops records produced by POST requests to the /probe endpoint."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Return True when the record should be logged, False for /probe requests."""
        message = record.getMessage()
        is_probe_request = 'POST /probe HTTP' in message
        return not is_probe_request
class TBWriter:
    """Pair of TensorBoard summary writers: one for training and one for validation scalars."""

    def __init__(self, tensorboard_log_dir: str):
        """Create writers in the ``train_log`` and ``valid_log`` subdirectories of the log directory."""
        # TODO: After adding wandb logger, create common parent class for both loggers
        # Imported here so that the dependency is needed only when a writer is created.
        from torch.utils.tensorboard import SummaryWriter
        log_root = expand_path(tensorboard_log_dir)
        train_dir = str(log_root / 'train_log')
        valid_dir = str(log_root / 'valid_log')
        self.tb_train_writer = SummaryWriter(train_dir)
        self.tb_valid_writer = SummaryWriter(valid_dir)

    # TODO: find how to write Summary
    def write_train(self, tag, scalar_value, global_step):
        """Add a scalar to the training log."""
        self.tb_train_writer.add_scalar(tag, scalar_value, global_step)

    def write_valid(self, tag, scalar_value, global_step):
        """Add a scalar to the validation log."""
        self.tb_valid_writer.add_scalar(tag, scalar_value, global_step)

    def flush(self):
        """Flush both writers."""
        for writer in (self.tb_train_writer, self.tb_valid_writer):
            writer.flush()


def get_tb_writer(tensorboard_log_dir: Optional[str]) -> Optional[TBWriter]:
    """Create a ``TBWriter`` for the directory, or return ``None``.

    ``None`` is returned when no directory is given, and also (after logging
    an error) when creating the writer raises ``ImportError``.
    """
    if tensorboard_log_dir is None:
        return None
    try:
        return TBWriter(tensorboard_log_dir)
    except ImportError:
        log.error('Failed to import SummaryWriter from torch.utils.tensorboard. Failed to initialize Tensorboard '
                  'logger. Install appropriate Pytorch version to use this logger or remove tensorboard_log_dir '
                  'parameter from the train parameters list in the configuration file.')
        return None
"deeppavlov.metrics.recall_at_k:r_at_1", "r@10": "deeppavlov.metrics.recall_at_k:r_at_10", "r@1_insQA": "deeppavlov.models.ranking.metrics:r_at_1_insQA", "r@2": "deeppavlov.metrics.recall_at_k:r_at_2", "r@5": "deeppavlov.metrics.recall_at_k:r_at_5", "rank_response": "deeppavlov.models.ranking.metrics:rank_response", "roc_auc": "deeppavlov.metrics.roc_auc_score:roc_auc_score", "sets_accuracy": "deeppavlov.metrics.accuracy:sets_accuracy", "slots_accuracy": "deeppavlov.metrics.accuracy:slots_accuracy", "spearman_correlation": "deeppavlov.metrics.correlation:spearman_correlation", "squad_v1_em": "deeppavlov.metrics.squad_metrics:squad_v1_exact_match", "squad_v1_f1": "deeppavlov.metrics.squad_metrics:squad_v1_f1", "squad_v2_em": "deeppavlov.metrics.squad_metrics:squad_v2_exact_match", "squad_v2_f1": "deeppavlov.metrics.squad_metrics:squad_v2_f1", "record_f1_score": "deeppavlov.metrics.record_metrics:record_f1_score", "record_em_score": "deeppavlov.metrics.record_metrics:record_em_score" } ================================================ FILE: deeppavlov/core/common/metrics_registry.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
log = getLogger(__name__)

# The JSON file next to this module maps short metric names to
# ``module.path:function_name`` strings; start empty if it is absent.
_registry_path = Path(__file__).parent / 'metrics_registry.json'
_REGISTRY = {}
if _registry_path.exists():
    with _registry_path.open(encoding='utf-8') as f:
        _REGISTRY = json.load(f)


def fn_from_str(name: str) -> Callable[..., Any]:
    """Import and return the callable described by a ``module:function`` string.

    Args:
        name: description in the ``module.submodules:function_name`` form

    Returns:
        the attribute ``function_name`` of the imported module

    Raises:
        ConfigError: if ``name`` does not split into exactly two parts on ``:``
            or the module has no such attribute
    """
    try:
        module_name, fn_name = name.split(':')
        module = importlib.import_module(module_name)
        return getattr(module, fn_name)
    except ValueError:
        template = 'Expected function description in a `module.submodules:function_name` form, but got `{}`'
        raise ConfigError(template.format(name))
    except AttributeError:
        # noinspection PyUnboundLocalVariable
        raise ConfigError(f"Incorrect metric: '{module_name}' has no attribute '{fn_name}'.")


def register_metric(metric_name: str) -> Callable[..., Any]:
    """Build a decorator that records the decorated function under ``metric_name``.

    The function itself is returned unchanged; only its ``module:name`` string
    is stored in the registry. A warning is logged when the name was already
    bound to a different function.
    """

    def decorate(fn):
        fn_name = ':'.join((fn.__module__, fn.__name__))
        # Falling back to fn_name makes an unregistered name compare equal (no warning).
        if _REGISTRY.get(metric_name, fn_name) != fn_name:
            log.warning('"{}" is already registered as a metric name, the old function will be ignored'
                        .format(metric_name))
        _REGISTRY[metric_name] = fn_name
        return fn

    return decorate


def get_metric_by_name(name: str) -> Callable[..., Any]:
    """Look up a metric by its registered name or by a ``module:function`` string."""
    # Unregistered names are passed through and treated as a full description.
    return fn_from_str(_REGISTRY.get(name, name))
log = getLogger(__name__)

# Components that were built with an ``id`` field, keyed by that id.
_refs = {}


def resolve(val):
    """Replace a ``#component_id[.attribute...]`` string with the referenced object.

    Any value that is not a string starting with ``#`` is returned unchanged.

    Args:
        val: raw parameter value from a config

    Returns:
        the referenced component (or the result of the attribute expression
        evaluated on it), or ``val`` itself when it is not a reference

    Raises:
        ConfigError: if no component with the referenced id has been initialized
    """
    if isinstance(val, str) and val.startswith('#'):
        component_id, *attributes = val[1:].split('.')
        try:
            val = _refs[component_id]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=component_id))
            log.exception(e)
            raise e
        if attributes:
            # NOTE(security): the text after the component id is evaluated as a
            # Python expression on the local name ``val``, so a config string can
            # execute arbitrary code. Only load configs from trusted sources.
            val = eval('.'.join(['val', *attributes]))
    return val


def _init_param(param, mode):
    """Recursively resolve references and build nested components inside ``param``.

    Strings go through :func:`resolve`; lists and tuples are processed
    element-wise and returned as lists; dicts that contain ``ref``,
    ``class_name`` or ``config_path`` are built with :func:`from_params`,
    other dicts are processed value-wise. Anything else is returned as is.
    """
    if isinstance(param, str):
        param = resolve(param)
    elif isinstance(param, (list, tuple)):
        param = [_init_param(p, mode) for p in param]
    elif isinstance(param, dict):
        if {'ref', 'class_name', 'config_path'}.intersection(param.keys()):
            param = from_params(param, mode=mode)
        else:
            param = {k: _init_param(v, mode) for k, v in param.items()}
    return param


def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Union['Component', FunctionType]:
    """Builds and returns the Component from corresponding dictionary of parameters.

    Args:
        params: component description; must contain one of ``ref``,
            ``config_path`` or ``class_name``
        mode: passed to the component constructor as ``mode`` when the
            constructor accepts it (explicitly or through ``**kwargs``)
        **kwargs: extra constructor arguments, applied on top of ``params``

    Returns:
        the referenced or newly built object; when the registered object is not
        a class it is returned without being called

    Raises:
        ConfigError: if ``ref`` points to an unknown id or none of the three
            supported keys is present
    """
    # what is passed in json:
    config_params = {k: resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        from deeppavlov.core.commands.infer import build_model
        # The nested config is built against an empty reference table so that
        # its ids cannot clash with the ones collected so far.
        refs = _refs.copy()
        _refs.clear()
        try:
            config = parse_config(expand_path(config_params['config_path']), config_params.get('overwrite'))
            model = build_model(config)
        finally:
            # Restore the caller's references even if building the nested model
            # failed; otherwise they would stay wiped for the rest of the process.
            _refs.clear()
            _refs.update(refs)
        if 'id' in config_params:
            _refs[config_params['id']] = model
        return model

    cls_name = config_params.pop('class_name', None)
    if not cls_name:
        e = ConfigError('Component config has no `class_name` nor `ref` fields')
        log.exception(e)
        raise e

    obj = get_model(cls_name)

    if inspect.isclass(obj):
        # find the submodels params recursively
        config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

        try:
            spec = inspect.getfullargspec(obj)
            if 'mode' in spec.args + spec.kwonlyargs or spec.varkw is not None:
                kwargs['mode'] = mode

            component = obj(**dict(config_params, **kwargs))
            if 'id' in config_params:
                _refs[config_params['id']] = component
        except Exception:
            log.exception("Exception in {}".format(obj))
            raise
    else:
        component = obj

    return component
@register('params_search')
class ParamsSearch:
    """
    Class determine the main operations for parameters search
    like finding all changing parameters.

    Args:
        prefix: prefix to determine special keys like "`prefix`_range", "`prefix`_bool", "`prefix`_choice"
        seed: random seed for initialization
        **kwargs: basic config with parameters

    Attributes:
        basic_config: dictionary with initial config with possible values of searched parameters
        prefix: prefix to determine special keys like "`prefix`_range", "`prefix`_bool", "`prefix`_choice"
        paths_to_params: list of lists of keys and/or integers (for list)
                with relative paths to searched parameters
        n_params: number of searched parameters
        eps: threshold value
    """

    def __init__(self, prefix="search", seed: int = None, **kwargs):
        """
        Store a copy of the config, collect paths to all searched parameters
        and seed both ``numpy.random`` and ``random`` when ``seed`` is given.
        """
        self.basic_config = deepcopy(kwargs)
        self.prefix = prefix

        self.paths_to_params = []
        for search_type in [prefix + "_range", prefix + "_choice", prefix + "_bool"]:
            self.paths_to_params.extend(self.find_model_path(self.basic_config, search_type))

        self.n_params = len(self.paths_to_params)

        self.eps = 1e-6

        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)

    def find_model_path(self, config: dict, key_model: str, path: list = None) -> Generator:
        """
        Find paths to all dictionaries in config that contain key 'key_model'.
        The search does not descend into a dictionary that itself contains the key.

        Args:
            config: dictionary
            key_model: key of sub-dictionary to be found
            path: list of keys and/or integers (for list) with relative path (needed for recursion)

        Returns:
            path in config -- list of keys (strings and integers)
        """
        # A fresh list per call: yielded paths must never share one default object.
        if path is None:
            path = []

        if isinstance(config, dict):
            if key_model in config:
                yield path
            else:
                # Snapshot the keys: the generator is lazy and the caller may
                # touch the config between two yields.
                for key in list(config):
                    yield from self.find_model_path(config[key], key_model, path + [key])
        elif isinstance(config, list):
            for i, item in enumerate(config):
                yield from self.find_model_path(item, key_model, path + [i])

    @staticmethod
    def _walk_to_parent(config: Any, path: list) -> Any:
        """
        Follow ``path[:-1]`` and return the container that holds ``path[-1]``.
        Missing dictionary keys on the way are created as empty dictionaries;
        elements that are neither dict nor list are left untouched.
        """
        config_pointer = config
        for el in path[:-1]:
            if isinstance(config_pointer, dict):
                config_pointer = config_pointer.setdefault(el, {})
            elif isinstance(config_pointer, list):
                config_pointer = config_pointer[el]
        return config_pointer

    @staticmethod
    def insert_value_or_dict_into_config(config: dict, path: list,
                                         value: [int, float, str, bool, list, dict, np.ndarray]) -> None:
        """
        Insert value to dictionary determined by path[:-1] in field with key path[-1].
        The config is modified in place.

        Args:
            config: dictionary
            path: list of keys and/or integers (for list)
            value: value to be inserted

        Returns:
            None
        """
        ParamsSearch._walk_to_parent(config, path)[path[-1]] = value

    @staticmethod
    def get_value_from_config(config: dict, path: list) -> Any:
        """
        Return value of config element determined by path

        Args:
            config: dictionary
            path: list of keys and/or integers (for list)

        Returns:
            value (taken from a deep copy, so it is detached from ``config``)
        """
        # Work on a copy: the traversal may create missing keys.
        config_copy = deepcopy(config)
        return ParamsSearch._walk_to_parent(config_copy, path)[path[-1]]

    @staticmethod
    def remove_key_from_config(config: dict, path: list) -> Tuple[dict, Any]:
        """
        Remove config element determined by path

        Args:
            config: dictionary
            path: list of keys and/or integers (for list)

        Returns:
            dictionary without value from path, value from path
        """
        config_copy = deepcopy(config)
        value = ParamsSearch._walk_to_parent(config_copy, path).pop(path[-1])
        return config_copy, value

    def initialize_params_in_config(self, basic_config: dict, paths: List[list]) -> dict:
        """
        Randomly initialize all the changable parameters in config

        Args:
            basic_config: config where changable parameters are dictionaries with keys
                ``prefix`_range`, ``prefix`_bool`, ``prefix`_choice`
            paths: list of paths to changable parameters

        Returns:
            config
        """
        config = deepcopy(basic_config)
        for path_ in paths:
            value = self.get_value_from_config(basic_config, path_)
            if isinstance(value, dict):
                if (value.get(self.prefix + "_choice") or
                        value.get(self.prefix + "_range") or
                        value.get(self.prefix + "_bool")):
                    # Sample the value directly instead of going through
                    # ``sample_params(**{name: ...})``: the last path element can be
                    # a list index, which is not a valid keyword argument name.
                    self.insert_value_or_dict_into_config(config, path_, self._sample_value(value))

        return config

    def sample_params(self, **params) -> dict:
        """
        Sample parameters according to the given possible values

        Args:
            **params: dictionary like
                {"param_0": {"`prefix`_range": [0, 10]},
                "param_1": {"`prefix`_range": [0, 10], "discrete": true},
                "param_2": {"`prefix`_range": [0, 1], "scale": "log"},
                "param_3": {"`prefix`_bool": true},
                "param_4": {"`prefix`_choice": [0, 1, 2, 3]}}

        Returns:
            dictionary with randomly sampled parameters
        """
        if not params:
            return {}

        params_copy = deepcopy(params)
        return {param: self._sample_value(param_val) for param, param_val in params_copy.items()}

    def _sample_value(self, param_val: Any) -> Any:
        """
        Sample one parameter value from its description.
        Values that are not dictionaries, and dictionaries without any
        of the special keys, are returned as they are.
        """
        if not isinstance(param_val, dict):
            return param_val
        if param_val.get(self.prefix + '_bool'):
            return bool(random.choice([True, False]))
        if self.prefix + '_range' in param_val:
            return self._sample_from_ranges(param_val)
        if self.prefix + '_choice' in param_val:
            return random.choice(param_val[self.prefix + '_choice'])
        return param_val

    def _sample_from_ranges(self, opts: dict) -> [int, float]:
        """
        Sample parameters from ranges

        Args:
            opts: dictionary {"`prefix`_range": [0, 10]} or \
                             {"`prefix`_range": [0, 10], "discrete": true} or \
                             {"`prefix`_range": [0, 1], "scale": "log"}

        Returns:
            random parameter value from range
        """
        bounds = opts[self.prefix + '_range']
        from_ = bounds[0]
        to_ = bounds[1]

        if opts.get('scale', None) == 'log':
            sample = self._sample_log(from_, to_)
        else:
            sample = np.random.uniform(from_, to_)

        # Discrete parameters are rounded to the nearest integer.
        if opts.get('discrete', False):
            sample = int(np.round(sample))

        return sample

    @staticmethod
    def _sample_log(from_: float = 0., to_: float = 1.) -> float:
        """
        Sample parameters from ranges with log scale

        Args:
            from_: lower boundary of values
            to_:  upper boundary of values

        Returns:
            random parameters value from range with log scale
        """
        # Uniform in log-space, mapped back with exp.
        sample = np.exp(np.random.uniform(np.log(from_), np.log(to_)))
        return float(sample)
# Directory three levels above the one that contains this file.
_root_path = Path(__file__).resolve().parents[3]
_default_settings_path: Path = _root_path.joinpath('deeppavlov', 'utils', 'settings')
# DP_SETTINGS_PATH overrides the default location when it is set.
_settings_path = Path(os.getenv('DP_SETTINGS_PATH', _default_settings_path)).expanduser().resolve()

if _settings_path.is_file():
    raise FileExistsError(f'DP_SETTINGS_PATH={_settings_path} is a file and not a directory')

# A custom directory nested inside the default one is rejected.
if _default_settings_path in _settings_path.parents:
    raise RecursionError(f'DP_SETTINGS_PATH={_settings_path} is relative'
                         f' to the default settings path {_default_settings_path}')


def get_settings_path() -> Path:
    """Return an absolute path to the DeepPavlov settings directory.

    Missing default settings files are copied there first.
    """
    populate_settings_dir()
    return _settings_path


def populate_settings_dir(force: bool = False) -> bool:
    """
    Copy default ``*.json`` settings files into the settings directory.

    Nothing is done when the settings directory is the default one.

    Args:
        force: if ``True``, replace existing settings files with default ones

    Returns:
        ``True`` if any files were copied and ``False`` otherwise
    """
    if _settings_path == _default_settings_path:
        return False

    copied_any = False
    default_files = list(_default_settings_path.glob('**/*.json'))
    for src in default_files:
        dest = _settings_path / src.relative_to(_default_settings_path)
        # Existing files are only overwritten on request.
        if force or not dest.exists():
            copied_any = True
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(src, dest)
    return copied_any
class RedirectedPrints(redirect_stdout):
    """Context manager that sends ``sys.stdout`` output to another stream
    for the duration of the ``with`` block.

    Args:
        new_target: stream that receives the output; defaults to the
            ``sys.stderr`` object that exists when this class is defined
    """

    def __init__(self, new_target=sys.stderr):
        super().__init__(new_target)
"deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker", "dnnc_pair_generator": "deeppavlov.models.preprocessors.dnnc_preprocessor:PairGenerator", "dnnc_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:Proba2Labels", "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser", "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker", "entity_type_split": "deeppavlov.models.entity_extraction.entity_detection_parser:entity_type_split", "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader", "fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder", "fit_trainer": "deeppavlov.core.trainers.fit_trainer:FitTrainer", "hashing_tfidf_vectorizer": "deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer", "huggingface_dataset_iterator": "deeppavlov.dataset_iterators.huggingface_dataset_iterator:HuggingFaceDatasetIterator", "huggingface_dataset_reader": "deeppavlov.dataset_readers.huggingface_dataset_reader:HuggingFaceDatasetReader", "imdb_reader": "deeppavlov.dataset_readers.imdb_reader:ImdbReader", "joint_tagger_parser": "deeppavlov.models.morpho_syntax_parser.joint:JointTaggerParser", "kenlm_elector": "deeppavlov.models.spelling_correction.electors.kenlm_elector:KenlmElector", "lazy_tokenizer": "deeppavlov.models.tokenizers.lazy_tokenizer:lazy_tokenizer", "lcquad_reader": "deeppavlov.dataset_readers.sq_reader:LCQuADReader", "lemmatized_output_prettifier": "deeppavlov.models.morpho_syntax_parser.syntax_parsing:LemmatizedOutputPrettifier", "line_reader": "deeppavlov.dataset_readers.line_reader:LineReader", "logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker", "mask": "deeppavlov.models.preprocessors.mask:Mask", "morphotagger_dataset_iterator": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator", "morphotagger_dataset_reader": 
"deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader", "multitask_reader":"deeppavlov.dataset_readers.multitask_reader:MultiTaskReader", "multitask_pipeline_preprocessor":"deeppavlov.models.preprocessors.multitask_preprocessor:MultiTaskPipelinePreprocessor", "multitask_transformer":"deeppavlov.models.torch_bert.multitask_transformer:MultiTaskTransformer", "multitask_iterator":"deeppavlov.dataset_iterators.multitask_iterator:MultiTaskIterator", "multi_squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:MultiSquadDatasetReader", "multi_squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadIterator", "multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator", "ner_chunk_model": "deeppavlov.models.entity_extraction.ner_chunker:NerChunkModel", "ner_chunker": "deeppavlov.models.entity_extraction.ner_chunker:NerChunker", "ner_vocab": "deeppavlov.models.preprocessors.ner_preprocessor:NerVocab", "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer", "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer", "nn_trainer": "deeppavlov.core.trainers.nn_trainer:NNTrainer", "odqa_reader": "deeppavlov.dataset_readers.odqa_reader:ODQADataReader", "one_hotter": "deeppavlov.models.preprocessors.one_hotter:OneHotter", "params_search": "deeppavlov.core.common.params_search:ParamsSearch", "paraphraser_reader": "deeppavlov.dataset_readers.paraphraser_reader:ParaphraserReader", "path_ranking_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:PathRankingPreprocessor", "pop_ranker": "deeppavlov.models.doc_retrieval.pop_ranker:PopRanker", "proba2labels": "deeppavlov.models.classifiers.proba2labels:Proba2Labels", "query_formatter": "deeppavlov.models.kbqa.query_generator:QueryFormatter", "query_generator": "deeppavlov.models.kbqa.query_generator:QueryGenerator", "question_sign_checker": 
"deeppavlov.models.entity_extraction.entity_detection_parser:QuestionSignChecker", "re_classifier": "deeppavlov.models.relation_extraction.relation_extraction_bert:REBertModel", "re_postprocessor": "deeppavlov.models.preprocessors.re_preprocessor:REPostprocessor", "re_preprocessor": "deeppavlov.models.preprocessors.re_preprocessor:REPreprocessor", "rel_ranking_infer": "deeppavlov.models.kbqa.rel_ranking_infer:RelRankerInfer", "rel_ranking_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:RelRankingPreprocessor", "rel_ranking_reader": "deeppavlov.dataset_readers.rel_ranking_reader:ParaphraserReader", "response_base_loader": "deeppavlov.models.preprocessors.response_base_loader:ResponseBaseLoader", "ru_adj_to_noun": "deeppavlov.models.kbqa.ru_adj_to_noun:RuAdjToNoun", "rubq_reader": "deeppavlov.dataset_readers.sq_reader:RuBQReader", "rured_reader": "deeppavlov.dataset_readers.rured_reader:RuREDDatasetReader", "russian_words_vocab": "deeppavlov.vocabs.typos:RussianWordsVocab", "sanitizer": "deeppavlov.models.preprocessors.sanitizer:Sanitizer", "sentseg_restore_sent": "deeppavlov.models.preprocessors.sentseg_preprocessor:SentSegRestoreSent", "siamese_iterator": "deeppavlov.dataset_iterators.siamese_iterator:SiameseIterator", "simple_vocab": "deeppavlov.core.data.simple_vocab:SimpleVocabulary", "sklearn_component": "deeppavlov.models.sklearn.sklearn_component:SklearnComponent", "slovnet_syntax_parser": "deeppavlov.models.kbqa.tree_to_sparql:SlovnetSyntaxParser", "spacy_lemmatizer": "deeppavlov.models.morpho_syntax_parser.spacy_lemmatizer:SpacyLemmatizer", "spelling_error_model": "deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel", "spelling_levenshtein": "deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent", "split_tokenizer": "deeppavlov.models.tokenizers.split_tokenizer:SplitTokenizer", "sq_reader": "deeppavlov.dataset_readers.sq_reader:SQReader", "sqlite_iterator": 
"deeppavlov.dataset_iterators.sqlite_iterator:SQLiteDataIterator", "squad_bert_ans_postprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadBertAnsPostprocessor", "squad_bert_ans_preprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadBertAnsPreprocessor", "squad_bert_mapping": "deeppavlov.models.preprocessors.squad_preprocessor:SquadBertMappingPreprocessor", "squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:SquadDatasetReader", "squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:SquadIterator", "static_dictionary": "deeppavlov.vocabs.typos:StaticDictionary", "str_lower": "deeppavlov.models.preprocessors.str_lower:str_lower", "str_token_reverser": "deeppavlov.models.preprocessors.str_token_reverser:StrTokenReverser", "str_utf8_encoder": "deeppavlov.models.preprocessors.str_utf8_encoder:StrUTF8Encoder", "stream_spacy_tokenizer": "deeppavlov.models.tokenizers.spacy_tokenizer:StreamSpacyTokenizer", "string_multiplier": "deeppavlov.models.preprocessors.odqa_preprocessors:StringMultiplier", "template_matcher": "deeppavlov.models.kbqa.template_matcher:TemplateMatcher", "tfidf_ranker": "deeppavlov.models.doc_retrieval.tfidf_ranker:TfidfRanker", "tfidf_weighted": "deeppavlov.models.embedders.tfidf_weighted_embedder:TfidfWeightedEmbedder", "top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector", "torch_bert_ranker": "deeppavlov.models.torch_bert.torch_bert_ranker:TorchBertRankerModel", "torch_bert_ranker_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertRankerPreprocessor", "torch_record_postprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchRecordPostprocessor", "torch_squad_transformers_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchSquadTransformersPreprocessor", "torch_text_classification_model": 
"deeppavlov.models.classifiers.torch_classification_model:TorchTextClassificationModel", "torch_trainer": "deeppavlov.core.trainers.torch_trainer:TorchTrainer", "torch_transformers_classifier": "deeppavlov.models.torch_bert.torch_transformers_classifier:TorchTransformersClassifierModel", "torch_transformers_el_ranker": "deeppavlov.models.torch_bert.torch_transformers_el_ranker:TorchTransformersElRanker", "torch_transformers_entity_ranker_infer": "deeppavlov.models.torch_bert.torch_transformers_el_ranker:TorchTransformersEntityRankerInfer", "torch_transformers_entity_ranker_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersEntityRankerPreprocessor", "torch_transformers_multiplechoice": "deeppavlov.models.torch_bert.torch_transformers_multiplechoice:TorchTransformersMultiplechoiceModel", "torch_transformers_multiplechoice_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersMultiplechoicePreprocessor", "torch_transformers_ner_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersNerPreprocessor", "torch_transformers_nll_ranker": "deeppavlov.models.torch_bert.torch_transformers_nll_ranking:TorchTransformersNLLRanker", "torch_transformers_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersPreprocessor", "torch_transformers_sequence_tagger": "deeppavlov.models.torch_bert.torch_transformers_sequence_tagger:TorchTransformersSequenceTagger", "torch_transformers_squad": "deeppavlov.models.torch_bert.torch_transformers_squad:TorchTransformersSquad", "torch_transformers_syntax_parser": "deeppavlov.models.torch_bert.torch_transformers_syntax_parser:TorchTransformersSyntaxParser", "transformers_bert_embedder": "deeppavlov.models.embedders.transformers_embedder:TransformersBertEmbedder", "transformers_bert_preprocessor": 
"deeppavlov.models.preprocessors.transformers_preprocessor:TransformersBertPreprocessor", "tree_to_sparql": "deeppavlov.models.kbqa.tree_to_sparql:TreeToSparql", "typos_custom_reader": "deeppavlov.dataset_readers.typos_reader:TyposCustom", "typos_iterator": "deeppavlov.dataset_iterators.typos_iterator:TyposDatasetIterator", "typos_kartaslov_reader": "deeppavlov.dataset_readers.typos_reader:TyposKartaslov", "typos_wikipedia_reader": "deeppavlov.dataset_readers.typos_reader:TyposWikipedia", "ubuntu_v2_reader": "deeppavlov.dataset_readers.ubuntu_v2_reader:UbuntuV2Reader", "wiki_parser": "deeppavlov.models.kbqa.wiki_parser:WikiParser", "wiki_sqlite_vocab": "deeppavlov.vocabs.wiki_sqlite:WikiSQLiteVocab", "wikitionary_100K_vocab": "deeppavlov.vocabs.typos:Wiki100KDictionary" } ================================================ FILE: deeppavlov/core/common/registry.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
logger = getLogger(__name__)

# The static registry (short name -> "module:ClassName") is shipped as a JSON file
# next to this module; when the file is missing the registry starts out empty.
_registry_path = Path(__file__).parent / 'registry.json'
if _registry_path.exists():
    with _registry_path.open(encoding='utf-8') as f:
        _REGISTRY = json.load(f)
else:
    _REGISTRY = {}

# Reverse lookup built once from the file-backed registry: "module:ClassName" -> short name.
inverted_registry = {val: key for key, val in _REGISTRY.items()}


def cls_from_str(name: str) -> type:
    """Import and return the object described by a ``module.submodules:ClassName`` string.

    Args:
        name: class description, a module path and an attribute name separated by one ``:``

    Returns:
        the attribute ``ClassName`` of the imported module

    Raises:
        ConfigError: if ``name`` does not split into exactly two parts on ``:``
    """
    try:
        module_name, cls_name = name.split(':')
    except ValueError:
        raise ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                          .format(name))

    module = importlib.import_module(module_name)
    return getattr(module, cls_name)


def register(name: str = None) -> type:
    """Build a class decorator that records the class in the registry.

    Registered classes can later be referenced by their short name from a JSON
    configuration file.

    Args:
        name: registry name for the class; when omitted, the class ``__name__``
            (as returned by :func:`short_name`) is used as is

    Returns:
        a decorator that registers the class it receives and returns it unchanged
    """

    def decorator(model_cls_name: type) -> type:
        # NB: despite the (historical) parameter name, the argument is the class itself.
        registry_key = name or short_name(model_cls_name)
        cls_path = model_cls_name.__module__ + ':' + model_cls_name.__name__

        # Warn only when the key is re-bound to a different class.
        if registry_key in _REGISTRY and _REGISTRY[registry_key] != cls_path:
            logger.warning('Registry name "{}" has been already registered and will be overwritten.'
                           .format(registry_key))

        _REGISTRY[registry_key] = cls_path
        return model_cls_name

    return decorator


def short_name(cls: type) -> str:
    """Return the bare class name, i.e. the part of ``cls.__name__`` after the last dot."""
    return cls.__name__.rpartition('.')[2]


def get_model(name: str) -> type:
    """Return the class registered under ``name``.

    A name that is not registered but contains ``:`` is treated as a
    ``module:ClassName`` description and imported directly.

    Raises:
        ConfigError: if ``name`` is neither registered nor a ``module:ClassName`` description
    """
    if name in _REGISTRY:
        return cls_from_str(_REGISTRY[name])
    if ':' not in name:
        raise ConfigError("Model {} is not registered.".format(name))
    return cls_from_str(name)


def list_models() -> list:
    """Return the names of all currently registered classes."""
    return list(_REGISTRY.keys())
deeppavlov/core/common/requirements_registry.json ================================================ { "answer_types_extractor": [ "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "chu_liu_edmonds_transformer": [ "{DEEPPAVLOV_PATH}/requirements/dependency_decoding.txt" ], "bpr": [ "{DEEPPAVLOV_PATH}/requirements/faiss.txt", "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "entity_linker": [ "{DEEPPAVLOV_PATH}/requirements/hdt.txt", "{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt", "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "fasttext": [ "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" ], "huggingface_dataset_iterator": [ "{DEEPPAVLOV_PATH}/requirements/datasets.txt" ], "huggingface_dataset_reader": [ "{DEEPPAVLOV_PATH}/requirements/datasets.txt" ], "kenlm_elector": [ "{DEEPPAVLOV_PATH}/requirements/kenlm.txt" ], "ner_chunk_model": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "ner_chunker": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "nltk_moses_tokenizer": [ "{DEEPPAVLOV_PATH}/requirements/sacremoses.txt" ], "path_ranking_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "query_generator": [ "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", "{DEEPPAVLOV_PATH}/requirements/hdt.txt", "{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt", "{DEEPPAVLOV_PATH}/requirements/whapi.txt" ], "re_classifier": [ "{DEEPPAVLOV_PATH}/requirements/opt_einsum.txt", "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "re_postprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "re_preprocessor": [ 
"{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "rel_ranking_infer": [ "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", "{DEEPPAVLOV_PATH}/requirements/hdt.txt" ], "rel_ranking_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "ru_adj_to_noun": [ "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "russian_words_vocab": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "slovnet_syntax_parser": [ "{DEEPPAVLOV_PATH}/requirements/slovnet.txt", "{DEEPPAVLOV_PATH}/requirements/razdel.txt", "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "spacy_lemmatizer": [ "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "spelling_error_model": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "spelling_levenshtein": [ "{DEEPPAVLOV_PATH}/requirements/sortedcontainers.txt" ], "static_dictionary": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "stream_spacy_tokenizer": [ "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "torch_bert_ranker": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_bert_ranker_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_record_postprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_squad_transformers_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_text_classification_model": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt" ], "torch_transformers_classifier": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "multitask_transformer": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", 
"{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_el_ranker": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_entity_ranker_infer": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_entity_ranker_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_multiplechoice": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_multiplechoice_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_ner_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt", "{DEEPPAVLOV_PATH}/requirements/sentencepiece.txt", "{DEEPPAVLOV_PATH}/requirements/protobuf.txt" ], "torch_transformers_nll_ranker": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_sequence_tagger": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/torchcrf.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_syntax_parser": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/torchcrf.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "multitask_pipeline_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/torchcrf.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "torch_transformers_squad": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "transformers_bert_embedder": [ 
"{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "transformers_bert_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "tree_to_sparql": [ "{DEEPPAVLOV_PATH}/requirements/udapi.txt", "{DEEPPAVLOV_PATH}/requirements/razdel.txt", "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "typos_custom_reader": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "typos_kartaslov_reader": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "typos_wikipedia_reader": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "wiki_parser": [ "{DEEPPAVLOV_PATH}/requirements/hdt.txt" ], "wikitionary_100K_vocab": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ] } ================================================ FILE: deeppavlov/core/data/__init__.py ================================================ ================================================ FILE: deeppavlov/core/data/data_fitting_iterator.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from logging import getLogger from random import Random from typing import List, Generator, Tuple, Any, Optional from deeppavlov.core.common.registry import register logger = getLogger(__name__) @register('data_fitting_iterator') class DataFittingIterator: """Dataset iterator for fitting estimator models, like vocabs, kNN, vectorizers. 
@register('data_fitting_iterator')
class DataFittingIterator:
    """Dataset iterator for fitting estimator models, like vocabs, kNN, vectorizers.

    The data is a list of strings (documents); batches are generated on demand,
    which is convenient for large datasets.

    Args:
        data: list of documents
        doc_ids: provided document ids
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching

    Attributes:
        shuffle: whether to shuffle data during batching
        random: instance of :class:`Random` initialized with a seed
        data: list of documents
        doc_ids: ids provided by a user or, if none were given, generated automatically
    """

    def __init__(self, data: List[str], doc_ids: List[Any] = None,
                 seed: int = None, shuffle: bool = True, *args, **kwargs) -> None:
        self.shuffle = shuffle
        self.random = Random(seed)
        self.data = data
        # ``self.data`` must be set before ids are generated: ``get_doc_ids`` reads it.
        self.doc_ids = doc_ids if doc_ids else self.get_doc_ids()

    def get_doc_ids(self):
        """Generate doc ids.

        Returns:
            consecutive integer ids, one per document in ``self.data``
        """
        return [*range(len(self.data))]

    def get_doc_content(self, doc_id: Any) -> Optional[str]:
        """Get doc content by id.

        Args:
            doc_id: an id of the doc which content should be extracted

        Returns:
            ``self.data[doc_id]``; an unknown id raises whatever the indexing raises
        """
        return self.data[doc_id]

    def gen_batches(self, batch_size: int, shuffle: bool = None) \
            -> Generator[Tuple[List[str], List[int]], Any, None]:
        """Generate batches of documents.

        Args:
            batch_size: a number of samples in a single batch; a non-positive value
                produces a single batch with all documents
            shuffle: whether to shuffle data during batching, ``None`` means
                "use ``self.shuffle``"

        Yields:
            a tuple of documents and their ids
        """
        do_shuffle = self.shuffle if shuffle is None else shuffle

        if do_shuffle:
            ordered_ids = self.random.sample(self.doc_ids, len(self.doc_ids))
        else:
            ordered_ids = self.doc_ids

        # The id batches are materialised before the first ``yield``.
        if batch_size > 0:
            starts = range(0, len(ordered_ids), batch_size)
            id_batches = [ordered_ids[start:start + batch_size] for start in starts]
        else:
            id_batches = [ordered_ids]

        for id_batch in id_batches:
            contents = [self.get_doc_content(doc_id) for doc_id in id_batch]
            yield contents, id_batch

    def get_instances(self):
        """Get all data.

        Returns:
            a tuple of all documents and a copy of the list of their ids
        """
        all_ids = list(self.doc_ids)
        all_docs = [self.get_doc_content(doc_id) for doc_id in all_ids]
        return all_docs, all_ids
@register('data_learning_iterator')
class DataLearningIterator:
    """Dataset iterator for learning models, e. g. neural networks.

    Args:
        data: list of (x, y) pairs for every data type in ``'train'``, ``'valid'`` and ``'test'``
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching

    Attributes:
        shuffle: whether to shuffle data during batching
        random: instance of ``Random`` initialized with a seed
        train: preprocessed ``'train'`` samples
        valid: preprocessed ``'valid'`` samples
        test: preprocessed ``'test'`` samples
        data: dictionary with ``'train'``, ``'valid'``, ``'test'`` and ``'all'`` samples
    """

    def split(self, *args, **kwargs):
        """ Manipulate self.train, self.valid, and self.test into their final form. """
        pass

    def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]:
        """ Transform the data for a specific data type (e.g. ``'train'``). """
        return data

    def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None, shuffle: bool = True,
                 *args, **kwargs) -> None:
        self.shuffle = shuffle

        self.random = Random(seed)

        # Missing data types default to an empty list of samples.
        self.train = self.preprocess(data.get('train', []), *args, **kwargs)
        self.valid = self.preprocess(data.get('valid', []), *args, **kwargs)
        self.test = self.preprocess(data.get('test', []), *args, **kwargs)
        # Subclasses may re-distribute samples between the three parts here.
        self.split(*args, **kwargs)
        self.data = {
            'train': self.train,
            'valid': self.valid,
            'test': self.test,
            'all': self.train + self.test + self.valid
        }

    def gen_batches(self, batch_size: int, data_type: str = 'train',
                    shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]:
        """Generate batches of inputs and expected output to train neural networks

        Args:
            batch_size: number of samples in batch; any non-positive value means
                "all samples in a single batch"
            data_type: can be either ``'train'``, ``'test'``, ``'valid'`` or ``'all'``
            shuffle: whether to shuffle dataset before batching, ``None`` means
                "use ``self.shuffle``"

        Yields:
            a tuple of a batch of inputs and a batch of expected outputs
        """
        if shuffle is None:
            shuffle = self.shuffle

        data = self.data[data_type]
        data_len = len(data)

        if data_len == 0:
            return

        order = list(range(data_len))
        if shuffle:
            self.random.shuffle(order)

        # ``batch_size == 0`` used to crash with ZeroDivisionError; treat it like a
        # negative size (one batch with everything), as DataFittingIterator does.
        if batch_size <= 0:
            batch_size = data_len

        for start in range(0, data_len, batch_size):
            yield tuple(zip(*[data[o] for o in order[start:start + batch_size]]))

    def get_instances(self, data_type: str = 'train') -> Tuple[tuple, tuple]:
        """Get all data for a selected data type

        Args:
            data_type (str): can be either ``'train'``, ``'test'``, ``'valid'`` or ``'all'``

        Returns:
             a tuple of all inputs for a data type and all expected outputs for a data type
             (an empty tuple when there are no samples)
        """
        data = self.data[data_type]
        return tuple(zip(*data))
class DatasetReader:
    """Abstract base for components that read data from some location and build a dataset."""

    def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Read the data stored at ``data_path``.

        Implementations are expected to return a dictionary that maps every data
        type (``train``, ``valid`` and ``test``) to a list of
        ``(input, correct output)`` tuples.

        Raises:
            NotImplementedError: always, the method has to be overridden
        """
        raise NotImplementedError
@register('simple_vocab')
class SimpleVocabulary(Estimator):
    """Implements simple vocabulary.

    Parameters:
        special_tokens: tuple of tokens that shouldn't be counted.
        max_tokens: upper bound for number of tokens in the vocabulary.
        min_freq: minimal count of a token (except special tokens).
        pad_with_zeros: if True, then batch of elements will be padded with zeros up to length of
            the longest element in batch.
        unk_token: label assigned to unknown tokens.
        freq_drop_load: if True, then frequencies of tokens are set to min_freq on the model load.
    """

    def __init__(self,
                 special_tokens: Tuple[str, ...] = tuple(),
                 max_tokens: int = 2 ** 30,
                 min_freq: int = 0,
                 pad_with_zeros: bool = False,
                 unk_token: Optional[str] = None,
                 freq_drop_load: Optional[bool] = None,
                 *args,
                 **kwargs):
        super().__init__(**kwargs)
        self.special_tokens = special_tokens
        self._max_tokens = max_tokens
        self._min_freq = min_freq
        self._pad_with_zeros = pad_with_zeros
        self.unk_token = unk_token
        self.freq_drop_load = freq_drop_load
        self.reset()
        if self.load_path:
            self.load()

    def fit(self, *args):
        """Rebuild the vocabulary from the given (possibly nested) batches of tokens.

        Special tokens get the first indices; the remaining indices go to the most
        frequent tokens whose count is at least ``min_freq``.
        """
        self.reset()
        tokens = chain(*args)
        # filter(None, <>) -- to filter empty tokens
        self.freqs = Counter(filter(None, flatten_str_batch(tokens)))
        for special_token in self.special_tokens:
            self._t2i[special_token] = self.count
            self._i2t.append(special_token)
            self.count += 1
        for token, freq in self.freqs.most_common()[:self._max_tokens]:
            if token in self.special_tokens:
                continue
            if freq >= self._min_freq:
                self._t2i[token] = self.count
                self._i2t.append(token)
                self.count += 1

    def _add_tokens_with_freqs(self, tokens, freqs):
        """Append tokens with known frequencies, skipping non-special tokens rarer than ``min_freq``."""
        self.freqs = Counter()
        self.freqs.update(dict(zip(tokens, freqs)))
        for token, freq in zip(tokens, freqs):
            if freq >= self._min_freq or token in self.special_tokens:
                self._t2i[token] = self.count
                self._i2t.append(token)
                self.count += 1

    def __call__(self, batch, is_top=True, **kwargs):
        """Recursively convert tokens to indices (or indices to tokens).

        Args:
            batch: a single token/index or an arbitrarily nested iterable of them
            is_top: True for the outermost call only; padding is applied only there

        Returns:
            the looked up structure; a batch consisting only of ``None`` is returned as is
        """
        if isinstance(batch, Iterable) and not isinstance(batch, str):
            if all([k is None for k in batch]):
                return batch
            else:
                looked_up_batch = [self(sample, is_top=False) for sample in batch]
        else:
            return self[batch]
        if self._pad_with_zeros and is_top and not is_str_batch(looked_up_batch):
            looked_up_batch = zero_pad(looked_up_batch)

        return looked_up_batch

    def save(self):
        """Write the vocabulary to ``save_path`` as ``token<TAB>count`` lines in index order."""
        log.info("[saving vocabulary to {}]".format(self.save_path))
        with self.save_path.open('wt', encoding='utf8') as f:
            for n in range(len(self)):
                token = self._i2t[n]
                cnt = self.freqs[token]
                f.write('{}\t{:d}\n'.format(token, cnt))

    def load(self):
        """Reset the vocabulary and read it back from ``load_path``.

        Raises:
            ConfigError: if ``load_path`` is not provided, or if neither the file nor
                its parent directory exists
        """
        self.reset()
        if self.load_path:
            if self.load_path.is_file():
                log.debug("[loading vocabulary from {}]".format(self.load_path))
                tokens, counts = [], []
                # Open through a context manager so the handle is always closed,
                # even if a malformed line makes parsing fail.
                with self.load_path.open('r', encoding='utf8') as f:
                    for ln in f:
                        token, cnt = self.load_line(ln)
                        tokens.append(token)
                        counts.append(int(cnt))
                self._add_tokens_with_freqs(tokens, counts)
            elif not self.load_path.parent.is_dir():
                raise ConfigError("Provided `load_path` for {} doesn't exist!".format(
                    self.__class__.__name__))
        else:
            raise ConfigError("`load_path` for {} is not provided!".format(self))

    def load_line(self, ln):
        """Parse one line of a vocabulary file into a ``(token, count)`` pair.

        With ``freq_drop_load`` the token is the first whitespace-separated field and
        the count is replaced with ``min_freq``; otherwise the line is split on its
        last tab (the count keeps the trailing newline, which ``int()`` tolerates).
        """
        if self.freq_drop_load:
            token = ln.strip().split()[0]
            cnt = self._min_freq
        else:
            token, cnt = ln.rsplit('\t', 1)
        return token, cnt

    @property
    def len(self):
        """Number of tokens in the vocabulary (same as ``len(self)``)."""
        return len(self)

    def keys(self):
        """Return a generator over tokens in index order."""
        return (self[n] for n in range(self.len))

    def values(self):
        """Return the list of all token indices."""
        return list(range(self.len))

    def items(self):
        """Return an iterator over ``(token, index)`` pairs."""
        return zip(self.keys(), self.values())

    def __getitem__(self, key):
        """Map an integer index to its token or a string token to its index."""
        if isinstance(key, (int, np.integer)):
            return self._i2t[key]
        elif isinstance(key, str):
            return self._t2i[key]
        else:
            raise NotImplementedError("not implemented for type `{}`".format(type(key)))

    def __contains__(self, item):
        return item in self._t2i

    def __len__(self):
        return len(self._i2t)

    def reset(self):
        """Drop all tokens and frequencies.

        Unknown tokens resolve to the index of ``unk_token`` when it is one of the
        special tokens, and to ``0`` otherwise.
        """
        self.freqs = None
        unk_index = 0
        if self.unk_token in self.special_tokens:
            unk_index = self.special_tokens.index(self.unk_token)
        # NB: looking up a missing token through the defaultdict also inserts it into ``_t2i``.
        self._t2i = defaultdict(lambda: unk_index)
        self._i2t = []
        self.count = 0

    def idxs2toks(self, idxs):
        """Convert an iterable of indices to the list of corresponding tokens."""
        return [self[idx] for idx in idxs]
def get_download_token() -> str:
    """Return the download token stored in the ``~/.deeppavlov/token`` file.

    When the token file does not exist, it is created (together with the
    ``~/.deeppavlov`` directory) and filled with a random URL-safe text string
    generated from 32 random bytes.

    Returns:
        the stripped content of ``~/.deeppavlov/token``.

    """
    config_dir = Path.home() / '.deeppavlov'
    token_path = config_dir / 'token'

    if not token_path.exists():
        # A plain file occupying the place of the directory is removed first.
        if config_dir.is_file():
            config_dir.unlink()
        config_dir.mkdir(parents=True, exist_ok=True)
        token_path.write_text(secrets.token_urlsafe(32), encoding='utf8')

    stored_token = token_path.read_text(encoding='utf8')
    return stored_token.strip()


def s3_download(url: str, destination: str) -> None:
    """Download a file from an Amazon S3 path of the ``s3://bucket/key`` form.

    Requires the boto3 library to be installed and AWS credentials being set
    via environment variables or a credentials file. A custom endpoint is taken
    from the ``AWS_ENDPOINT_URL`` environment variable when it is set.

    Args:
        url: The source URL.
        destination: Path to the file destination (including file name).

    """
    import boto3

    endpoint_url = os.environ.get('AWS_ENDPOINT_URL')
    s3 = boto3.resource('s3', endpoint_url=endpoint_url)

    # Drop the leading 's3://' and separate the bucket name from the object key.
    bucket, key = url[5:].split('/', maxsplit=1)
    remote_object = s3.Object(bucket, key)
    size_in_bytes = remote_object.content_length

    with tqdm(total=size_in_bytes, unit='B', unit_scale=True) as progress:
        remote_object.download_file(destination, Callback=progress.update)
def simple_download(url: str, destination: Union[Path, str], headers: Optional[dict] = None, n_tries: int = 3) -> None:
    """Download a file from URL to target location.

    Displays a progress bar to the terminal during the download process. The data
    is written to a ``<destination>.part`` file that is renamed on success; a
    leftover partial file is resumed with an HTTP ``Range`` request.

    Args:
        url: The source URL.
        destination: Path to the file destination (including file name).
        headers: Headers for file server. The dictionary is never modified.
        n_tries: Number of retries if download fails.

    Raises:
        RuntimeError: (after all retries) if the server answers with a non-200 status
            code or does not support resuming a partial download.

    """
    try:
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        log.info('Downloading from {} to {}'.format(url, destination))

        if url.startswith('s3://'):
            return s3_download(url, str(destination))

        chunk_size = 32 * 1024
        temporary = destination.with_suffix(destination.suffix + '.part')

        r = requests.get(url, stream=True, headers=headers)
        if r.status_code != 200:
            raise RuntimeError(f'Got status code {r.status_code} when trying to download {url}')
        total_length = int(r.headers.get('content-length', 0))

        if temporary.exists() and temporary.stat().st_size > total_length:
            temporary.write_bytes(b'')  # clearing temporary file when total_length is inconsistent

        with temporary.open('ab') as f:
            downloaded = f.tell()
            if downloaded != 0:
                log.warning(f'Found a partial download {temporary}')
            with tqdm(initial=downloaded, total=total_length, unit='B', unit_scale=True) as pbar:
                while True:
                    if downloaded != 0:
                        log.warning(f'Download stopped abruptly, trying to resume from {downloaded} '
                                    f'to reach {total_length}')
                        # Build the resume headers on a copy: ``headers`` may be None
                        # (the default), and the caller's dict must not keep a stale
                        # ``Range`` entry that would break subsequent retries.
                        resume_headers = dict(headers or {})
                        resume_headers['Range'] = f'bytes={downloaded}-'
                        r = requests.get(url, headers=resume_headers, stream=True)
                        if 'content-length' not in r.headers or \
                                total_length - downloaded != int(r.headers['content-length']):
                            raise RuntimeError('It looks like the server does not support resuming downloads.')

                    try:
                        for chunk in r.iter_content(chunk_size=chunk_size):
                            if chunk:  # filter out keep-alive new chunks
                                downloaded += len(chunk)
                                pbar.update(len(chunk))
                                f.write(chunk)
                    except requests.exceptions.ChunkedEncodingError:
                        # Nothing was received yet: simply repeat the initial request.
                        if downloaded == 0:
                            r = requests.get(url, stream=True, headers=headers)

                    if downloaded >= total_length:
                        # Note that total_length is 0 if the server didn't return the content length,
                        # in this case we perform just one iteration and assume that we are done.
                        break

        temporary.rename(destination)
    except Exception as e:
        if n_tries > 0:
            log.warning(f'Download failed: {e}, retrying')
            simple_download(url, destination, headers, n_tries - 1)
        else:
            raise


def download(dest_file_path: Union[List[Union[str, Path]], str, Path], source_url: str,
             force_download: bool = True, headers: Optional[dict] = None) -> None:
    """Download a file from URL to one or several target locations.

    The file is downloaded once (or taken from the ``DP_CACHE_DIR`` cache directory
    when that environment variable is set) and then copied to the other destinations.

    Args:
        dest_file_path: Path or list of paths to the file destination (including file name).
        source_url: The source URL.
        force_download: Download file if it already exists, or not.
        headers: Headers for file server.

    """

    if isinstance(dest_file_path, list):
        dest_file_paths = [Path(path) for path in dest_file_path]
    else:
        dest_file_paths = [Path(dest_file_path).absolute()]

    if not force_download:
        # Keep only the destinations that are still missing.
        to_check = list(dest_file_paths)
        dest_file_paths = []
        for p in to_check:
            if p.exists():
                log.info(f'File already exists in {p}')
            else:
                dest_file_paths.append(p)

    if dest_file_paths:
        cache_dir = os.getenv('DP_CACHE_DIR')
        cached_exists = False
        if cache_dir:
            # Cached files are named after a prefix of the md5 hash of the URL.
            first_dest_path = Path(cache_dir) / md5(source_url.encode('utf8')).hexdigest()[:15]
            cached_exists = first_dest_path.exists()
        else:
            first_dest_path = dest_file_paths.pop()

        if not cached_exists:
            first_dest_path.parent.mkdir(parents=True, exist_ok=True)

            simple_download(source_url, first_dest_path, headers)
        else:
            log.info(f'Found cached {source_url} in {first_dest_path}')

        for dest_path in dest_file_paths:
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(first_dest_path), str(dest_path))


def untar(file_path: Union[Path, str], extract_folder: Optional[Union[Path, str]] = None) -> None:
    """Simple tar archive extractor.

    Args:
        file_path: Path to the tar file to be extracted.
        extract_folder: Folder to which the files will be extracted.
            Defaults to the folder that contains the archive.

    """
    file_path = Path(file_path)
    if extract_folder is None:
        extract_folder = file_path.parent
    extract_folder = Path(extract_folder)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): ``extractall`` trusts member paths; for archives from untrusted
    # sources consider the ``filter`` argument on Python versions that provide it.
    with tarfile.open(file_path) as tar:
        tar.extractall(extract_folder)


def ungzip(file_path: Union[Path, str], extract_path: Optional[Union[Path, str]] = None) -> None:
    """Simple .gz archive extractor.

    Args:
        file_path: Path to the gzip file to be extracted.
        extract_path: Path where the file will be extracted.
            Defaults to ``file_path`` without its last suffix.

    """
    chunk_size = 16 * 1024
    file_path = Path(file_path)
    if extract_path is None:
        extract_path = file_path.with_suffix('')
    extract_path = Path(extract_path)

    with gzip.open(file_path, 'rb') as fin, extract_path.open('wb') as fout:
        # Stream the decompressed data in fixed-size chunks.
        shutil.copyfileobj(fin, fout, chunk_size)
untar(arch_file_path, extracted_path) elif file_name.endswith('.gz'): ungzip(arch_file_path, extracted_path / Path(file_name).with_suffix('').name) elif file_name.endswith('.zip'): with zipfile.ZipFile(arch_file_path, 'r') as zip_ref: zip_ref.extractall(extracted_path) else: raise RuntimeError(f'Trying to extract an unknown type of archive {file_name}') if not cache_dir: arch_file_path.unlink() for extract_path in extract_paths: for src in extracted_path.iterdir(): dest = extract_path / src.name if src.is_dir(): _copytree(src, dest) else: extract_path.mkdir(parents=True, exist_ok=True) shutil.copy(str(src), str(dest)) def _copytree(src: Path, dest: Path) -> None: """Recursively copies directory. Destination directory could exist (unlike if we used shutil.copytree). Args: src: Path to copied directory. dest: Path to destination directory. """ dest.mkdir(parents=True, exist_ok=True) for f in src.iterdir(): f_dest = dest / f.name if f.is_dir(): _copytree(f, f_dest) else: shutil.copy(str(f), str(f_dest)) def file_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Optional[str]: """Return md5 hash value for file contents. Args: fpath: Path to file. chunk_size: md5 object updated by ``chunk_size`` bytes from file. Returns: None if ``fpath`` does not point to a file, else returns md5 hash value as string. """ fpath = Path(fpath) if not fpath.is_file(): return None file_hash = md5() with fpath.open('rb') as f: for chunk in iter(lambda: f.read(chunk_size), b""): file_hash.update(chunk) return file_hash.hexdigest() def mark_done(path: Union[Path, str]) -> None: """Create ``.done`` empty file in the directory. Args: path: Path to directory. Raises: NotADirectoryError: If ``path`` does not point to a directory. """ path = Path(path) if not path.is_dir(): raise NotADirectoryError(f"Not a directory: '{path}'") mark = path / _MARK_DONE mark.touch(exist_ok=True) def is_done(path: Union[Path, str]) -> bool: """Check if ``.done`` file exists in directory. 
Args: path: Path to directory. Returns: True if directory contains ``.done`` file, False otherwise. """ mark = Path(path) / _MARK_DONE return mark.is_file() def _get_all_dimensions(batch: Sequence, level: int = 0, res: Optional[List[List[int]]] = None) -> List[List[int]]: """Return all presented element sizes of each dimension. Args: batch: Data array. level: Recursion level. res: List containing element sizes of each dimension. Return: List, i-th element of which is list containing all presented sized of batch's i-th dimension. Examples: >>> x = [[[1], [2, 3]], [[4], [5, 6, 7], [8, 9]]] >>> _get_all_dimensions(x) [[2], [2, 3], [1, 2, 1, 3, 2]] """ if not level: res = [[len(batch)]] if len(batch) and isinstance(batch[0], Sized) and not isinstance(batch[0], str): level += 1 if len(res) <= level: res.append([]) for item in batch: res[level].append(len(item)) _get_all_dimensions(item, level, res) return res def get_dimensions(batch: Sequence) -> List[int]: """Return maximal size of each batch dimension.""" return list(map(max, _get_all_dimensions(batch))) def zero_pad(batch: Sequence, zp_batch: Optional[np.ndarray] = None, dtype: type = np.float32, padding: Union[int, float] = 0) -> np.ndarray: """Fills the end of each array item to make its length maximal along each dimension. Args: batch: Initial array. zp_batch: Padded array. dtype = Type of padded array. padding = Number to will initial array with. Returns: Padded array. 
def zero_pad_truncate(batch: Sequence[Sequence[Union[int, float, np.integer, np.floating,
                                                     Sequence[Union[int, float, np.integer, np.floating]]]]],
                      max_len: int, pad: str = 'post', trunc: str = 'post',
                      dtype: Optional[Union[type, str]] = None) -> np.ndarray:
    """Pad with zeros or truncate every batch item to exactly ``max_len`` elements.

    Args:
        batch: assumes a batch of lists of word indexes or their vector representations
        max_len: resulting length of every batch item
        pad: how to pad shorter batch items: can be ``'post'`` or ``'pre'``
        trunc: how to truncate a batch item: can be ``'post'`` or ``'pre'``
        dtype: overrides dtype for the resulting ``ndarray`` if specified,
         otherwise ``np.int32`` is used for 2-d arrays and ``np.float32`` — for 3-d arrays

    Returns:
        a 2-d array of size ``(len(batch), max_len)`` or a 3-d array of size
        ``(len(batch), max_len, vector_size)`` where ``vector_size`` is the length of
        the first element of the first non-empty batch item

    """
    # Empty items carry no information about the element type, so look at the first non-empty one.
    first_item = next((item for item in batch if len(item) > 0), None)
    # ndarray behaves like a Sequence without actually being one
    if first_item is not None and isinstance(first_item[0], Collection):
        size = (len(batch), max_len, len(first_item[0]))
        dtype = dtype or np.float32
    else:
        size = (len(batch), max_len)
        dtype = dtype or np.int32

    padded_batch = np.zeros(size, dtype=dtype)
    for i, batch_item in enumerate(batch):
        item_len = len(batch_item)
        if item_len == 0:
            # nothing to copy, the row stays all zeros
            continue
        if item_len > max_len:  # trunc
            # explicit start index: ``slice(-max_len, None)`` would select everything for max_len == 0
            start = 0 if trunc == 'post' else item_len - max_len
            padded_batch[i] = batch_item[start:start + max_len]
        elif pad == 'post':  # pad
            padded_batch[i, :item_len] = batch_item
        else:
            padded_batch[i, max_len - item_len:] = batch_item

    return padded_batch
def update_dict_recursive(editable_dict: dict, editing_dict: Mapping) -> None:
    """Updates dict recursively.

    You need to use this function to update dictionary if depth of editing_dict is more then 1.
    Nested mappings are merged key by key, any other value overwrites the existing one.

    Args:
        editable_dict: Dictionary to edit, it is modified in place.
        editing_dict: Dictionary containing edits.

    """
    for k, v in editing_dict.items():
        # ``collections.Mapping`` alias was removed in Python 3.10, use the ABC that is already
        # in scope for the annotations.
        if isinstance(v, Mapping):
            nested = editable_dict.get(k)
            if not isinstance(nested, Mapping):
                # A missing (or scalar) value has to be replaced by a dict that is actually
                # stored in ``editable_dict``: merging into a temporary one loses the edits.
                nested = editable_dict[k] = {}
            update_dict_recursive(nested, v)
        else:
            editable_dict[k] = v
class Component(metaclass=ABCMeta):
    """Abstract class for all callables that could be used in Chainer's pipe."""

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """Process the inputs; has to be implemented by every concrete component."""

    def reset(self):
        """Hook without any default action."""
        return None

    def destroy(self):
        """Delete every instance attribute, calling ``destroy`` first on those that provide it."""
        # snapshot of the names: the attribute dictionary shrinks while we iterate
        for name in tuple(vars(self)):
            member = getattr(self, name)
            if hasattr(member, 'destroy'):
                member.destroy()
            delattr(self, name)
class NNModel(Component, Serializable):
    """Abstract class for deep learning components."""

    @abstractmethod
    def train_on_batch(self, x: list, y: list):
        """Run a training step on a batch; has to be implemented by every concrete model.

        Args:
            x: batch of inputs
            y: batch of expected outputs
        """

    def process_event(self, event_name, data):
        """Hook for events reported during training; does nothing by default.

        Args:
            event_name: name of the event
            data: event data
        """
        return None
class Serializable(metaclass=ABCMeta):
    """Abstract base class that expresses the interface for all models that can serialize data to a path."""

    def __init__(self, save_path: Optional[Union[str, Path]], load_path: Optional[Union[str, Path]] = None,
                 mode: str = 'infer',
                 *args, **kwargs) -> None:
        """Resolve save and load paths and create the parent directory of the save path.

        Args:
            save_path: where the model is saved, ``None`` or empty if it is never saved
            load_path: where the model is loaded from; outside of the ``'train'`` mode
                the save path is used when it is omitted
            mode: running mode, everything except ``'train'`` is treated the same way here
        """
        class_name = self.__class__.__name__
        not_training = mode != 'train'

        self.save_path = expand_path(save_path) if save_path else None
        if save_path:
            self.save_path.parent.mkdir(parents=True, exist_ok=True)

        if load_path:
            self.load_path = expand_path(load_path)
            if not_training and self.save_path and self.load_path != self.save_path:
                log.warning(f"Load path '{self.load_path}' differs from save path '{self.save_path}' "
                            f"in '{mode}' mode for {class_name}.")
            return

        if not_training and self.save_path:
            self.load_path = self.save_path
            log.warning(f"No load path is set for {class_name} in '{mode}' mode. Using save path instead")
            return

        self.load_path = None
        log.warning(f"No load path is set for {class_name}!")

    @abstractmethod
    def save(self, *args, **kwargs):
        """Serialize the model; has to be implemented by every concrete class."""

    @abstractmethod
    def load(self, *args, **kwargs):
        """Deserialize the model; has to be implemented by every concrete class."""
class TorchModel(NNModel):
    """Class implements torch model's main methods.

    Args:
        model: torch.nn.Module-based neural network model
        device: device to use
        optimizer: name of `torch.optim` optimizer
        optimizer_parameters: dictionary with optimizer parameters
        learning_rate_drop_patience: how many validations with no improvements to wait
        learning_rate_drop_div: the divider of the learning rate after `learning_rate_drop_patience` unsuccessful
            validations
        load_before_drop: whether to load best model before dropping learning rate or not
        min_learning_rate: min value of learning rate if learning rate decay is used
        clip_norm: clip gradients by norm coefficient
        args: positional arguments passed to the parent class
        kwargs: dictionary with other model parameters

    Attributes:
        device: `cpu` or `cuda` device to use
        model: torch model
        epochs_done: number of epochs that were done
        optimizer: `torch.optim` instance
        learning_rate_drop_patience: how many validations with no improvements to wait
        learning_rate_drop_div: the divider of the learning rate after `learning_rate_drop_patience` unsuccessful
            validations
        load_before_drop: whether to load best model before dropping learning rate or not
        min_learning_rate: min value of learning rate if learning rate decay is used
        clip_norm: clip gradients by norm coefficient
    """

    def __init__(self, model: torch.nn.Module,
                 device: Union[torch.device, str] = "cuda",
                 optimizer: str = "AdamW",
                 optimizer_parameters: Optional[dict] = None,
                 learning_rate_drop_patience: Optional[int] = None,
                 learning_rate_drop_div: Optional[float] = None,
                 load_before_drop: bool = True,
                 min_learning_rate: float = 1e-07,
                 clip_norm: Optional[float] = None,
                 *args, **kwargs):

        super().__init__(*args, **kwargs)
        self.model = model
        self.device = self._init_device(device)
        self.model.to(self.device)
        if self.device.type == "cuda" and torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)
        if optimizer_parameters is None:
            optimizer_parameters = {"lr": 0.01}
        self.optimizer = getattr(torch.optim, optimizer)(self.model.parameters(), **optimizer_parameters)
        self.epochs_done = 0
        self.learning_rate_drop_patience = learning_rate_drop_patience
        self.learning_rate_drop_div = learning_rate_drop_div
        self.load_before_drop = load_before_drop
        self.min_learning_rate = min_learning_rate
        self.clip_norm = clip_norm

        self.load()
        # we need to switch to eval mode here because by default it's in `train` mode.
        # But in case of `interact/build_model` usage, we need to have model in eval mode.
        self.model.eval()
        log.debug(f"Model was successfully initialized! Model summary:\n {self.model}")

    def _init_device(self, device: Union[torch.device, str]) -> torch.device:
        """Convert ``device`` to ``torch.device``, falling back to CPU if CUDA is requested but unavailable.

        Args:
            device: device name (``"gpu"`` is accepted as an alias of ``"cuda"``) or ``torch.device`` instance

        Returns:
            device to place the model on
        """
        if device == "gpu":
            device = "cuda"
        if isinstance(device, str):
            device = torch.device(device)
        if device.type == "cuda" and not torch.cuda.is_available():
            log.warning(f"Unable to place component {self.__class__.__name__} on GPU, "
                        "since no CUDA GPUs are available. Using CPU.")
            device = torch.device('cpu')
        return device

    @property
    def is_data_parallel(self) -> bool:
        """Whether ``self.model`` is wrapped into ``torch.nn.DataParallel``."""
        return isinstance(self.model, torch.nn.DataParallel)

    def load(self, fname: Optional[Union[str, Path]] = None, *args, **kwargs) -> None:
        """Load model from `fname` (if `fname` is not given, use `self.load_path`) to `self.model` along with
            the optimizer `self.optimizer`.
            If `fname` (if `fname` is not given, use `self.load_path`) does not exist, initialize model from scratch.

        Args:
            fname: path to checkpoint, its suffix is replaced with ``.pth.tar``
            *args:
            **kwargs:

        Returns:
            None

        Raises:
            ConfigError: if the parent directory of the load path does not exist
        """
        if fname is not None:
            self.load_path = fname

        if self.load_path:
            log.debug(f"Load path {self.load_path} is given.")
            # the path may be given as a plain string, which has neither `parent` nor `resolve`
            load_path = Path(self.load_path)
            if not load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = load_path.resolve().with_suffix(".pth.tar")
            if weights_path.exists():
                log.debug(f"Load path {weights_path} exists.")
                log.debug(f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights, optimizer from saved
                log.debug(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                model_state = checkpoint["model_state_dict"]
                optimizer_state = checkpoint["optimizer_state_dict"]

                # load a multi-gpu model on a single device
                if all(key.startswith("module.") for key in model_state):
                    model_state = {key.replace("module.", "", 1): val for key, val in model_state.items()}

                if self.is_data_parallel:
                    self.model.module.load_state_dict(model_state)
                else:
                    self.model.load_state_dict(model_state)
                try:  # TODO: remove this try-except after hf models deep update
                    self.optimizer.load_state_dict(optimizer_state)
                except ValueError as e:
                    log.error(f'Failed to load optimizer state due to {repr(e)}')
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                log.warning(f"Init from scratch. Load path {weights_path} does not exist.")
        else:
            log.warning(f"Init from scratch. Load path {self.load_path} is not provided.")
        self.model.to(self.device)

    def save(self, fname: Optional[Union[str, Path]] = None, *args, **kwargs) -> None:
        """Save torch model to `fname` (if `fname` is not given, use `self.save_path`). Checkpoint includes
            `model_state_dict`, `optimizer_state_dict`, and `epochs_done` (number of training epochs).

        Args:
            fname: path to checkpoint, its suffix is replaced with ``.pth.tar``
            *args:
            **kwargs:

        Returns:
            None

        Raises:
            ConfigError: if no save path is available or its parent directory does not exist
        """
        if fname is None:
            fname = self.save_path

        if not fname:
            raise ConfigError("Provided save path is incorrect!")
        # convert before touching `parent`: the path may be given as a plain string
        fname = Path(fname)
        if not fname.parent.is_dir():
            raise ConfigError("Provided save path is incorrect!")

        weights_path = fname.with_suffix(".pth.tar")
        log.info(f"Saving model to {weights_path}.")
        # move the model to `cpu` before saving to provide consistency
        if self.is_data_parallel:
            model_state_dict = self.model.module.cpu().state_dict()
        else:
            model_state_dict = self.model.cpu().state_dict()
        torch.save({
            "model_state_dict": model_state_dict,
            "optimizer_state_dict": self.optimizer.state_dict(),
            "epochs_done": self.epochs_done
        }, weights_path)
        # return it back to device (necessary if it was on `cuda`)
        self.model.to(self.device)

    def process_event(self, event_name: str, data: dict) -> None:
        """Process event. After epoch, increase `self.epochs_done`. After validation, decrease learning rate in
            `self.learning_rate_drop_div` times (not lower than `self.min_learning_rate`)
            if given `self.learning_rate_drop_patience`.

        Args:
            event_name: name of the event. Events handled here: ``"after_epoch"``, ``"after_validation"``,
                all other events are ignored
            data: event data (dictionary), ``'impatience'`` value is read for the ``"after_validation"`` event

        Returns:
            None
        """
        if event_name == "after_epoch":
            self.epochs_done += 1

        if event_name == "after_validation" and 'impatience' in data and self.learning_rate_drop_patience:
            if data['impatience'] == self.learning_rate_drop_patience:
                log.info(f"----------Current LR is decreased in {self.learning_rate_drop_div} times----------")
                if self.load_before_drop:
                    self.load(self.save_path)
                    self.model.eval()
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = max(param_group['lr'] / self.learning_rate_drop_div, self.min_learning_rate)

    @abstractmethod
    def train_on_batch(self, x: list, y: list):
        """Run a training step on a batch; has to be implemented by every concrete model."""

    def _make_step(self, loss: torch.Tensor) -> None:
        """Backpropagate ``loss``, clip gradients by norm if ``self.clip_norm`` is set and update the weights.

        Gradients are not zeroed here.
        """
        loss.backward()
        if self.clip_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)
        self.optimizer.step()
@register('fit_trainer')
class FitTrainer:
    """
    Trainer class for fitting and evaluating :class:`Estimators <deeppavlov.core.models.estimator.Estimator>`

    Args:
        chainer_config: ``"chainer"`` block of a configuration file
        batch_size: batch_size to use for partial fitting (if available) and evaluation,
            the whole dataset is used if ``batch_size`` is negative or zero (default is ``-1``)
        metrics: iterable of metrics where each metric can be a registered metric name or a dict of ``name`` and
            ``inputs`` where ``name`` is a registered metric name and ``inputs`` is a collection of parameter names
            from chainer’s inner memory that will be passed to the metric function;
            default value for ``inputs`` parameter is a concatenation of chainer’s ``in_y`` and ``out`` fields
            (default is ``('accuracy',)``)
        evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``)
        show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch
            in evaluation logs (default is ``False``)
        max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative
            (default is ``-1``)
        **kwargs: additional parameters whose names will be logged but otherwise ignored
    """

    def __init__(self, chainer_config: dict, *, batch_size: int = -1,
                 metrics: Iterable[Union[str, dict]] = ('accuracy',),
                 evaluation_targets: Iterable[str] = ('valid', 'test'),
                 show_examples: bool = False,
                 max_test_batches: int = -1,
                 **kwargs) -> None:
        if kwargs:
            log.warning(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:')
        self.chainer_config = chainer_config
        self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
        self.batch_size = batch_size
        self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params)
        self.evaluation_targets = tuple(evaluation_targets)
        self.show_examples = show_examples

        # ``None`` makes ``islice`` in ``test`` consume all batches
        self.max_test_batches = None if max_test_batches < 0 else max_test_batches

        self._built = False
        self._saved = False
        self._loaded = False

    def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator]) -> None:
        """
        Build the pipeline :class:`~deeppavlov.core.common.chainer.Chainer` and successively fit
        :class:`Estimator <deeppavlov.core.models.estimator.Estimator>` components using a provided data iterator

        Raises:
            RuntimeError: if the chainer was already built by this trainer
        """
        if self._built:
            raise RuntimeError('Cannot fit already built chainer')
        for component_index, component_config in enumerate(self.chainer_config['pipe'], 1):
            component = from_params(component_config, mode='train')
            if 'fit_on' in component_config:
                component: Estimator

                targets = component_config['fit_on']
                if isinstance(targets, str):
                    targets = [targets]

                if self.batch_size > 0 and callable(getattr(component, 'partial_fit', None)):
                    for x, y in tqdm(iterator.gen_batches(self.batch_size, shuffle=False)):
                        preprocessed = self._chainer.compute(x, y, targets=targets)
                        # for a single target the chainer returns the bare value instead of a list of
                        # values, it has to be wrapped before unpacking (same as for ``fit`` below)
                        if len(targets) == 1:
                            preprocessed = [preprocessed]
                        # noinspection PyUnresolvedReferences
                        component.partial_fit(*preprocessed)
                else:
                    preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets)
                    if len(targets) == 1:
                        preprocessed = [preprocessed]
                    component.fit(*preprocessed)

                component.save()

            if 'in' in component_config:
                c_in = component_config['in']
                c_out = component_config['out']
                in_y = component_config.get('in_y', None)
                main = component_config.get('main', False)
                self._chainer.append(component, c_in, c_out, in_y, main)
        self._built = True

    def _load(self) -> None:
        """Replace the current chainer with the one built for inference, once per trainer instance."""
        if not self._loaded:
            self._chainer.destroy()
            self._chainer = build_model({'chainer': self.chainer_config}, load_trained=self._saved)
            self._loaded = True

    def get_chainer(self) -> Chainer:
        """Returns a :class:`~deeppavlov.core.common.chainer.Chainer` built from ``self.chainer_config`` for inference"""
        self._load()
        return self._chainer

    def train(self, iterator: Union[DataFittingIterator, DataLearningIterator]) -> None:
        """Calls :meth:`~fit_chainer` with provided data iterator as an argument"""
        self.fit_chainer(iterator)
        self._saved = True

    def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],
             metrics: Optional[Collection[Metric]] = None, *,
             start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:
        """
        Calculate metrics and return reports on provided data for currently stored
        :class:`~deeppavlov.core.common.chainer.Chainer`

        Args:
            data: iterable of batches of inputs and expected outputs
            metrics: collection of metrics namedtuples containing names for report, metric functions
                and their inputs names (if omitted, ``self.metrics`` is used)
            start_time: start time for test report
            show_examples: a flag used to return inputs, expected outputs and predicted outputs for the last batch
                in a result report (if omitted, ``self.show_examples`` is used)

        Returns:
            a report dict containing calculated metrics, spent time value, examples count in tested data
            and maybe examples
        """

        if start_time is None:
            start_time = time.time()
        if show_examples is None:
            show_examples = self.show_examples
        if metrics is None:
            metrics = self.metrics

        expected_outputs = list(set().union(self._chainer.out_params, *[m.inputs for m in metrics]))

        # keys are inserted in the ``expected_outputs`` order, which is relied upon when zipping below
        outputs = {out: [] for out in expected_outputs}
        examples = 0

        data = islice(data, self.max_test_batches)

        for x, y_true in tqdm(data):
            examples += len(x)
            y_predicted = list(self._chainer.compute(list(x), list(y_true), targets=expected_outputs))
            if len(expected_outputs) == 1:
                y_predicted = [y_predicted]
            for out, val in zip(outputs.values(), y_predicted):
                out += list(val)

        if examples == 0:
            log.warning('Got empty data iterable for scoring')
            return {'eval_examples_count': 0, 'metrics': None, 'time_spent': str(datetime.timedelta(seconds=0))}

        metrics_values = []
        for metric in metrics:
            calculate_metric = True
            for i in metric.inputs:
                outputs[i] = [k for k in outputs[i] if k is not None]
                if len(outputs[i]) == 0:
                    log.info(f'Metric {metric.alias} is not calculated due to absense of true and predicted samples')
                    calculate_metric = False
                    # placeholder reported instead of the metric value
                    value = -1
            if calculate_metric:
                value = metric.fn(*[outputs[i] for i in metric.inputs])
            metrics_values.append((metric.alias, value))

        report = {
            'eval_examples_count': examples,
            'metrics': prettify_metrics(metrics_values),
            'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
        }

        if show_examples:
            # ``x``, ``y_true`` and ``y_predicted`` still hold the values of the last processed batch
            y_predicted = zip(*[y_predicted_group
                                for out_name, y_predicted_group in zip(expected_outputs, y_predicted)
                                if out_name in self._chainer.out_params])
            if len(self._chainer.out_params) == 1:
                y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted]
            report['examples'] = [{
                'x': x_item,
                'y_predicted': y_predicted_item,
                'y_true': y_true_item
            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]

        return report

    def evaluate(self, iterator: DataLearningIterator, evaluation_targets: Optional[Iterable[str]] = None) -> Dict[str, dict]:
        """
        Run :meth:`test` on multiple data types using provided data iterator

        Args:
            iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation
            evaluation_targets: iterable of data types to evaluate on
                (if omitted, ``self.evaluation_targets`` is used)

        Returns:
            a dictionary with data types as keys and evaluation reports as values
        """
        self._load()
        if evaluation_targets is None:
            evaluation_targets = self.evaluation_targets

        res = {}

        for data_type in evaluation_targets:
            data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False)
            report = self.test(data_gen)
            res[data_type] = report
            report_log.info(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder))

        return res
@register('nn_trainer')
class NNTrainer(FitTrainer):
    """
    | Bases :class:`~deeppavlov.core.trainers.FitTrainer`
    | Trainer class for training and evaluating pipelines containing Estimators
      and an :class:`~deeppavlov.core.models.nn_model.NNModel`

    Args:
        chainer_config: ``"chainer"`` block of a configuration file
        batch_size: batch_size to use for partial fitting (if available) and evaluation,
            the whole dataset is used if ``batch_size`` is negative or zero (default is ``1``)
        epochs: maximum epochs number to train the pipeline, ignored if negative or zero (default is ``-1``)
        start_epoch_num: starting epoch number for reports (default is ``0``)
        max_batches: maximum batches number to train the pipeline, ignored if negative or zero (default is ``-1``)
        metrics: iterable of metrics where each metric can be a registered metric name or a dict of ``name`` and
            ``inputs`` where ``name`` is a registered metric name and ``inputs`` is a collection of parameter names
            from chainer’s inner memory that will be passed to the metric function;
            default value for ``inputs`` parameter is a concatenation of chainer’s ``in_y`` and ``out`` fields;
            the first metric is used for early stopping (default is ``('accuracy',)``)
        train_metrics: metrics calculated for train logs (if omitted, ``metrics`` argument is used)
        metric_optimization: one of ``'maximize'`` or ``'minimize'`` — strategy for metric optimization used in early
            stopping (default is ``'maximize'``)
        evaluation_targets: data types on which to evaluate a trained pipeline (default is ``('valid', 'test')``)
        show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch
            in evaluation logs (default is ``False``)
        tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None
            (default is ``None``)
        validate_first: flag used to calculate metrics on the ``'valid'`` data type before starting training
            (default is ``True``)
        validation_patience: how many times in a row the validation metric has to not improve for early stopping,
            ignored if negative or zero (default is ``5``)
        val_every_n_epochs: how often (in epochs) to validate the pipeline, ignored if negative or zero
            (default is ``-1``)
        val_every_n_batches: how often (in batches) to validate the pipeline, ignored if negative or zero
            (default is ``-1``)
        log_every_n_epochs: how often (in epochs) to calculate metrics on train data, ignored if negative or zero
            (default is ``-1``)
        log_every_n_batches: how often (in batches) to calculate metrics on train data, ignored if negative or zero
            (default is ``-1``)
        log_on_k_batches: count of random train batches to calculate metrics in log (default is ``1``)
        max_test_batches: maximum batches count for pipeline testing and evaluation, overrides ``log_on_k_batches``,
            ignored if negative (default is ``-1``)
        **kwargs: additional parameters whose names will be logged but otherwise ignored


    Trainer saves the model if it sees progress in scores. The full rules look like following:

    - For the validation savepoint:
        * 0-th validation (optional). Don't save model, establish a baseline.
        * 1-th validation.
             + If we have a baseline, save the model if we see an improvement, don't save otherwise.
             + If we don't have a baseline, save the model.
        * 2nd and later validations. Save the model if we see an improvement
    - For the at-train-exit savepoint:
        * Save the model if it happened before 1st validation (to capture early training results), don't save otherwise.

    """

    def __init__(self, chainer_config: dict, *,
                 batch_size: int = 1,
                 epochs: int = -1,
                 start_epoch_num: int = 0,
                 max_batches: int = -1,
                 metrics: Iterable[Union[str, dict]] = ('accuracy',),
                 train_metrics: Optional[Iterable[Union[str, dict]]] = None,
                 metric_optimization: str = 'maximize',
                 evaluation_targets: Iterable[str] = ('valid', 'test'),
                 show_examples: bool = False,
                 tensorboard_log_dir: Optional[Union[str, Path]] = None,
                 max_test_batches: int = -1,
                 validate_first: bool = True,
                 validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1,
                 log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1,
                 **kwargs) -> None:
        """Store the training schedule and set up the early-stopping comparator (see the class docstring)."""
        super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets,
                         show_examples=show_examples, max_test_batches=max_test_batches, **kwargs)
        if train_metrics is None:
            self.train_metrics = self.metrics
        else:
            self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params)

        metric_optimization = metric_optimization.strip().lower()
        self.score_best = None

        def _improved(op):
            # Wrap a comparison so that a missing score or a missing baseline never counts as an improvement
            return lambda score, baseline: False if baseline is None or score is None \
                else op(score, baseline)

        if metric_optimization == 'maximize':
            self.improved = _improved(lambda a, b: a > b)
        elif metric_optimization == 'minimize':
            self.improved = _improved(lambda a, b: a < b)
        else:
            raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

        self.validate_first = validate_first
        # Validation counter: 0 is reserved for the optional baseline validation that runs before training
        self.validation_number = 0 if validate_first else 1
        self.validation_patience = validation_patience
        self.val_every_n_epochs = val_every_n_epochs
        self.val_every_n_batches = val_every_n_batches
        self.log_every_n_epochs = log_every_n_epochs
        self.log_every_n_batches = log_every_n_batches
        # A negative value becomes None, which makes the ``islice`` call in ``_log`` take every train batch
        self.log_on_k_batches = log_on_k_batches if log_on_k_batches >= 0 else None

        self.max_epochs = epochs
        self.epoch = start_epoch_num
        self.max_batches = max_batches

        self.train_batches_seen = 0
        self.examples = 0
        self.patience = 0
        self.last_result = {}
        self.losses = []
        self.start_time: Optional[float] = None
        self.tb_writer = get_tb_writer(tensorboard_log_dir)

    def save(self) -> None:
        """Save the chainer.

        Raises:
            RuntimeError: if ``self._loaded`` is already set
        """
        if self._loaded:
            raise RuntimeError('Cannot save already finalized chainer')

        self._chainer.save()

    def _is_initial_validation(self) -> bool:
        """Return True while the baseline (0-th) validation has not been completed yet."""
        return self.validation_number == 0

    def _is_first_validation(self) -> bool:
        """Return True when the upcoming validation is number 1."""
        return self.validation_number == 1

    def _validate(self, iterator: DataLearningIterator,
                  tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
        """Evaluate on the ``'valid'`` data type, update patience and apply the model-saving rules.

        Args:
            iterator: data iterator that provides the ``'valid'`` batches
            tensorboard_tag: prefix for tensorboard tags; nothing is written to tensorboard when it is None
            tensorboard_index: global step for tensorboard records, ``self.train_batches_seen`` is used if None
        """
        self._send_event(event_name='before_validation')
        report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False),
                           start_time=self.start_time)

        report['epochs_done'] = self.epoch
        report['batches_seen'] = self.train_batches_seen
        report['train_examples_seen'] = self.examples

        metrics = list(report['metrics'].items())

        if tensorboard_tag is not None and self.tb_writer is not None:
            if tensorboard_index is None:
                tensorboard_index = self.train_batches_seen
            for name, score in metrics:
                self.tb_writer.write_valid(tag=f'{tensorboard_tag}/{name}', scalar_value=score,
                                           global_step=tensorboard_index)
            self.tb_writer.flush()

        # Only the first metric of the report drives early stopping and model saving
        m_name, score = metrics[0]

        # Update the patience
        if self.score_best is None:
            self.patience = 0
        else:
            if self.improved(score, self.score_best):
                self.patience = 0
            else:
                self.patience += 1

        # Run the validation model-saving logic
        if self._is_initial_validation():
            # Baseline validation: remember the score, do not save
            log.info('Initial best {} of {}'.format(m_name, score))
            self.score_best = score
        elif self._is_first_validation() and self.score_best is None:
            # No baseline is available, so the first validated state is saved unconditionally
            log.info('First best {} of {}'.format(m_name, score))
            self.score_best = score
            log.info('Saving model')
            self.save()
        elif self.improved(score, self.score_best):
            log.info(f'Improved best {m_name} from {self.score_best} to {score}')
            self.score_best = score
            log.info('Saving model')
            self.save()
        else:
            log.info('Did not improve on the {} of {}'.format(m_name, self.score_best))

        report['impatience'] = self.patience
        if self.validation_patience > 0:
            report['patience_limit'] = self.validation_patience

        self._send_event(event_name='after_validation', data=report)
        report = {'valid': report}
        report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))
        self.validation_number += 1

    def _log(self, iterator: DataLearningIterator,
             tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
        """Build and emit a train report: train metrics, last training results and the averaged loss.

        Args:
            iterator: data iterator that provides shuffled ``'train'`` batches for metric calculation
            tensorboard_tag: prefix for tensorboard tags
            tensorboard_index: global step for tensorboard records
        """
        self._send_event(event_name='before_log')
        if self.log_on_k_batches == 0:
            # No batches requested: the report carries only the elapsed time
            report = {
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5)))
            }
        else:
            data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True),
                          self.log_on_k_batches)
            report = self.test(data, self.train_metrics, start_time=self.start_time)

        report.update({
            'epochs_done': self.epoch,
            'batches_seen': self.train_batches_seen,
            'train_examples_seen': self.examples
        })

        metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(self.last_result.items())

        report.update(self.last_result)
        if self.losses:
            # Average the losses accumulated since the previous log call, then start a new window
            report['loss'] = sum(self.losses) / len(self.losses)
            self.losses.clear()
            metrics.append(('loss', report['loss']))

        if metrics and self.tb_writer is not None:
            for name, score in metrics:
                self.tb_writer.write_train(tag=f'{tensorboard_tag}/{name}', scalar_value=score,
                                           global_step=tensorboard_index)
            self.tb_writer.flush()

        self._send_event(event_name='after_train_log', data=report)

        report = {'train': report}
        report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))

    def _send_event(self, event_name: str, data: Optional[dict] = None) -> None:
        """Pass an event with the current training counters (updated with ``data``) to the chainer.

        Args:
            event_name: name of the event handed to ``self._chainer.process_event``
            data: extra report entries; they override the default counters on key collisions
        """
        # NOTE(review): reads ``self.start_time``, which stays None until ``train_on_batches`` sets it
        report = {
            'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))),
            'epochs_done': self.epoch,
            'batches_seen': self.train_batches_seen,
            'train_examples_seen': self.examples
        }
        if data is not None:
            report.update(data)
        self._chainer.process_event(event_name=event_name, data=report)

    def train_on_batches(self, iterator: DataLearningIterator) -> None:
        """Train pipeline on batches using provided data iterator and initialization parameters"""
        self.start_time = time.time()
        if self.validate_first:
            self._validate(iterator)

        while True:
            # Set when a per-batch stop condition fires, so that the epoch loop is left as well
            impatient = False
            self._send_event(event_name='before_train')
            for x, y_true in tqdm(iterator.gen_batches(self.batch_size, data_type='train')):
                self.last_result = self._chainer.train_on_batch(x, y_true)
                # Normalize the training result to a dict; a non-dict result is treated as the loss
                if self.last_result is None:
                    self.last_result = {}
                elif not isinstance(self.last_result, dict):
                    self.last_result = {'loss': self.last_result}
                if 'loss' in self.last_result:
                    self.losses.append(self.last_result.pop('loss'))

                self.train_batches_seen += 1
                self.examples += len(x)

                if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0:
                    self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)

                if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0:
                    self._validate(iterator,
                                   tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)

                self._send_event(event_name='after_batch')

                if 0 < self.max_batches <= self.train_batches_seen:
                    impatient = True
                    break

                if 0 < self.validation_patience <= self.patience:
                    log.info('Ran out of patience')
                    impatient = True
                    break

            if impatient:
                break

            self.epoch += 1

            if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0:
                self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)

            if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0:
                self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)

            self._send_event(event_name='after_epoch')

            if 0 < self.max_epochs <= self.epoch:
                break

            if 0 < self.validation_patience <= self.patience:
                log.info('Ran out of patience')
                break

    def train(self, iterator: DataLearningIterator) -> None:
        """Call :meth:`~fit_chainer` and then :meth:`~train_on_batches` with provided data iterator as an argument"""
        self.fit_chainer(iterator)
        if callable(getattr(self._chainer, 'train_on_batch', None)):
            try:
                self.train_on_batches(iterator)
            except KeyboardInterrupt:
                # A manual interruption stops training but still reaches the at-train-exit logic below
                log.info('Stopped training')
        else:
            log.warning(f'Using {self.__class__.__name__} for a pipeline without batched training')

        # Run the at-train-exit model-saving logic
        if self.validation_number < 1:
            log.info('Save model to capture early training results')
            self.save()
@register('torch_trainer')
class TorchTrainer(NNTrainer):
    """:class:`NNTrainer` that switches the main component's model between evaluation and training modes.

    The model is the ``model`` attribute of ``self._chainer.get_main_component()``. It is put into
    evaluation mode for the duration of :meth:`test` and into training mode for the duration of
    :meth:`train_on_batches`.
    """

    def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],
             metrics: Optional[Collection[Metric]] = None, *,
             start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:
        """Run the parent ``test`` with the model in evaluation mode, then restore the previous mode.

        Args:
            data: iterable of ``(x, y_true)`` batches
            metrics: metrics to calculate, forwarded to the parent implementation
            start_time: forwarded to the parent implementation
            show_examples: forwarded to the parent implementation

        Returns:
            the report produced by the parent ``test``
        """
        model = self._chainer.get_main_component().model
        # Remember the current mode so that testing an already finalized (eval-mode) model does not leave
        # it in training mode. Models without a ``training`` flag keep the historical behaviour of being
        # switched to training mode afterwards.
        was_training = getattr(model, 'training', True)
        model.eval()
        try:
            return super().test(data=data, metrics=metrics, start_time=start_time, show_examples=show_examples)
        finally:
            # Restore the mode even when the evaluation raised
            if was_training:
                model.train()
            else:
                model.eval()

    def train_on_batches(self, iterator: DataLearningIterator) -> None:
        """Run the parent training loop with the model in training mode and leave it in evaluation mode.

        Args:
            iterator: data iterator forwarded to the parent implementation
        """
        model = self._chainer.get_main_component().model
        model.train()
        try:
            super().train_on_batches(iterator=iterator)
        finally:
            # Also reached on KeyboardInterrupt, which the caller treats as a regular way to stop training
            model.eval()
# Parsed metric description: registered name, callable, names of chainer outputs fed to it, report alias
Metric = namedtuple('Metric', ['name', 'fn', 'inputs', 'alias'])


def parse_metrics(metrics: Iterable[Union[str, dict]], in_y: List[str], out_vars: List[str]) -> List[Metric]:
    """Turn metric descriptions from a config into :class:`Metric` tuples.

    Args:
        metrics: metric names or dicts with a ``name`` key and optional ``alias`` and ``inputs`` keys;
            any remaining dict items are bound to the metric function as keyword arguments
        in_y: names used, together with ``out_vars``, as the default ``inputs`` of a metric
        out_vars: names used, together with ``in_y``, as the default ``inputs`` of a metric

    Returns:
        list of parsed metrics in the order they were given
    """
    metrics_functions = []
    for metric in metrics:
        if isinstance(metric, str):
            metric = {'name': metric, 'alias': metric}
        else:
            # Work on a copy: the pops below must not strip keys from the caller's configuration,
            # otherwise parsing the same config a second time fails on the missing ``name``
            metric = dict(metric)

        metric_name = metric.pop('name')
        alias = metric.pop('alias', metric_name)

        f = get_metric_by_name(metric_name)

        inputs = metric.pop('inputs', in_y + out_vars)
        if isinstance(inputs, str):
            inputs = [inputs]

        # Whatever is left in the description is passed to the metric function as keyword arguments
        metrics_functions.append(Metric(metric_name, partial(f, **metric), inputs, alias))
    return metrics_functions


def prettify_metrics(metrics: List[Tuple[str, float]], precision: int = 4) -> OrderedDict:
    """Prettifies the dictionary of metrics.

    Args:
        metrics: ``(name, value)`` pairs
        precision: number of digits float values are rounded to

    Returns:
        ordered mapping from metric name to value; for a repeated name the last value wins
        and a warning is issued
    """
    import warnings

    prettified_metrics = OrderedDict()
    for key, value in metrics:
        if key in prettified_metrics:
            # Actually report the clash (building a Warning object without raising or warning is a no-op)
            warnings.warn("Multiple metrics with the same name {}.".format(key), stacklevel=2)
        if isinstance(value, float):
            value = round(value, precision)
        prettified_metrics[key] = value
    return prettified_metrics


class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that additionally serializes numpy arrays, numpy scalars and dataclasses."""

    def default(self, obj):
        """Convert ``obj`` to a JSON-serializable value or defer to :meth:`JSONEncoder.default`."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif is_dataclass(obj):
            return obj.__dict__
        return JSONEncoder.default(self, obj)
@register('basic_classification_iterator')
class BasicClassificationDatasetIterator(DataLearningIterator):
    """
    Class gets data dictionary from DatasetReader instance, merges fields if necessary, splits a field if necessary

    Args:
        data: dictionary of data with fields "train", "valid" and "test" (or some of them)
        fields_to_merge: list of fields (out of ``"train", "valid", "test"``) to merge
        merged_field: name of field (out of ``"train", "valid", "test"``) in which to save the merged fields
        field_to_split: name of field (out of ``"train", "valid", "test"``) to split
        split_fields: list of fields (out of ``"train", "valid", "test"``) in which to save the split field
        split_proportions: list of corresponding proportions for splitting
        seed: random seed for iterating
        shuffle: whether to shuffle examples in batches
        split_seed: random seed for splitting dataset, if ``split_seed`` is None, division is based on `seed`.
        stratify: whether to use stratified split
        shot: number of examples to sample for each class in training data. If None, all examples will remain in data.
        *args: arguments
        **kwargs: arguments

    Attributes:
        data: dictionary of data with fields "train", "valid" and "test" (or some of them)
    """

    def __init__(self, data: dict,
                 fields_to_merge: List[str] = None, merged_field: str = None,
                 field_to_split: str = None, split_fields: List[str] = None, split_proportions: List[float] = None,
                 seed: int = None, shuffle: bool = True, split_seed: int = None,
                 stratify: bool = None,
                 shot: int = None,
                 *args, **kwargs):
        """
        Initialize dataset using data from DatasetReader,
        merges and splits fields according to the given parameters.

        Raises:
            IOError: if ``fields_to_merge`` is given without ``merged_field`` or
                ``field_to_split`` is given without ``split_fields``
        """
        super().__init__(data, seed=seed, shuffle=shuffle)

        if fields_to_merge is not None:
            if merged_field is not None:
                log.info("Merging fields <<{}>> to new field <<{}>>".format(fields_to_merge,
                                                                            merged_field))
                self._merge_data(fields_to_merge=fields_to_merge,
                                 merged_field=merged_field)
            else:
                raise IOError("Given fields to merge BUT not given name of merged field")

        if field_to_split is not None:
            if split_fields is not None:
                log.info("Splitting field <<{}>> to new fields <<{}>>".format(field_to_split,
                                                                              split_fields))
                self._split_data(field_to_split=field_to_split,
                                 split_fields=split_fields,
                                 split_proportions=[float(s) for s in split_proportions],
                                 split_seed=split_seed,
                                 stratify=stratify)
            else:
                raise IOError("Given field to split BUT not given names of split fields")

        if shot is not None:
            train_data = self.data['train']
            self.random.shuffle(train_data)
            self.random.seed(seed)
            # Keep the first ``shot`` (already shuffled) texts of every label
            data_dict = defaultdict(list)
            for text, label in train_data:
                if len(data_dict[label]) < shot:
                    data_dict[label].append(text)

            # ``any`` instead of ``min``: an empty train set must not crash here
            if any(len(texts) < shot for texts in data_dict.values()):
                log.warning(f"Some labels have less than {shot} examples")

            self.data['train'] = [(text, label) for label in data_dict for text in data_dict[label]]

    def _split_data(self, field_to_split: str = None, split_fields: List[str] = None,
                    split_proportions: List[float] = None, split_seed: int = None, stratify: bool = None) -> bool:
        """
        Split given field of dataset to the given list of fields with corresponding proportions

        Args:
            field_to_split: field name (out of ``"train", "valid", "test"``) which to split
            split_fields: list of names (out of ``"train", "valid", "test"``) of fields to which split
            split_proportions: corresponding proportions
            split_seed: random seed for splitting dataset
            stratify: whether to use stratified split

        Returns:
            True
        """
        if split_seed is None:
            split_seed = self.random.randint(0, 10000)
        data_to_div = self.data[field_to_split].copy()
        data_size = len(self.data[field_to_split])

        for i in range(len(split_fields) - 1):
            # ``train_test_split`` stratifies on anything that is not None, so a disabled flag has to be
            # passed as None rather than False. The labels are rebuilt on every step because the
            # remaining data shrinks.
            labels = [sample[1] for sample in data_to_div] if stratify else None
            self.data[split_fields[i]], data_to_div = train_test_split(
                data_to_div,
                # Proportions refer to the size of the original field, not to the remaining part
                test_size=len(data_to_div) - int(data_size * split_proportions[i]),
                random_state=split_seed,
                stratify=labels)
        # Whatever is left goes to the last field
        self.data[split_fields[-1]] = data_to_div
        return True

    def _merge_data(self, fields_to_merge: List[str] = None, merged_field: str = None) -> bool:
        """
        Merge given fields of dataset

        Args:
            fields_to_merge: list of fields (out of ``"train", "valid", "test"``) to merge
            merged_field: name of field (out of ``"train", "valid", "test"``) in which to save the merged fields

        Returns:
            True
        """
        data = self.data.copy()
        # Built from the untouched ``self.data`` so that ``merged_field`` may itself be one of the sources
        data[merged_field] = [sample for name in fields_to_merge for sample in self.data[name]]
        self.data = data
        return True
@register('huggingface_dataset_iterator')
class HuggingFaceDatasetIterator(DataLearningIterator):
    """Dataset iterator for HuggingFace Datasets."""

    def preprocess(self, data: Dataset, features: Union[str, List[str]], label: str = 'label',
                   use_label_name: bool = True, *args, **kwargs) -> List[Tuple[Any, Any]]:
        """Build ``(features, label)`` pairs from a HuggingFace Dataset.

        Args:
            data: instance of HuggingFace Dataset
            features: name of a single Dataset field, or a list of field names that are
                extracted together as a tuple
            label: Dataset field name to be used as label.
            use_label_name: Use actual label name instead of its index (0, 1, ...). Defaults to True.

        Returns:
            List[Tuple[Any, Any]]: list of pairs of extracted features and labels

        Raises:
            RuntimeError: if ``features`` is neither a string nor a list
            Exception: if a listed feature cannot be read from an example
        """
        pairs = []
        for idx in range(len(data)):
            row = data[idx]

            if isinstance(features, str):
                extracted = row[features]
            elif not isinstance(features, list):
                raise RuntimeError(f"features should be str or list, but found: {features}")
            else:
                try:
                    extracted = tuple(row[name] for name in features)
                except Exception as e:
                    raise Exception(f"{e} for example {row} while trying to find keys {features}")

            target = row[label]
            # -1 marks an example without a label (test set), so there is no name to look up
            if use_label_name and target != -1:
                target = data.info.features[label].names[target]

            pairs.append((extracted, target))
        return pairs
@register('morphotagger_dataset_iterator')
class MorphoTaggerDatasetIterator(DataLearningIterator):
    """
    Iterates over data for Morphological Tagging.
    A subclass of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`.

    Args:
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        validation_split: the fraction of validation data
            (is used only if there is no `valid` subset in `data`)
    """

    def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None,
                 shuffle: bool = True, validation_split: float = 0.2) -> None:
        # Set before the base constructor runs, presumably because it invokes ``split`` — TODO confirm
        self.validation_split = validation_split
        super().__init__(data, seed, shuffle)

    def split(self, *args, **kwargs) -> None:
        """
        Splits the `train` part to `train` and `valid`, if no `valid` part is specified.
        The last ``validation_split`` fraction of `train` (shuffled first when ``self.shuffle`` is set)
        becomes `valid`. Nothing is changed when `valid` is not empty.
        """
        if len(self.valid) == 0:
            if self.shuffle:
                self.random.shuffle(self.train)
            train_size = int(len(self.train) * (1.0 - self.validation_split))
            self.train, self.valid = self.train[:train_size], self.train[train_size:]

    def gen_batches(self, batch_size: int, data_type: str = 'train',
                    shuffle: bool = None, return_indexes: bool = False) -> Iterator[tuple]:
        """Generate batches of inputs and expected output to train neural networks

        Samples are ordered by the length of their input, so every batch holds samples of similar length;
        shuffling changes only the order in which the batches are produced.

        Args:
            batch_size: number of samples in batch, the whole dataset forms one batch
                if ``batch_size`` is negative or zero
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle dataset before batching
            return_indexes: whether to return indexes of batch elements in initial dataset

        Yields:
            a tuple of a batch of inputs and a batch of expected outputs.
            If `return_indexes` is True, also yields indexes of batch elements.
        """
        if shuffle is None:
            shuffle = self.shuffle
        data = self.data[data_type]
        lengths = [len(x[0]) for x in data]
        indexes = np.argsort(lengths)
        n_samples = len(data)
        if batch_size <= 0:
            # ``max`` keeps the range step non-zero for an empty dataset, which then yields no batches
            batch_size = max(n_samples, 1)
        starts = list(range(0, n_samples, batch_size))
        if shuffle:
            self.random.shuffle(starts)
        for start in starts:
            indexes_to_yield = indexes[start:start + batch_size]
            # Transpose the selected (x, y) samples into a tuple of per-field lists
            data_to_yield = tuple(list(x) for x in zip(*[data[i] for i in indexes_to_yield]))
            if return_indexes:
                yield indexes_to_yield, data_to_yield
            else:
                yield data_to_yield
# See the License for the specific language governing permissions and # limitations under the License. import copy import math import random from logging import getLogger from typing import Iterator, Optional, Tuple, Union import numpy as np from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.params import from_params from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator log = getLogger(__name__) @register('multitask_iterator') class MultiTaskIterator: """ Class merges data from several dataset iterators. When used for batch generation batches from merged dataset iterators are united into one batch. If sizes of merged datasets are different smaller datasets are repeated until their size becomes equal to the largest dataset. Args: data: dictionary which keys are task names and values are dictionaries with fields ``"train", "valid", "test"``. num_train_epochs: number of training epochs tasks: dictionary which keys are task names and values are init params of dataset iterators. If task has key-value pair ``'use_task_defaults': False`` task_defaults for this task dataset iterator will be ignored. batch_size: batch_size sampling_mode: mode of sampling we use. It can be plain, uniform or anneal. gradient_accumulation_steps: number of gradient accumulation steps. Default is 1 steps_per_epoch: number of steps per epoch. Nesessary if gradient_accumulation_steps > 1 iterator_class_name: name of iterator class. use_label_name, seed, features - parameters for the iterator class one_element_tuples: if True, tuple of x consisting of one element is returned in this element. Default: True task_defaults: default task parameters. 
seed - random seed for sampling Attributes: data: dictionary of data with fields "train", "valid" and "test" (or some of them) """ def __init__( self, data: dict, num_train_epochs: int, tasks: dict, batch_size: int = 8, sampling_mode: str = 'plain', gradient_accumulation_steps: int = 1, steps_per_epoch: int = 0, one_element_tuples: bool = True, task_defaults: dict = None, seed: int = 42, **kwargs ): if data.keys() != tasks.keys(): raise ConfigError("Task names from dataset reader don't mach task names from dataset iterator: " f"{data.keys()} != {tasks.keys()}.") self.task_iterators = {} if task_defaults is None: task_defaults = dict() for task_name, task_params in tasks.items(): if task_params.pop('use_task_defaults', True) is True: task_config = copy.deepcopy(task_defaults) task_config.update(task_params) else: task_config = task_params try: self.task_iterators[task_name] = from_params(task_config, data=data[task_name]) except Exception as e: log.error(f'Failed to initialize dataset_iterator for "{task_name}" task. 
Make sure that all parameters' 'from `task_defaults` and task parameters are correct.') raise e self.n_tasks = len(tasks.keys()) self.num_train_epochs = num_train_epochs self.steps_per_epoch = steps_per_epoch self.gradient_accumulation_steps = gradient_accumulation_steps self.epochs_done = 0 self.steps_taken = 0 self.task_id = None self.sampling_mode = sampling_mode self.data = { "train": self._extract_data_type("train"), "valid": self._extract_data_type("valid"), "test": self._extract_data_type("test"), } for mode in ["train", "valid", "test"]: log.info(f'For {mode}') for task_name in self.data[mode]: log.info(f'{task_name} has {len(self.data[mode][task_name])} examples') self.train_sizes = self._get_data_size("train") if steps_per_epoch == 0: self.steps_per_epoch = sum(self.train_sizes) // batch_size else: self.steps_per_epoch = steps_per_epoch def is_nan(a): return a != a for mode in ['train', 'valid', 'test']: for task in self.data[mode]: for i in range(len(self.data[mode][task]) - 1, -1, -1): x = self.data[mode][task][i][0] y = self.data[mode][task][i][1] if is_nan(x) or any([is_nan(z) for z in x]) or is_nan(y): log.info(f'NAN detected {self.data[mode][task][i - 1:i]}') del self.data[mode][task][i] log.info(f'NAN for mode {mode} task {task} element {i} CLEARED') elif isinstance(x, tuple) and len(x) == 1 and one_element_tuples: # x is a tuple consisting of 1 element. 
def _get_data_size(self, data_type):
    """Return the number of examples of every task for ``data_type`` (train, valid or test)."""
    per_task = self.data[data_type]
    return [len(examples) for examples in per_task.values()]


def _get_probs(self, data_type):
    """Return the task sampling probabilities for the configured sampling mode.

    ``uniform`` gives every task the same weight, ``plain`` weights a task by its number of examples
    and ``anneal`` raises that number to a power that decreases from 1.0 as epochs are done.

    Raises:
        ValueError: if ``self.sampling_mode`` is not one of the three modes above.
    """
    mode = self.sampling_mode
    if mode not in ('uniform', 'plain', 'anneal'):
        raise ValueError(f'Unsupported sampling mode {self.sampling_mode}')

    sizes = self._get_data_size(data_type)
    if mode == 'uniform':
        # as we sample uniformly
        weights = [1 for _ in sizes]
    elif mode == 'plain':
        weights = sizes
    else:
        alpha = 1.0 - 0.8 * (self.epochs_done / self.num_train_epochs)
        weights = [size ** alpha for size in sizes]

    total = sum(weights)
    return [weight / total for weight in weights]
def _extract_data_type(self, data_type):
    """Collect the ``data_type`` part (e.g. train) of every task iterator into one dict keyed by task name."""
    return {task: getattr(iterator, data_type) for task, iterator in self.task_iterators.items()}


def _transform_before_yielding(self, x, y, batch_size):
    """Regroup per-task batches into per-example tuples before yielding.

    ``x`` and ``y`` hold one sequence per task. The result is a pair of tuples of length ``batch_size``
    whose i-th element is the tuple of the i-th values of all tasks, or the bare value if there is
    only one task.

    Raises:
        Exception: if ``x`` and ``y`` have different lengths.
    """
    if len(x) != len(y):
        raise Exception(f'x has len {len(x)} but y has len {len(y)}')

    n_tasks = self.n_tasks

    def regroup(per_task):
        rows = []
        for position in range(batch_size):
            if n_tasks == 1:
                rows.append(per_task[0][position])
            else:
                rows.append(tuple(per_task[task][position] for task in range(n_tasks)))
        return tuple(rows)

    return regroup(x), regroup(y)


def gen_batches(self, batch_size: int, data_type: str = "train",
                shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]:
    """Generate batches of inputs and expected outputs.

    For ``data_type == "train"`` one task is drawn per step and only its slot of the batch is filled;
    the slots of the other tasks hold ``None``. Steps on which the drawn task returns only ``None``
    values are skipped. For the other data types every step takes one example from every task.

    Args:
        batch_size: number of samples in batch; it is divided by the number of gradient accumulation steps
        data_type: can be either 'train', 'test', or 'valid'
        shuffle: whether to shuffle dataset before batching

    Yields:
        A tuple of a batch of inputs and a batch of expected outputs. Inputs and outputs are tuples.
        An element of inputs or outputs is a tuple of the values of the merged tasks, in the order of
        the tasks in the ``tasks`` argument of ``__init__``.
    """
    longest_split = self.max_task_data_len[data_type]
    log.info(f'Batch size {batch_size} with gradient accumulation steps {self.gradient_accumulation_steps}')
    batch_size //= self.gradient_accumulation_steps
    log.info(f'Efficient batch size {batch_size}')

    if data_type != "train":
        eval_batch_size = 1
        x = [[None] * eval_batch_size for _ in range(self.n_tasks)]
        y = [[None] * eval_batch_size for _ in range(self.n_tasks)]
        generators = [
            SingleTaskBatchGenerator(iter_, batch_size=eval_batch_size, data_type=data_type, shuffle=shuffle)
            for iter_ in self.task_iterators.values()
        ]
        for _ in range(longest_split):
            for task_id in range(self.n_tasks):
                x[task_id], y[task_id] = next(generators[task_id])
            yield self._transform_before_yielding(x, y, eval_batch_size)
        return

    generators = [
        SingleTaskBatchGenerator(iter_, batch_size, data_type, shuffle)
        for iter_ in self.task_iterators.values()
    ]
    # probs only required while training
    probs = self._get_probs("train")
    for _ in range(self.steps_per_epoch):
        # NOTE(review): ``steps_taken`` is only ever decremented in the code visible here;
        # confirm where it is supposed to be incremented.
        accumulation_done = (self.steps_taken + 1) % self.gradient_accumulation_steps == 0
        if accumulation_done or self.task_id is None:
            self.task_id = np.random.choice(self.n_tasks, p=probs)
        chosen = self.task_id
        x = [[None] * batch_size for _ in range(self.n_tasks)]
        y = [[None] * batch_size for _ in range(self.n_tasks)]
        x[chosen], y[chosen] = next(generators[chosen])
        if any(sample is not None for sample in x[chosen]):
            yield self._transform_before_yielding(x, y, batch_size)

    self.epochs_done += 1
    # one additional step is taken while logging training metrics
    self.steps_taken -= 1
def get_instances(self, data_type: str = "train"):
    """Return a tuple of inputs and outputs from all datasets.

    The lengths of inputs and outputs are equal to the size of the largest dataset. Smaller datasets
    are repeated cyclically until they reach that size; a dataset without examples is padded with Nones.

    Args:
        data_type: can be either 'train', 'test', or 'valid'

    Returns:
        A tuple of all inputs for a data type and all expected outputs for a data type.
    """
    per_task = []
    for iter_ in self.task_iterators.values():
        instances = iter_.get_instances(data_type)
        # An iterator without examples may return an empty tuple instead of an (x, y) pair.
        x, y = instances if instances else ((), ())
        # Copy into fresh lists: the repetition below must not touch data owned by the task iterator.
        per_task.append((list(x), list(y)))

    max_task_data_len = max((len(x) for x, _ in per_task), default=0)

    x_instances = []
    y_instances = []
    for x, y in per_task:
        if x:
            n_repeats = math.ceil(max_task_data_len / len(x))
            x, y = x * n_repeats, y * n_repeats
        else:
            x, y = [None] * max_task_data_len, [None] * max_task_data_len
        x_instances.append(x[:max_task_data_len])
        y_instances.append(y[:max_task_data_len])

    return tuple(zip(*x_instances)), tuple(zip(*y_instances))


class SingleTaskBatchGenerator:
    """
    Batch generator for a single task.

    Batches are assembled from smaller batches of size ``gcd(dataset size, batch_size)`` drawn from the
    dataset iterator. If there are not enough elements left in the dataset to form another batch,
    a batch of Nones is returned.

    Args:
        dataset_iterator: dataset iterator from which batches are drawn.
        batch_size: size of the batch.
        data_type: "train", "valid", or "test"
        shuffle: whether dataset will be shuffled.
        n_batches: the number of batches that will be generated; ``None`` means no limit.
        size_of_last_batch: size to which the ``n_batches``-th batch is cut; defaults to ``batch_size``.
    """

    def __init__(
            self,
            dataset_iterator: "DataLearningIterator",
            batch_size: int,
            data_type: str,
            shuffle: bool,
            n_batches: Optional[int] = None,
            size_of_last_batch: Optional[int] = None,
    ):
        self.dataset_iterator = dataset_iterator
        self.batch_size = batch_size
        self.data_type = data_type
        self.shuffle = shuffle
        self.n_batches = n_batches
        self.size_of_last_batch = (
            self.batch_size if size_of_last_batch is None else size_of_last_batch)

        # The inner batch size divides both the dataset size and the requested batch size.
        self.inner_batch_size = math.gcd(
            len(self.dataset_iterator.data[data_type]), batch_size
        )
        self.gen = self.dataset_iterator.gen_batches(
            self.inner_batch_size, self.data_type, self.shuffle
        )
        self.batch_count = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch, or a batch of Nones when the dataset is exhausted.

        Raises:
            StopIteration: after ``n_batches`` batches have been returned (if ``n_batches`` is set).
        """
        # ``>=``: exactly ``n_batches`` batches are produced, the last one being the truncated one.
        if self.n_batches is not None and self.batch_count >= self.n_batches:
            raise StopIteration
        x, y = (), ()
        while len(x) < self.batch_size or len(y) < self.batch_size:
            try:
                xx, yy = next(self.gen)
            except StopIteration:
                # Not enough data for a full batch: what was collected is dropped and Nones are returned.
                nones = tuple(None for _ in range(self.batch_size))
                return nones, nones
            x += tuple(xx)
            y += tuple(yy)
        self.batch_count += 1
        if self.batch_count == self.n_batches:
            x = x[:self.size_of_last_batch]
            y = y[:self.size_of_last_batch]
        return x, y
@register('siamese_iterator')
class SiameseIterator(DataLearningIterator):
    """Iterator over a ranking dataset in training, validation and test mode."""

    def split(self, *args, len_valid=1000, len_test=1000, **kwargs) -> None:
        """Carve missing validation and test parts out of the train part.

        For each of ``valid`` and ``test`` (in this order) that is empty and whose requested length
        is not 0, the train part is shuffled in place and its last ``len_valid`` / ``len_test``
        examples are moved to that part.

        Args:
            len_valid: number of examples to move to the validation part
            len_test: number of examples to move to the test part
        """
        for part, size in (('valid', len_valid), ('test', len_test)):
            if len(getattr(self, part)) != 0 or size == 0:
                continue
            self.random.shuffle(self.train)
            held_out = self.train[-size:]
            remaining = self.train[:-size]
            setattr(self, part, held_out)
            self.train = remaining
@register('sqlite_iterator')
class SQLiteDataIterator(DataFittingIterator):
    """Iterate over SQLite database.
    Gen batches from SQLite data.
    Get document ids and document.

    Args:
        load_path: a path to local DB file
        batch_size: a number of samples in a single batch
        shuffle: whether to shuffle data during batching
        seed: random seed for data shuffling

    Attributes:
        connect: a DB connection
        db_name: name of the table the documents are read from
        doc_ids: DB document ids
        doc2index: a dictionary mapping document ids to integer indices
        batch_size: a number of samples in a single batch
        shuffle: whether to shuffle data during batching
        random: an instance of :class:`Random` class.

    """

    def __init__(self, load_path: Union[str, Path], batch_size: Optional[int] = None,
                 shuffle: Optional[bool] = None, seed: Optional[int] = None, **kwargs) -> None:

        load_path = str(expand_path(load_path))
        logger.info("Connecting to database, path: {}".format(load_path))
        try:
            self.connect = sqlite3.connect(load_path, check_same_thread=False)
        except sqlite3.OperationalError as e:
            e.args = e.args + ("Check that DB path exists and is a valid DB file",)
            raise
        try:
            self.db_name = self.get_db_name()
        except TypeError as e:
            e.args = e.args + (
                'Check that DB path was created correctly and is not empty. '
                'Check that a correct dataset_format is passed to the ODQAReader config',)
            raise
        self.doc_ids = self.get_doc_ids()
        self.doc2index = self.map_doc2idx()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random = Random(seed)

    def _quoted_table(self) -> str:
        """Return the table name as a double-quoted SQL identifier (embedded quotes are doubled)."""
        return '"{}"'.format(str(self.db_name).replace('"', '""'))

    def get_doc_ids(self) -> List[Any]:
        """Get document ids.

        Returns:
            document ids

        """
        cursor = self.connect.cursor()
        try:
            cursor.execute('SELECT id FROM {}'.format(self._quoted_table()))
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()

    def get_db_name(self) -> str:
        """Get DB name.

        Returns:
            name of the first table listed in ``sqlite_master``

        Raises:
            TypeError: if the database contains no tables.

        """
        cursor = self.connect.cursor()
        try:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            row = cursor.fetchone()
        finally:
            cursor.close()
        if row is None:
            # ``__init__`` handles TypeError to report an empty or wrongly created database.
            raise TypeError('The database contains no tables')
        return row[0]

    def map_doc2idx(self) -> Dict[int, Any]:
        """Map DB ids to integer ids.

        Returns:
            a dictionary of document ids and correspondent integer indices

        """
        doc2idx = {doc_id: i for i, doc_id in enumerate(self.doc_ids)}
        logger.info(
            "SQLite iterator: The size of the database is {} documents".format(len(doc2idx)))
        return doc2idx

    def get_doc_content(self, doc_id: Any) -> Optional[str]:
        """Get document content by id.

        Args:
            doc_id: a document id

        Returns:
            document content, or ``None`` if there is no document with this id

        """
        cursor = self.connect.cursor()
        try:
            cursor.execute(
                "SELECT text FROM {} WHERE id = ?".format(self._quoted_table()),
                (doc_id,)
            )
            result = cursor.fetchone()
        finally:
            cursor.close()
        return result if result is None else result[0]

    def gen_batches(self, batch_size: int, shuffle: bool = None) \
            -> Generator[Tuple[List[str], List[int]], Any, None]:
        """Gen batches of documents.

        Args:
            batch_size: a number of samples in a single batch; a non-positive value
                puts all documents into one batch
            shuffle: whether to shuffle data during batching; ``None`` means ``self.shuffle``

        Yields:
            a tuple of a list of documents and a one-shot ``zip`` iterator over
            ``(document id, document index)`` pairs

        """
        if shuffle is None:
            shuffle = self.shuffle

        if shuffle:
            _doc_ids = self.random.sample(self.doc_ids, len(self.doc_ids))
        else:
            _doc_ids = self.doc_ids

        if batch_size > 0:
            batches = [_doc_ids[i:i + batch_size] for i in
                       range(0, len(_doc_ids), batch_size)]
        else:
            batches = [_doc_ids]

        for doc_ids in batches:
            docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]
            doc_nums = [self.doc2index[doc_id] for doc_id in doc_ids]
            yield docs, zip(doc_ids, doc_nums)

    def get_instances(self):
        """Get all data"""
        doc_ids = list(self.doc_ids)
        docs = [self.get_doc_content(doc_id) for doc_id in doc_ids]
        doc_nums = [self.doc2index[doc_id] for doc_id in doc_ids]
        return docs, zip(doc_ids, doc_nums)
@register('squad_iterator')
class SquadIterator(DataLearningIterator):
    """SquadIterator allows to iterate over examples in SQuAD-like datasets.
    SquadIterator is used to train
    :class:`~deeppavlov.models.torch_bert.torch_transformers_squad:TorchTransformersSquad`.

    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.
    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``

    Attributes:
        train: train examples
        valid: validation examples
        test: test examples

    """

    def preprocess(self, data: Dict[str, Any], *args, **kwargs) -> \
            List[Tuple[Tuple[str, str], Tuple[List[str], List[int]]]]:
        """Flatten SQuAD-formatted data into a list of examples.

        Args:
            data: data in squad format

        Returns:
            list of ``((context, question), (answer_text, answer_start))``;
            ``answer_text`` and ``answer_start`` are lists, equal to ``['']`` and ``[-1]``
            for a question without answers

        """
        if not data:
            return []

        examples = []
        for article in data['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question']
                    if qa['answers']:
                        pairs = [(answer['text'], answer['answer_start']) for answer in qa['answers']]
                        texts = [text for text, _ in pairs]
                        starts = [start for _, start in pairs]
                    else:
                        texts, starts = [''], [-1]
                    examples.append(((context, question), (texts, starts)))
        return examples
@register('multi_squad_iterator')
class MultiSquadIterator(DataLearningIterator):
    """Dataset iterator for multiparagraph-SQuAD dataset.

    With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context
    from the same article, but without an answer. Contexts without an answer are sampled according to
    their tfidf scores (tfidf score between question and context).

    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.
    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. If there is
    no answer in context, then ``answer_text`` is empty string and `answer_start` is equal to -1.

    Args:
        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        with_answer_rate: sampling rate of contexts with answer

    Attributes:
        with_answer_rate: sampling rate of contexts with answer
        seed: the seed passed to the constructor
        np_random: ``numpy.random.RandomState`` used for sampling train batches

    """

    def __init__(self, data, seed: Optional[int] = None, shuffle: bool = True, with_answer_rate: float = 0.666,
                 *args, **kwargs) -> None:
        self.with_answer_rate = with_answer_rate
        self.seed = seed
        self.np_random = np.random.RandomState(seed)
        super().__init__(data, seed, shuffle, *args, **kwargs)

    @staticmethod
    def _answer_fields(context: dict) -> Tuple[List[str], List[int]]:
        """Return ``(answer_text, answer_start)`` lists of a context, ``([''], [-1])`` if it has no answer."""
        answers = context['answer']
        if len(answers) > 0:
            return [ans['text'] for ans in answers], [ans['answer_start'] for ans in answers]
        return [''], [-1]

    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) \
            -> Generator[Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]], None, None]:
        """Generate batches with one sampled context per question.

        Args:
            batch_size: number of questions in a batch
            data_type: ``'train'``, ``'valid'`` or ``'test'``
            shuffle: whether to shuffle the data (in place) before batching; ``None`` means ``self.shuffle``

        Yields:
            a tuple of a batch of ``(context, question)`` and a batch of ``(answer_text, answer_start)``

        Raises:
            ValueError: if a question has no contexts at all.
        """
        if shuffle is None:
            shuffle = self.shuffle

        # Train batches continue one random stream; other data types restart from the seed on every call.
        if data_type == 'train':
            rng = self.np_random
        else:
            rng = np.random.RandomState(self.seed)

        if shuffle:
            rng.shuffle(self.data[data_type])

        data = self.data[data_type]
        data_len = len(data)

        for i in range((data_len - 1) // batch_size + 1):
            batch = []
            for j in range(i * batch_size, min((i + 1) * batch_size, data_len)):
                q = data[j]['question']
                contexts = data[j]['contexts']
                ans_contexts = [c for c in contexts if len(c['answer']) > 0]
                noans_contexts = [c for c in contexts if len(c['answer']) == 0]
                # sample context with answer or without answer
                # (the random number is always drawn, so the random stream does not depend on the data)
                want_answer = rng.rand() < self.with_answer_rate or len(noans_contexts) == 0
                if want_answer and ans_contexts:
                    # select random context with answer
                    context = rng.choice(ans_contexts)
                elif noans_contexts:
                    # select random context without answer
                    # prob ~ context tfidf score
                    noans_scores = np.array([x['score'] for x in noans_contexts])
                    total = np.sum(noans_scores)
                    if total > 0:
                        noans_scores = noans_scores / total
                    else:
                        # All scores are zero: fall back to a uniform choice instead of dividing by zero.
                        noans_scores = np.full(len(noans_contexts), 1.0 / len(noans_contexts))
                    context = noans_contexts[np.argmax(rng.multinomial(1, noans_scores))]
                else:
                    raise ValueError(f'Question {q!r} has no contexts to sample from.')

                answer_text, answer_start = self._answer_fields(context)
                batch.append(((context['context'], q), (answer_text, answer_start)))
            yield tuple(zip(*batch))

    def get_instances(self, data_type: str = 'train') -> Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]]:
        """Return all ``(context, question)`` pairs of ``data_type`` with their answers.

        Contexts without an answer get ``['']`` and ``[-1]``, the same as in :meth:`gen_batches`.
        """
        data_examples = []
        for qcas in self.data[data_type]:  # question, contexts, answers
            question = qcas['question']
            for context in qcas['contexts']:
                answer_text, answer_start = self._answer_fields(context)
                data_examples.append(((context['context'], question), (answer_text, answer_start)))
        return tuple(zip(*data_examples))
@register('multi_squad_retr_iterator')
class MultiSquadRetrIterator(DataLearningIterator):
    """Dataset iterator for multiparagraph-SQuAD dataset.

    reads data from jsonl files

    With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context
    from the same article, but without an answer. Contexts without an answer are sampled from uniform distribution.
    If ``with_answer_rate`` is None than we compute actual ratio for each data example.

    It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset.
    Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. If there is
    no answer in context, then ``answer_text`` is empty string and `answer_start` is equal to -1.

    Args:
        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        with_answer_rate: sampling rate of contexts with answer
        squad_rate: sampling rate of context from squad dataset (actual rate would be with_answer_rate * squad_rate)

    Attributes:
        shuffle: whether to shuffle data during batching
        np_random: ``numpy.random.RandomState`` used for sampling train batches

    """

    def __init__(self, data, seed: Optional[int] = None, shuffle: bool = False,
                 with_answer_rate: Optional[float] = None,
                 squad_rate: Optional[float] = None, *args, **kwargs) -> None:
        self.with_answer_rate = with_answer_rate
        self.squad_rate = squad_rate
        self.seed = seed
        self.np_random = np.random.RandomState(seed)
        self.shuffle = shuffle

        self.train = data.get('train', [])
        self.valid = data.get('valid', [])
        self.test = data.get('test', [])

        self.data = {
            'train': self.train,
            'valid': self.valid,
            'test': self.test,
        }

        if self.shuffle:
            raise RuntimeError('MultiSquadRetrIterator doesn\'t support shuffling.')

    @staticmethod
    def _answer_fields(context: dict) -> Tuple[List[str], List[int]]:
        """Return ``(answer_text, answer_start)`` lists of a context, ``([''], [-1])`` if it has no answer."""
        answers = context['answer']
        if len(answers) > 0:
            return [ans['text'] for ans in answers], [ans['answer_start'] for ans in answers]
        return [''], [-1]

    def _sample_example(self, qcas: dict, rng) -> Tuple[Tuple[str, str], Tuple[List[str], List[int]]]:
        """Sample one context for a question record and build the example from it."""
        q = qcas['question']
        contexts = qcas['contexts']
        ans_contexts = [c for c in contexts if len(c['answer']) > 0]
        noans_contexts = [c for c in contexts if len(c['answer']) == 0]
        ans_clen = len(ans_contexts)
        noans_clen = len(noans_contexts)
        # sample context with answer or without answer
        with_answer_rate = self.with_answer_rate
        if with_answer_rate is None:
            with_answer_rate = 1.0 if noans_clen == 0 else ans_clen / (ans_clen + noans_clen)

        # The random number is always drawn, so the random stream does not depend on the data.
        want_answer = rng.rand() < with_answer_rate or noans_clen == 0
        if want_answer and ans_clen > 0:
            # select random context with answer
            if self.squad_rate is not None:
                if rng.rand() < self.squad_rate or ans_clen == 1:
                    # first context is always from squad dataset
                    context = ans_contexts[0]
                else:
                    context = rng.choice(ans_contexts[1:])
            else:
                context = rng.choice(ans_contexts)
        elif noans_clen > 0:
            # select random context without answer
            context = rng.choice(noans_contexts)
        else:
            raise ValueError(f'Question {q!r} has no contexts to sample from.')

        answer_text, answer_start = self._answer_fields(context)
        return (context['context'], q), (answer_text, answer_start)

    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) \
            -> Generator[Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]], None, None]:
        """Generate batches by reading the jsonl file of ``data_type`` line by line.

        Args:
            batch_size: number of questions in a batch
            data_type: ``'train'``, ``'valid'`` or ``'test'``
            shuffle: must be falsy; ``None`` means ``self.shuffle``

        Yields:
            a tuple of a batch of ``(context, question)`` and a batch of ``(answer_text, answer_start)``

        Raises:
            RuntimeError: if shuffling is requested.
            ValueError: if ``batch_size`` is not positive or a question has no contexts.
        """
        if shuffle is None:
            shuffle = self.shuffle

        # Train batches continue one random stream; other data types restart from the seed on every call.
        if data_type == 'train':
            rng = self.np_random
        else:
            rng = np.random.RandomState(self.seed)

        if shuffle:
            raise RuntimeError('MultiSquadRetrIterator doesn\'t support shuffling.')
        if batch_size <= 0:
            # A non-positive batch size would never consume the file.
            raise ValueError(f'batch_size must be positive, got {batch_size}')

        datafile = self.data[data_type]
        if not datafile:
            # No file was provided for this data type (the constructor default is an empty list).
            return

        batch = []
        with datafile.open('r', encoding='utf8') as fin:
            for line in fin:
                if not line.strip():
                    continue
                batch.append(self._sample_example(json.loads(line), rng))
                if len(batch) == batch_size:
                    yield tuple(zip(*batch))
                    batch = []
        if batch:
            yield tuple(zip(*batch))

    def get_instances(self, data_type: str = 'train') -> Tuple[Tuple[Tuple[str, str]], Tuple[List[str], List[int]]]:
        """Return all ``(context, question)`` pairs of ``data_type`` with their answers.

        The data of ``data_type`` may be a single path-like object (as ``gen_batches`` expects) or an
        iterable of them. Contexts without an answer get ``['']`` and ``[-1]``, the same as in
        :meth:`gen_batches`.
        """
        source = self.data[data_type]
        files = [source] if hasattr(source, 'open') else list(source)

        data_examples = []
        for f in files:
            with f.open('r', encoding='utf8') as fin:
                for line in fin:
                    if not line.strip():
                        continue
                    qcas = json.loads(line)  # question, contexts, answers
                    question = qcas['question']
                    for context in qcas['contexts']:
                        answer_text, answer_start = self._answer_fields(context)
                        data_examples.append(((context['context'], question), (answer_text, answer_start)))
        return tuple(zip(*data_examples))
@register('typos_iterator')
class TyposDatasetIterator(DataLearningIterator):
    """Implementation of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for training
     :class:`~deeppavlov.models.spelling_correction.brillmoore.ErrorModel`

    """

    def split(self, test_ratio: float = 0., *args, **kwargs):
        """Pool all data, shuffle it and split it into train and test; the validation part becomes empty.

        Args:
            test_ratio: ratio of test data to train, from 0. to 1.
        """
        self.train += self.valid + self.test
        n_test = int(len(self.train) * test_ratio)

        self.random.shuffle(self.train)

        shuffled = self.train
        self.test, self.train = shuffled[:n_test], shuffled[n_test:]
        self.valid = []
@register('basic_classification_reader')
class BasicClassificationDatasetReader(DatasetReader):
    """
    Class provides reading dataset in .csv format
    """

    def read(self, data_path: str, url: str = None,
             format: str = "csv", class_sep: str = None,
             *args, **kwargs) -> dict:
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
        data_path will be read)

        Args:
            data_path: directory with files
            url: download data files if data_path not exists or empty
            format: extension of files. Set of Values: ``"csv", "json"``
            class_sep: string separator of labels in column with labels.
                Default: None -> only one class per sample
            train, valid, test (str): file names of the data parts. Default: ``"<part>.<format>"``;
                ``None`` skips the part
            x (str or list): name(s) of the column(s) with inputs. Default: ``"text"``
            y (str): name of the column with labels. Default: ``"labels"``
            sep (str): delimeter for ``"csv"`` files
            header (int): row number to use as the column names
            names (array): list of column names to use
            orient (str): indication of expected JSON string format
            lines (boolean): read the file as a json object per line. Default: ``False``

        Returns:
            dictionary with types from data_types.
            Each field of dictionary is a list of tuples (x_i, y_i)
        """
        data_types = ["train", "valid", "test"]

        # The default name follows ``format``, exactly like the names used for reading below.
        train_file = kwargs.get('train', 'train.{}'.format(format))

        if train_file is not None and not Path(data_path, train_file).exists():
            if url is None:
                raise Exception(
                    "data path {} does not exist or is empty, and download url parameter not specified!".format(
                        data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url, dest_file_path=Path(data_path, train_file))

        x = kwargs.get("x", "text")
        y = kwargs.get('y', 'labels')

        data = {"train": [],
                "valid": [],
                "test": []}
        for data_type in data_types:
            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
            if file_name is None:
                continue

            file = Path(data_path).joinpath(file_name)
            if not file.exists():
                log.warning("Cannot find {} file".format(file))
                continue

            df = self._load_frame(file, format, kwargs)
            data[data_type] = self._to_samples(df, x, y, class_sep)

        return data

    @staticmethod
    def _load_frame(file: Path, format: str, kwargs: dict) -> pd.DataFrame:
        """Read ``file`` with pandas, forwarding only the reader options present in ``kwargs``."""
        if format == 'csv':
            keys = ('sep', 'header', 'names')
            reader = pd.read_csv
        elif format == 'json':
            keys = ('orient', 'lines')
            reader = pd.read_json
        else:
            raise Exception('Unsupported file format: {}'.format(format))
        options = {k: kwargs[k] for k in keys if k in kwargs}
        return reader(file, **options)

    @staticmethod
    def _to_samples(df: pd.DataFrame, x, y, class_sep) -> list:
        """Turn data frame rows into ``(x, y)`` samples.

        ``x`` of a sample is a list of values if ``x`` is a list of columns and a single value otherwise;
        ``y`` is the label as a string, split by ``class_sep`` into a list if ``class_sep`` is given.
        """
        samples = []
        for _, row in df.iterrows():
            features = [row[x_] for x_ in x] if isinstance(x, list) else row[x]
            label = str(row[y])
            samples.append((features, label if class_sep is None else label.split(class_sep)))
        return samples
@register('boolqa_reader')
class BoolqaReader(DatasetReader):
    """
    The class to read the BoolQ dataset from files.
    BoolQ is a question answering dataset for yes/no questions containing 15942 examples.
    Each example is a triplet of (question, passage, answer).

    More details about the English BoolQ are available in
    https://arxiv.org/abs/1905.10044
    https://github.com/google-research-datasets/boolean-questions

    The details about the Russian DaNetQA are available in
    https://russiansuperglue.com/ru/tasks/task_info/DaNetQA

    The reader supports English and Russian variants of the dataset.
    The config example is boolqa_rubert.json.
    """

    urls = {
        'en': 'http://files.deeppavlov.ai/datasets/BoolQ.tar.gz',
        'ru': 'http://files.deeppavlov.ai/datasets/DaNetQA.tar.gz'
    }

    # Name of the field that holds the label in the files of each language.
    _label_keys = {'ru': 'label', 'en': 'answer'}

    def read(self,
             data_path: str,
             language: str = 'en',
             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        """
        Reads BoolQ dataset from files.

        Args:
            data_path: A path to a folder with dataset files. If the folder does not exist,
                it is created and the dataset is downloaded into it.
            language: The dataset language ('ru', 'en' are available)

        Returns:
            dataset: items of the dataset [(question, passage), label], keyed by 'train' and 'valid'

        Raises:
            RuntimeError: if there is no dataset for ``language``.
        """
        if language in self.urls:
            self.url = self.urls[language]
        else:
            raise RuntimeError(f'The dataset for {language} is unavailable')

        data_path = expand_path(data_path)
        if not data_path.exists():
            data_path.mkdir(parents=True)
            download_decompress(self.url, data_path)

        dataset = {}
        for filename in ['train.jsonl', 'valid.jsonl']:
            dataset[filename.split('.')[0]] = self._build_data(language, data_path / filename)
        return dataset

    @staticmethod
    def _build_data(ln: str, data_path: Path) -> List[Tuple[Tuple[str, str], int]]:
        """Read a jsonl file into a list of ``((question, passage), label)`` items.

        Records without a label are skipped. A repeated ``(question, passage)`` pair keeps its first
        position and its last label.
        """
        label_key = {'ru': 'label', 'en': 'answer'}.get(ln)
        data = {}
        # The encoding is given explicitly: the platform default is not always able to decode the data.
        with open(data_path, 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                jline = json.loads(line)
                if label_key is not None and label_key in jline:
                    data[jline['question'], jline['passage']] = int(jline[label_key])
        return list(data.items())
Path(data_path) files = list(data_path.glob('*.txt')) if 'train.txt' not in {file_path.name for file_path in files}: if dataset_name == 'conll2003': url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz' elif dataset_name == 'collection_rus': url = 'http://files.deeppavlov.ai/deeppavlov_data/collection3_v2.tar.gz' elif dataset_name == 'ontonotes': url = 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz' elif dataset_name == 'vlsp2016': url = 'http://files.deeppavlov.ai/deeppavlov_data/vlsp2016.tar.gz' elif dataset_name == 'dailydialog': url = 'http://files.deeppavlov.ai/deeppavlov_data/dailydialog.tar.gz' elif dataset_name == 'collection3': url = 'http://files.deeppavlov.ai/deeppavlov_data/collection3_anh.tar.gz' else: raise RuntimeError('train.txt not found in "{}"'.format(data_path)) data_path.mkdir(exist_ok=True, parents=True) download_decompress(url, data_path) files = list(data_path.glob('*.txt')) dataset = {} for file_name in files: name = file_name.with_suffix('').name dataset[name] = self.parse_ner_file(file_name) return dataset def parse_ner_file(self, file_name: Path): samples = [] with file_name.open(encoding='utf8') as f: tokens = [] pos_tags = [] chunk_tags = [] tags = [] expected_items = 2 + int(self.provide_pos) + int(self.provide_chunk) for line in f: # Check end of the document if 'DOCSTART' in line: if len(tokens) > 1: x = tokens if not self.x_is_tuple else (tokens,) if self.provide_pos: x = x + (pos_tags,) if self.provide_chunk: x = x + (chunk_tags,) if self.provide_doc_ids: x = x + (self.num_docs,) samples.append((x, tags)) tokens = [] pos_tags = [] chunk_tags = [] tags = [] self.num_docs += 1 if self.docstart_token is not None: tokens = [self.docstart_token] pos_tags = ['O'] chunk_tags = ['O'] tags = ['O'] elif len(line) < 2: if (len(tokens) > 0) and (tokens != [self.docstart_token]): x = tokens if not self.x_is_tuple else (tokens,) if self.provide_pos: x = x + (pos_tags,) if self.provide_chunk: x = x + 
(chunk_tags,) if self.provide_doc_ids: x = x + (self.num_docs,) samples.append((x, tags)) tokens = [] pos_tags = [] chunk_tags = [] tags = [] else: items = line.split() if len(items) < expected_items: raise Exception(f"Input is not valid {line}") tokens.append(items[0]) tags.append(items[-1]) if self.provide_pos: pos_tags.append(items[1]) if self.provide_chunk: chunk_tags.append(items[2]) if tokens: x = tokens if not self.x_is_tuple else (tokens,) if self.provide_pos: x = x + (pos_tags,) if self.provide_chunk: x = x + (chunk_tags,) if self.provide_doc_ids: x = x + (self.num_docs,) samples.append((x, tags)) self.num_docs += 1 if self.iob: return [(x, self._iob2_to_iob(tags)) for x, tags in samples] if self.iobes: return [(x, self._iob2_to_iobes(tags)) for x, tags in samples] return samples @staticmethod def _iob2_to_iob(tags): iob_tags = [] for n, tag in enumerate(tags): if tag.startswith('B-') and (not n or (tags[n - 1][2:] != tag[2:])): tag = tag.replace("B-", "I-") iob_tags.append(tag) return iob_tags @staticmethod def _iob2_to_iobes(tags): tag_map = {"BB": "S", "BO": "S", "IB": "E", "IO": "E"} tags = tags + ["O"] iobes_tags = [] for i in range(len(tags) - 1): tagtag = tags[i][0] + tags[i + 1][0] if tagtag in tag_map: iobes_tags.append(tag_map[tagtag] + tags[i][1:]) else: iobes_tags.append(tags[i]) return iobes_tags ================================================ FILE: deeppavlov/dataset_readers/docred_reader.py ================================================ # Copyright 2021 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import itertools import json import os import random from logging import getLogger from pathlib import Path from typing import Dict, List, Tuple, Union import numpy as np import pandas as pd from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.registry import register from deeppavlov.core.data.dataset_reader import DatasetReader logger = getLogger(__name__) @register('docred_reader') class DocREDDatasetReader(DatasetReader): """ Class to read the datasets in DocRED format""" def read( self, data_path: str, rel2id_path: str, rel_info_path: str, negative_label: str = "Na", train_valid_test_proportion: int = None, valid_test_data_size: int = None, generate_additional_neg_samples: bool = False, num_neg_samples: int = None ) -> Dict[str, List[Tuple]]: """ This class processes the DocRED relation extraction dataset (https://arxiv.org/abs/1906.06127v3). Args: data_path: a path to a folder with dataset files. rel2id_path: a path to a file where information about relation to relation id corresponding is stored. rel_info_path: a path to a file where information about relations and their real names is stored negative_label: a label which will be used as a negative one (by default in DocRED: "Na") train_valid_test_proportion: a proportion in which the data will be splitted into train, valid and test sets valid_test_data_size: absolute amount of dev & test sets generate_additional_neg_samples: boolean; whether to generate additional negative samples or not. num_neg_samples: a number of additional negative samples that will be generated for each positive sample. 
Returns: DocRED output dictionary in the following format: {"data_type": List[ Tuple( List[ List[all tokens of the document], List[ List[Tuple(start pos of mention 1 of ent 1, end pos of mention 1 of ent 1), ...], List[Tuple(start position of entity 2, end position of entity 2), ...], List[str(NER tag of entity 1), str(NER tag of entity 2)] ], List(int(one-hot encoded relation label)) ) ] } """ with open(str(expand_path(rel2id_path))) as file: self.rel2id = json.load(file) self.id2rel = {value: key for key, value in self.rel2id.items()} with open(str(expand_path(rel_info_path))) as file: self.relid2rel = json.load(file) self.rel2relid = {value: key for key, value in self.relid2rel.items()} self.negative_label = negative_label self.if_add_neg_samples = generate_additional_neg_samples self.num_neg_samples = num_neg_samples if self.if_add_neg_samples and not self.num_neg_samples: raise ValueError("Please provide a number of negative samples to be generated!") if train_valid_test_proportion and valid_test_data_size: raise ValueError( f"The train, valid and test splitting should be done either basing on their proportional values to each" f"other (train_valid_test_proportion parameter), or on the absolute size of valid and test data " f"(valid_test_data_size parameter). They can't be used simultaneously." ) self.train_valid_test_proportion = train_valid_test_proportion self.valid_test_data_size = valid_test_data_size data_path = Path(data_path).resolve() with open(os.path.join(data_path, "train_annotated.json")) as file_ann: train_data = json.load(file_ann) with open(os.path.join(data_path, "dev.json")) as file: valid_data = json.load(file) # if you want to use test data from the original docred without labels (e.g. 
as negatives...), # uncomment lines below # with open(os.path.join(data_path, "test.json")) as file: # test_data = json.load(file) # test_processed = self.process_docred_file(test_data, neg_samples=None) # merge valid and train data and split them again into train, valid & test if self.train_valid_test_proportion: train_data, test_data, valid_data = self.split_by_relative(list(train_data + valid_data)) elif self.valid_test_data_size: train_data, test_data, valid_data = self.split_by_absolute(list(train_data + valid_data)) else: raise ValueError( f"The train, valid and test splitting should be done either basing on their proportional values to each" f"other (train_valid_test_proportion parameter), or on the absolute size of valid and test data " f"(valid_test_data_size parameter). One of them should be set to the not-None value." ) logger.info("Train data processing...") train_data, train_stat = self.process_docred_file(train_data, neg_samples="twice") logger.info("Valid data processing...") valid_data, valid_stat = self.process_docred_file(valid_data, neg_samples="equal") logger.info("Test data processing...") test_data, test_stat = self.process_docred_file(test_data, neg_samples="equal") self.print_statistics(train_stat, valid_stat, test_stat) data = {"train": train_data, "valid": valid_data, "test": test_data} return data def split_by_absolute(self, all_labeled_data: List) -> Tuple[List, List, List]: """ All annotated data from DocRED is splitted into train, valid and test sets in following proportions: len(valid_data) = len(test_data) = self.valid_test_data_size len(train_data) = len(all data) - 2 * self.valid_test_data_size Args: all_labeled_data: List of all annotated data samples Return: Lists of train, valid and test data """ if (int(self.valid_test_data_size) * 3) > len(all_labeled_data): raise ValueError( f"The dataset size {len(all_labeled_data)} is too small for taking {self.valid_test_data_size} samples" f"for valid and test. 
Reduce the size of valid and test set." ) random.shuffle(all_labeled_data) valid_data = all_labeled_data[:int(self.valid_test_data_size)] test_data = all_labeled_data[int(self.valid_test_data_size) + 1: 2 * int(self.valid_test_data_size)] train_data = all_labeled_data[2 * int(self.valid_test_data_size) + 1:] return train_data, valid_data, test_data def split_by_relative(self, all_labeled_data: List) -> Tuple[List, List, List]: """ All annotated data from DocRED is splitted into train, valid and test sets in following proportions: len(train_data) = train_valid_test_proportion * len(valid_data) = train_valid_test_proportion * len(test_data) """ random.shuffle(all_labeled_data) one_prop = int(len(all_labeled_data)/int(self.train_valid_test_proportion)) valid_data = all_labeled_data[:one_prop] test_data = all_labeled_data[one_prop + 1: 2 * one_prop] train_data = all_labeled_data[2 * one_prop + 1:] return train_data, valid_data, test_data def process_docred_file(self, data: List[Dict], neg_samples: str = None) -> Tuple[List, Dict]: """ Processes a DocRED data and returns a DeepPavlov relevant output Args: data: List of data units neg_samples: how many negative samples are to be generated Possible values: - None: no negative samples will be generated (relevant to the test set which has from neg samples only) - equal: there will be one negative sample pro positive sample - twice: there will be twice as many negative samples as positive ones - thrice: there will be thrice as many negative samples as positive ones Returns: one list of processed documents """ stat_rel_name = {rel_name: 0 for _, rel_name in self.relid2rel.items()} self.stat = {"POS_REL": 0, "NEG_REL": 0} # collect statistics of positive and negative samples processed_data_samples = [] for data_unit in data: ent_ids2ent_pos, ent_ids2ent_text, ent_ids2ent_tag = {}, {}, {} # get list of all tokens from the document doc = [token for sent in data_unit["sents"] for token in sent] # the sentence start indices are 
needed for entities' indices recalculation to the whole text sents_begins = list(np.cumsum([0] + [len(sent) for sent in data_unit["sents"]])) for ent_set_id, ent_set in enumerate(data_unit["vertexSet"]): ent_ids2ent_pos[ent_set_id], ent_ids2ent_text[ent_set_id], ent_ids2ent_tag[ent_set_id] = [], [], [] for ent in ent_set: # the list of tuples with each entity's new indices (recalculated regarding to the whole doc) ent_ids2ent_pos[ent_set_id].append( ((ent["pos"][0] + sents_begins[ent["sent_id"]]), (ent["pos"][1] + sents_begins[ent["sent_id"]])) ) # also save entity id to entity as exact text mentions correspondence ent_ids2ent_text[ent_set_id].append(ent["name"]) # get the sample NER tag (logically, the same for all entity mentions) ent_ids2ent_tag[ent_set_id] = ent_set[0]["type"] ent_ids2ent_text[ent_set_id] = list(set(ent_ids2ent_text[ent_set_id])) # if no labels are provided for the data, handle all samples as negative ones if "labels" not in data_unit: processed_data_samples += self.construct_neg_samples(ent_ids2ent_pos, ent_ids2ent_tag, doc) # if labels are provided, save samples as positive samples and generate negatives else: labels = data_unit["labels"] curr_processed_data_samples, stat_rel_name = self.construct_pos_neg_samples( labels, ent_ids2ent_pos, ent_ids2ent_tag, doc, stat_rel_name, neg_samples=neg_samples, ) processed_data_samples += curr_processed_data_samples logger.info(f"Pos samples: {self.stat['POS_REL']} Neg samples: {self.stat['NEG_REL']}.") self.stat.pop("POS_REL") self.stat.pop("NEG_REL") return processed_data_samples, stat_rel_name def construct_pos_neg_samples( self, labels: List, ent_id2ent: Dict, ent_id2ent_tag: Dict, doc: List, stat_rel: Dict, neg_samples: str, ) -> Tuple[List, Dict]: """ Transforms the relevant information into an entry of the DocRED reader output. 
The entities between which the relation is hold will serve as an annotation for positive samples, while all other entity pairs will be used to construct the negative samples. Args: labels: information about relation found in a document (whole labels list of the original DocRED) ent_id2ent: a dictionary {entity id: [entity mentions' positions]} stat_rel: a dictionary with relation statistics (will be updated) neg_samples: amount of negative samples that are to be generated ent_id2ent_tag: a dictionary {entity id: entity NER tag} doc: list of all tokens of the document Returns: a tuple with list of all doc tokens, entity information (positions & NER tags) and relation. """ num_pos_samples, num_neg_samples = 0, 0 data_samples = [] rel_triples = {} for label_info in labels: entity1_id, entity2_id = label_info["h"], label_info["t"] if (entity1_id, entity2_id) in rel_triples: rel_triples[(entity1_id, entity2_id)].append(self.rel2id[label_info['r']]) else: rel_triples[(entity1_id, entity2_id)] = [self.rel2id[label_info['r']]] # the one hot encoding of the negative label neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]]) # iterate over all entities for (ent1, ent2) in itertools.permutations(ent_id2ent, 2): # if there is a relation hold between entities, save them (and a corresponding sample) as positive one if (ent1, ent2) in rel_triples: num_pos_samples += 1 labels = rel_triples[(ent1, ent2)] label_one_hot = self.label_to_one_hot(labels) data_samples.append( self.generate_data_sample(doc, ent1, ent2, label_one_hot, ent_id2ent, ent_id2ent_tag) ) self.stat["POS_REL"] += 1 for label in labels: rel_name = self.relid2rel[self.id2rel[label]] stat_rel[rel_name] += 1 else: if not neg_samples: # if no negative samples should be generated, skip continue # if there is no relation hold between entities, save them (and a corresponding sample) as negative one if neg_samples == "equal" and num_neg_samples < num_pos_samples: num_neg_samples += 1 
data_samples.append( self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag) ) self.stat["NEG_REL"] += 1 elif neg_samples == "twice" and num_neg_samples < 2 * num_pos_samples: num_neg_samples += 1 data_samples.append( self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag) ) self.stat["NEG_REL"] += 1 elif neg_samples == "thrice" and num_neg_samples < 3 * num_pos_samples: num_neg_samples += 1 data_samples.append( self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag) ) self.stat["NEG_REL"] += 1 return data_samples, stat_rel def construct_neg_samples( self, ent_id2ent: Dict, ent_id2ent_tag: Dict, doc: List ) -> List[Tuple[Tuple[List, List], List]]: """ Turn the annotated documents but without any positive relation label to the negative samples in a format of the DocRED reader output. Args: ent_id2ent: a dictionary {entity id: [entity mentions' positions]} ent_id2ent_tag: a dictionary {entity id: entity NER tag} doc: list of all tokens of the document Returns: a tuple with list of all doc tokens, entity information (positions & NER tags) and relation (=neg_label). """ neg_data_samples = [] neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]]) for ent1, ent2 in itertools.permutations(ent_id2ent.keys(), 2): neg_data_samples.append( self.generate_data_sample(doc, ent1, ent2, neg_label_one_hot, ent_id2ent, ent_id2ent_tag) ) self.stat["NEG_REL"] += 1 return neg_data_samples @staticmethod def generate_data_sample( doc: List, ent1: int, ent2: int, label: List, ent_id2ent: Dict, ent_id2ent_tag: Dict ) -> Tuple[List[Union[List, List]], List]: """ Creates an entry of processed docred corpus """ return ( [ doc, [ent_id2ent[ent1], ent_id2ent[ent2]], [ent_id2ent_tag[ent1], ent_id2ent_tag[ent2]] ], label ) def generate_additional_neg_samples(self, doc: List, forbidden_entities: List, num_neg_samples: int): """ Generated negative samples, i.e. 
the same document that is used for positive samples, but labeled with "no_relation" label and with entities, that are not connected with any relation, marked as such. Args: doc: list of positive sentences forbidden_entities: list of entities that participate in any of the relations (and, therefore, cannot be chosen for negative sample) num_neg_samples: number of negative samples that are to be generated out of this document Returns: a tuple with list of all doc tokens, entity information (positions & NER tags) and relation (=neg_label). """ # ATTENTION! To make it work, please run the following command: python3 -m deeppavlov install ner_ontonotes_bert from deeppavlov import build_model, configs ner = build_model(configs.ner.ner_ontonotes_bert_mult, download=True) neg_data_samples = [] analysed_sentences = ner([" ".join(doc)]) # returns [[[tokens]], [[ner tags]]] # select ids of tokens that were not part of any relation so far neg_entities_idx = random.sample( [ent_idx for ent_idx in range(len(analysed_sentences[0][0])) if analysed_sentences[0][0][ent_idx] not in forbidden_entities], num_neg_samples * 2 ) # the one hot encoding of the negative label neg_label_one_hot = self.label_to_one_hot([self.rel2id[self.negative_label]]) for n_ent_1_idx, n_ent_2_idx in itertools.permutations(neg_entities_idx, 2): # if already sufficient number of negative samples have been generated if len(neg_data_samples) == num_neg_samples: break neg_entity_1 = analysed_sentences[0][0][n_ent_1_idx] neg_entity_2 = analysed_sentences[0][0][n_ent_2_idx] neg_entity_1_tag = analysed_sentences[1][0][n_ent_1_idx] neg_entity_2_tag = analysed_sentences[1][0][n_ent_2_idx] neg_data_samples.append( (doc, [[neg_entity_1], [neg_entity_2], neg_entity_1_tag, neg_entity_2_tag], neg_label_one_hot) ) self.stat["NEG_REL"] += 1 return neg_data_samples def label_to_one_hot(self, labels: List[int]) -> List: """ Turn labels to one hot encodings """ relation = [0] * len(self.rel2id) for label in labels: 
relation[label] = 1 return relation def print_statistics(self, train_stat: Dict, valid_stat: Dict, test_stat: Dict) -> None: """ Print out the relation statistics as a markdown table """ df = pd.DataFrame([self.rel2relid, train_stat, valid_stat, test_stat]).T df.columns = ['d{}'.format(i) for i, col in enumerate(df, 1)] logger.info("\n") logger.info(df) ================================================ FILE: deeppavlov/dataset_readers/faq_reader.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, softwaredata # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Dict from pandas import read_csv from deeppavlov.core.common.registry import register from deeppavlov.core.data.dataset_reader import DatasetReader @register('faq_reader') class FaqDatasetReader(DatasetReader): """Reader for FAQ dataset""" def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict: """ Read FAQ dataset from specified csv file or remote url Parameters: data_path: path to csv file of FAQ data_url: url to csv file of FAQ x_col_name: name of Question column in csv file y_col_name: name of Answer column in csv file Returns: A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys. 
""" if data_url is not None: data = read_csv(data_url) elif data_path is not None: data = read_csv(data_path) else: raise ValueError("Please specify data_path or data_url parameter") x = data[x_col_name] y = data[y_col_name] train_xy_tuples = [(x[i].strip(), y[i].strip()) for i in range(len(x))] dataset = dict() dataset["train"] = train_xy_tuples dataset["valid"] = [] dataset["test"] = [] return dataset ================================================ FILE: deeppavlov/dataset_readers/huggingface_dataset_reader.py ================================================ # Copyright 2020 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import re from collections import Counter from math import floor from typing import Dict, Optional, List, Union from datasets import load_dataset, Dataset, Features, ClassLabel, concatenate_datasets from deeppavlov.core.common.registry import register from deeppavlov.core.data.dataset_reader import DatasetReader @register('huggingface_dataset_reader') class HuggingFaceDatasetReader(DatasetReader): """Adds HuggingFace Datasets https://huggingface.co/datasets/ to DeepPavlov """ def read(self, path: str, name: Optional[str] = None, train: Optional[str] = None, # for lidirus with no train valid: Optional[str] = None, test: Optional[str] = None, **kwargs) -> Dict[str, Dataset]: """Wraps datasets.load_dataset method Args: path: datasets.load_dataset path argument (e.g., `glue`) name: datasets.load_dataset name argument (e.g., `mrpc`) train: split name to use as training data. valid: split name to use as validation data. test: split name to use as test data. Returns: Dict[str, List[Dict]]: Dictionary with train, valid, test datasets """ if 'split' in kwargs: raise RuntimeError('Split argument was used. Use train, valid, test arguments instead of split.') # pop elements not relevant to BuilderConfig downsample_ratio: Union[List[float], float] = kwargs.pop("downsample_ratio", 1.) 
seed = kwargs.pop("seed", 42) percentage = kwargs.pop("dev_percentage", 50) do_index_correction = kwargs.pop("do_index_correction", True) split_mapping = {'train': train, 'valid': valid, 'test': test} # filter unused splits split_mapping = {el: split_mapping[el] for el in split_mapping if split_mapping[el]} if isinstance(downsample_ratio, float): downsample_ratio = [downsample_ratio] * len(split_mapping) elif isinstance(downsample_ratio, list) and len(downsample_ratio) != len(split_mapping): raise ValueError("The number of downsample ratios must be the same as the number of splits") if path == "russian_super_glue" and "_mixed" in name: name = name.replace("_mixed", "") dataset = load_dataset(path=path, name=name, split=list(split_mapping.values()), **kwargs) if (path == "super_glue" and name == "copa") or (path == "russian_super_glue" and name == "parus"): lang = "en" if name == "copa" else "ru" dataset = [ dataset_split.map(preprocess_copa, batched=True, fn_kwargs={"lang": lang}) for dataset_split in dataset ] elif path == "super_glue" and name == "boolq": # danetqa doesn't require the same preprocessing dataset = load_dataset(path=path, name=name, split=interleave_splits(splits=list(split_mapping.values()), percentage=percentage), **kwargs) dataset = [dataset_split.map(preprocess_boolq, batched=True) for dataset_split in dataset] elif (path == "super_glue" and name == "record") or (path == "russian_super_glue" and name == "rucos"): label_column = "label" dataset = [ binary_downsample( add_label_names( dataset_split.map(preprocess_record, batched=True, remove_columns=["answers"]), label_column=label_column, label_names=["False", "True"] ), ratio=ratio, seed=seed, label_column=label_column, do_correction=do_index_correction ).map(add_num_examples, batched=True, batch_size=None) for dataset_split, ratio in zip(dataset, downsample_ratio) ] elif (path == "super_glue" and name == "multirc") or (path == "russian_super_glue" and name == "muserc"): dataset = [ 
dataset_split.map( preprocess_multirc, batched=True, remove_columns=["paragraph", "question"] ) for dataset_split in dataset ] elif (path == "super_glue" and name == "wsc") or (path == "russian_super_glue" and name == "rwsd"): dataset = [ dataset_split.map( preprocess_wsc, batched=True, remove_columns=["span1_index", "span2_index", "span1_text", "span2_text"], ) for dataset_split in dataset ] elif path == "russian_super_glue" and name == "terra_mixed" and "train" in list(split_mapping.values()): tmp_dataset = [] for d, split in zip(dataset, split_mapping.values()): if split == "train": to_mix = load_dataset("super_glue", "rte", split="train") combined_train = concatenate_datasets([to_mix, d]) tmp_dataset.append(combined_train) else: tmp_dataset.append(d) dataset = tmp_dataset elif path == "russian_super_glue" and name == "rcb_mixed" and "train" in list(split_mapping.values()): tmp_dataset = [] for d, split in zip(dataset, split_mapping.values()): if split == "train": to_mix = load_dataset("super_glue", "cb", split="train") combined_train = concatenate_datasets([to_mix, d.remove_columns(["verb", "negation"])]) tmp_dataset.append(combined_train) else: tmp_dataset.append(d.remove_columns(["verb", "negation"])) dataset = tmp_dataset elif path == "russian_super_glue" and name == "danetqa_mixed" and "train" in list(split_mapping.values()): tmp_dataset = [] for d, split in zip(dataset, split_mapping.values()): if split == "train": to_mix = load_dataset( "super_glue", "boolq", split="train" ).map( preprocess_boolq, batched=True ).cast(d.features) combined_train = concatenate_datasets([to_mix, d]) tmp_dataset.append(combined_train) else: tmp_dataset.append(d) dataset = tmp_dataset return dict(zip(split_mapping.keys(), dataset)) def interleave_splits(splits: List[str], percentage: int = 50) -> List[str]: """Adds a portion of `dev` (or, `test` if there's only `train` and `test`) set to the `train` set. 
Assumes that there are at two splits are passed ordered as (train, dev, test). Args: splits: list of strings percentage: percentage (represented as an integer value between 0 and 100) of samples to extract from `dev` and add to `train` Returns: List[str] containing mixing instructions (e.g. ['train+validation[:50%]', 'validation[-50%:]']) """ if len(splits) < 2: raise ValueError("At least two splits should be passed to this function") mixed_splits = [f"{splits[0]}+{splits[1]}[:{percentage}%]", f"{splits[1]}[-{percentage}%:]"] if len(splits) == 3: mixed_splits += [splits[2]] return mixed_splits def preprocess_copa(examples: Dataset, *, lang: str = "en") -> Dict[str, List[List[str]]]: """COPA preprocessing to be applied by the map function. Args: examples: an instance of Dataset class lang: task language. Either `en` or `ru`. Returns: Dict[str, List[List[str]]]: processed features represented as nested list with number of elements corresponding to the number of choices (2 in this case) """ if lang == "en": question_dict = { "cause": "What was the cause of this?", "effect": "What happened as a result?", } elif lang == "ru": question_dict = { "cause": "Что было причиной этого?", "effect": "Что случилось в результате?", } else: raise ValueError(f"Incorrect `lang` value '{lang}'. Should be either 'en' or 'ru'.") num_choices = 2 questions = [question_dict[question] for question in examples["question"]] premises = examples["premise"] contexts = [f"{premise} {question}" for premise, question in zip(premises, questions)] contexts = [[context] * num_choices for context in contexts] choices = [[choice1, choice2] for choice1, choice2 in zip(examples["choice1"], examples["choice2"])] return {"contexts": contexts, "choices": choices} def preprocess_boolq(examples: Dataset) -> Dict[str, List[str]]: """BoolQ preprocessing to be applied by the map function. The preprocessing boils down to removing redundant titles from the passages. 
Args: examples: an instance of Dataset class Returns: Dict[str, List[str]]: processed features (just the passage in this case) """ def remove_passage_title(passage: str) -> str: """Removes the title of a given passage. The motivation is that the title duplicates the beginning of the text body, which means that it's redundant. We remove to save space. Args: passage: a single `passage` string Returns: str: the same `passage` string with the title removed """ return re.sub(r"^.+-- ", "", passage) passages = [remove_passage_title(passage) for passage in examples["passage"]] return {"passage": passages} def preprocess_record(examples: Dataset, *, clean_entities: bool = True) -> Dict[str, Union[List[str], List[int]]]: """ReCoRD preprocessing to be applied by the map function. This transforms the original nested structure of the dataset into a flat one. New indices are generated to allow for the restoration of the original structure. The resulting dataset amounts to a binary classification problem. Args: examples: an instance of Dataset class clean_entities: a boolean flag indicating whether to clean-up given entities Returns: Dict[str, Union[List[str], List[int]]]: flattened features of the dataset """ def fill_placeholder(sentence: str, candidate: str) -> str: """Fills `@placeholder` of a given query with the provided entity Args: sentence: query to fill candidate: entity candidate for the query Returns: str: `sentence` with `@placeholder` replaced with `candidate` """ return re.sub(r"@placeholder", candidate.replace("\\", ""), sentence) def remove_highlight(context: str) -> str: """Removes highlights from a given passage Args: context: a passage to remove highlights from Returns: str: `context` with highlights removed """ return re.sub(r"\n@highlight\n", ". 
", context) queries: List[str] = examples["query"] passages: List[str] = [remove_highlight(passage) for passage in examples["passage"]] answers: List[List[str]] = examples["answers"] entities: List[List[str]] = examples["entities"] indices: List[Dict[str, int]] = examples["idx"] if clean_entities: tmp_entities = [] for list_of_entities in entities: tmp_entities.append( list(set([entity.strip("\n ,.!") for entity in list_of_entities])) ) entities = tmp_entities tmp_answers = [] for list_of_answers in answers: tmp_answers.append( list(set([answer.strip("\n ,.!") for answer in list_of_answers])) ) answers = tmp_answers # new indices for flat examples merged_indices: List[str] = [] # queries with placeholders filled filled_queries: List[str] = [] # duplicated passages extended_passages: List[str] = [] # contains one entity per flat example flat_entities: List[str] = [] # whether the entity in this example is found in the answers (0 or 1) labels: List[int] = [] for query, passage, list_of_answers, list_of_entities, index in zip(queries, passages, answers, entities, indices): num_candidates: int = len(list_of_entities) candidate_queries: List[str] = [fill_placeholder(query, entity) for entity in list_of_entities] cur_labels: List[int] = [ int(entity in list_of_answers) if list_of_answers else -1 for entity in list_of_entities ] cur_passages: List[str] = [passage] * num_candidates # keep track of the indices to be able to use target metrics passage_index: int = index["passage"] query_index: int = index["query"] example_indices: List[str] = [f"{passage_index}-{query_index}-{num_candidates}"] * num_candidates if sum(cur_labels) != 0: merged_indices.extend(example_indices) filled_queries.extend(candidate_queries) extended_passages.extend(cur_passages) flat_entities.extend(list_of_entities) labels.extend(cur_labels) return {"idx": merged_indices, "query": filled_queries, "passage": extended_passages, "entities": flat_entities, "label": labels} def add_label_names(dataset: 
def add_label_names(dataset: Dataset, label_column: str, label_names: List[str]) -> Dataset:
    """Adds `names` to a specified `label` column.

    All labels (i.e. integers) in the dataset should be < than the number of label names.

    Args:
        dataset: a Dataset to add label names to
        label_column: the name of the label column (such as `label` or `labels`) in the dataset
        label_names: a list of label names

    Returns:
        Dataset: A copy of the passed `dataset` with added label names
    """
    new_features: Features = dataset.features.copy()
    new_features[label_column] = ClassLabel(names=label_names)
    return dataset.cast(new_features)


def binary_downsample(dataset: Dataset,
                      ratio: float = 0.,
                      seed: int = 42,
                      label_column: str = "label",
                      *,
                      do_correction: bool = True) -> Dataset:
    """Downsamples a given dataset to the specified negative to positive examples ratio.

    Only works with binary classification datasets with labels denoted as `0` and `1`.
    A dataset whose only label is `-1` (the `test` split) is returned untouched.

    Args:
        dataset: a Dataset to downsample
        ratio: negative to positive examples ratio to maintain. Non-positive values
            keep all negative examples; values asking for more negatives than the
            dataset contains are capped at the available number.
        seed: a seed for shuffling
        label_column: the name of `label` column such as 'label' or 'labels'
        do_correction: correct resampled indices. If indices aren't corrected then
            examples with mismatched indices will not be accounted for by ReCoRD
            metrics. This is not necessarily undesirable because examples with such
            indices will have fewer negative examples (or even none), which makes
            them easier for the model, thus inflating the resulting metrics.

    Returns:
        Dataset: a downsampled dataset

    Raises:
        ValueError: if the set of labels is neither `[-1]` nor `{0, 1}`
    """

    def replace_indices(data: Dataset, index_map: Dict[str, str]) -> Dict[str, List[str]]:
        """Swaps every index found in `index_map` for its corrected version."""
        return {"idx": [index_map.get(el, el) for el in data["idx"]]}

    def get_correct_indices_map(data: Dataset) -> Dict[str, str]:
        """Builds replacements for indices whose trailing element count is stale.

        The last `-`-separated part of an index is the number of examples sharing
        it; after downsampling that number may no longer be correct.
        """
        corrected_index_map: Dict[str, str] = {}
        for idx, n_elements in Counter(data["idx"]).items():
            *prefix, expected_n_elements = idx.split("-")
            if int(expected_n_elements) != n_elements:
                corrected_index_map[idx] = "-".join([*prefix, str(n_elements)])
        return corrected_index_map

    def correct_indices(data: Dataset) -> Dataset:
        """Sets the correct number of examples in downsampled indices."""
        index_map = get_correct_indices_map(data)
        return data.map(replace_indices, batched=True, fn_kwargs={"index_map": index_map})

    dataset_labels = dataset.unique(label_column)

    # `test` split shouldn't be downsampled
    if dataset_labels == [-1]:
        return dataset

    # the same logic is not applicable to cases with != 2 classes
    if set(dataset_labels) != {0, 1}:
        raise ValueError(f"Only binary classification labels are supported (i.e. [0, 1]), "
                         f"but {dataset_labels} were given")

    # positive examples are denoted with `1`
    num_positive: int = sum(dataset[label_column])
    num_available_negative: int = len(dataset) - num_positive
    if ratio > 0:
        # never request more negatives than exist, otherwise `select` goes out of range
        num_negative = min(floor(num_positive * ratio), num_available_negative)
    else:
        # the original number of negative examples is kept if `ratio` is not specified
        num_negative = num_available_negative

    # first `num_positive` examples in a sorted dataset are labeled with `1`
    # while the rest are labeled with `0`
    sorted_dataset: Dataset = dataset.sort(label_column, reverse=True)
    # but we need to reshuffle the dataset before returning it
    shuffled_dataset: Dataset = sorted_dataset.select(range(num_positive + num_negative)).shuffle(seed=seed)
    if do_correction:
        shuffled_dataset = correct_indices(shuffled_dataset)
    return shuffled_dataset
def add_num_examples(dataset: Dataset) -> Dict[str, List[int]]:
    """Attaches the size of the passed batch to each of its examples.

    The size is taken from the length of the first column, so the function must be
    mapped over the whole dataset at once (i.e. `batched=True, batch_size=None`);
    otherwise the number reflects a single batch only.

    Args:
        dataset: a batch (column name -> list of values) to count examples in

    Returns:
        Dict[str, List[int]]: the number of examples repeated once per example
    """
    first_column = next(iter(dataset))
    total = len(dataset[first_column])
    return {"num_examples": [total for _ in range(total)]}


def preprocess_multirc(examples: Dataset, *, clean_paragraphs: bool = True) -> Dict[str, List[str]]:
    """Joins every paragraph with its question into a single context string.

    Args:
        examples: a batch with `paragraph` and `question` columns
        clean_paragraphs: whether to drop sentence markers such as `(1)` or `(12)`
            and collapse every whitespace run into a single space

    Returns:
        Dict[str, List[str]]: `context` strings of the form "<paragraph> <question>"
    """
    contexts: List[str] = []
    for paragraph, question in zip(examples["paragraph"], examples["question"]):
        if clean_paragraphs:
            without_markers = re.sub(r"\(\d{1,2}\)", "", paragraph)
            paragraph = re.sub(r"\s+", " ", without_markers.strip())
        contexts.append(f"{paragraph} {question}")
    return {"context": contexts}


def preprocess_wsc(dataset: Dataset) -> Dict[str, List[str]]:
    """Builds answer strings out of the two WSC spans.

    Args:
        dataset: a batch with `span1_text` and `span2_text` columns

    Returns:
        Dict[str, List[str]]: `answer` strings made of the capitalized second span
        followed by the first span
    """
    pairs = zip(dataset["span1_text"], dataset["span2_text"])
    return {"answer": [f"{second.capitalize()} {first}" for first, second in pairs]}
@register('imdb_reader')
class ImdbReader(DatasetReader):
    """This class downloads and reads the IMDb sentiment classification dataset.

    https://ai.stanford.edu/~amaas/data/sentiment/

    Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts.
    (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association
    for Computational Linguistics (ACL 2011).
    """

    def read(self, data_path: str, url: Optional[str] = None,
             *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Downloads the dataset if needed and reads the `train` and `test` parts.

        Args:
            data_path: A path to a folder with dataset files.
            url: A url to the archive with the dataset to download if the data folder
                is not marked as done.

        Returns:
            A dictionary with ``train`` and ``test`` keys; every item is a
            ``(text, [label])`` pair where label is ``"neg"`` or ``"pos"``.

        Raises:
            RuntimeError: if a label folder is missing or a data part has no files.
        """
        data_path = Path(data_path)

        if url is None:
            url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

        if not is_done(data_path):
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        # the archive unpacks into an `aclImdb` sub-folder
        alternative_data_path = data_path / "aclImdb"
        if alternative_data_path.exists():
            data_path = alternative_data_path

        data = {"train": [], "test": []}
        for data_type, samples in data.items():
            for label in ["neg", "pos"]:
                labelpath = data_path / data_type / label
                if not labelpath.exists():
                    raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
                # glob order depends on the file system; sort to keep the data reproducible
                for filename in sorted(labelpath.glob("*.txt")):
                    samples.append((filename.read_text(encoding='utf-8'), [label]))

            if not samples:
                raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                                   "probably data dirs are empty")

        return data
@register('line_reader')
class LineReader(DatasetReader):
    """Read txt file by lines"""

    def read(self, data_path: str = None, *args, **kwargs) -> Dict:
        """Read lines from txt file

        Args:
            data_path: path to txt file

        Returns:
            A dictionary containing training, validation and test parts of the dataset obtainable via
            ``train``, ``valid`` and ``test`` keys. All lines go to ``train`` as one-element tuples
            (line endings are kept); ``valid`` and ``test`` are empty.

        Raises:
            TypeError: if ``data_path`` is not provided.
        """
        if data_path is None:
            raise TypeError("data_path must be provided for LineReader")

        # explicit encoding: the platform default is not guaranteed to be UTF-8
        with open(data_path, encoding='utf-8') as f:
            train = [(line,) for line in f]

        return {"train": train, "valid": [], "test": []}
WORD_COLUMN, POS_COLUMN, TAG_COLUMN = 1, 3, 5
HEAD_COLUMN, DEP_COLUMN = 6, 7

log = getLogger(__name__)


def get_language(filepath: str) -> str:
    """Extracts language from typical UD filename (the part before the first ``-``)"""
    return filepath.split("-")[0]


def read_infile(infile: Union[Path, str], *, from_words=False,
                word_column: int = WORD_COLUMN, pos_column: int = POS_COLUMN,
                tag_column: int = TAG_COLUMN, head_column: int = HEAD_COLUMN,
                dep_column: int = DEP_COLUMN, max_sents: int = -1,
                read_only_words: bool = False, read_syntax: bool = False) -> List[Tuple[List, Union[List, None]]]:
    """Reads input file in CONLL-U format

    Args:
        infile: a path to a file, or ``sys.stdin`` to read from the standard input
        from_words: the input has one word per line in the first column; implies
            ``word_column=0`` and ``read_only_words=True``
        word_column: column containing words (default=1)
        pos_column: column containing part-of-speech labels (default=3)
        tag_column: column containing fine-grained tags (default=5)
        head_column: column containing syntactic head position (default=6)
        dep_column: column containing syntactic dependency label (default=7)
        max_sents: maximal number of sentences to read
        read_only_words: whether to read only words
        read_syntax: whether to return ``heads`` and ``deps`` alongside ``tags``. Ignored if read_only_words is ``True``

    Returns:
        a list of sentences. Each item contains a word sequence and an output sequence.
        The output sentence is ``None``, if ``read_only_words`` is ``True``,
        a single list of word tags if ``read_syntax`` is False,
        and a list of the form [``tags``, ``heads``, ``deps``] in case ``read_syntax`` is ``True``.
    """
    if from_words:
        word_column, read_only_words = 0, True

    def pack(words: List, tags: List, heads: List, deps: List) -> Tuple[List, Union[List, None]]:
        """Builds the output item for one finished sentence."""
        if read_only_words:
            return words, None
        if read_syntax:
            return words, [tags, heads, deps]
        return words, tags

    answer = []
    words, tags, heads, deps = [], [], [], []
    from_stdin = infile is sys.stdin
    fin = sys.stdin if from_stdin else open(infile, "r", encoding="utf8")
    try:
        for line in fin:
            line = line.strip()
            if line.startswith("#"):
                continue
            if line == "":
                # an empty line terminates the current sentence
                if words:
                    answer.append(pack(words, tags, heads, deps))
                words, tags, heads, deps = [], [], [], []
                if len(answer) == max_sents:
                    break
                continue
            splitted = line.split("\t")
            # rows with a non-numeric index (e.g. ranges like `1-2`) are skipped
            if not from_words and not splitted[0].isdigit():
                continue
            words.append(splitted[word_column])
            if not read_only_words:
                pos, tag = splitted[pos_column], splitted[tag_column]
                tags.append(pos if tag == "_" else "{},{}".format(pos, tag))
                if read_syntax:
                    heads.append(int(splitted[head_column]))
                    deps.append(splitted[dep_column])
        # the last sentence may not be followed by an empty line
        if words:
            answer.append(pack(words, tags, heads, deps))
    finally:
        # close the file even when a malformed row raises
        if not from_stdin:
            fin.close()
    return answer
@register('morphotagger_dataset_reader')
class MorphotaggerDatasetReader(DatasetReader):
    """Class to read training datasets in UD format"""

    URL = 'http://files.deeppavlov.ai/datasets/UD2.0_source/'

    def read(self, data_path: Union[List, str],
             language: Optional[str] = None,
             data_types: Optional[List[str]] = None,
             **kwargs) -> Dict[str, List]:
        """Reads UD dataset from data_path.

        Args:
            data_path: can be either
                1. a directory containing files. The file for data_type 'mode'
                is then data_path / {language}-ud-{mode}.conllu
                (data_path / {language} / {language}-ud-{mode}.conllu is tried as a fallback)
                2. a path to a single file (only with a single data type)
                3. a list of files, containing the same number of items as data_types
            language: a language to detect filename when it is not given
            data_types: which dataset parts among 'train', 'dev', 'test' are returned;
                a single name may be passed as a string. Defaults to ['train', 'dev'].
            **kwargs: passed to ``read_infile``

        Returns:
            a dictionary containing dataset fragments (see ``read_infile``) for given data types
            ('dev' is returned under the 'valid' key)

        Raises:
            ValueError: on an unknown data type, a directory passed without ``language``
                or a mismatch between the numbers of files and data types
        """
        if data_types is None:
            data_types = ["train", "dev"]
        elif isinstance(data_types, str):
            # wrap, don't split: list("train") would yield single characters
            data_types = [data_types]
        for data_type in data_types:
            if data_type not in ["train", "dev", "test"]:
                raise ValueError("Unknown data_type: {}, only train, dev and test "
                                 "datatypes are allowed".format(data_type))

        if isinstance(data_path, str):
            data_path = Path(data_path)
        if isinstance(data_path, Path):
            if data_path.exists():
                is_file = data_path.is_file()
            else:
                # a missing path is treated as a file when a single part is requested
                is_file = (len(data_types) == 1)
            if is_file:
                # path to a single file
                data_path, reserve_data_path = [data_path], None
            else:
                # path to data directory
                if language is None:
                    raise ValueError("You must explicitly provide language "
                                     "when providing data directory as source")
                data_dir = data_path
                data_path = [data_dir / "{}-ud-{}.conllu".format(language, mode)
                             for mode in data_types]
                reserve_data_path = [data_dir / language / "{}-ud-{}.conllu".format(language, mode)
                                     for mode in data_types]
        else:
            data_path = [Path(filepath) for filepath in data_path]
            reserve_data_path = None

        if len(data_path) != len(data_types):
            raise ValueError("The number of input files in data_path and data types "
                             "in data_types must be equal")

        has_missing_files = any(not filepath.exists() for filepath in data_path)
        if has_missing_files and reserve_data_path is not None:
            has_missing_files = any(not filepath.exists() for filepath in reserve_data_path)
            if not has_missing_files:
                data_path = reserve_data_path

        if has_missing_files:
            # Files are downloaded from the Web repository
            dir_path = data_path[0].parent
            language = language or get_language(data_path[0].parts[-1])
            url = self.URL + "{}.tar.gz".format(language)
            log.info('[downloading data from {} to {}]'.format(url, dir_path))
            dir_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, dir_path)
            mark_done(dir_path)

        data = {}
        for mode, filepath in zip(data_types, data_path):
            if mode == "dev":
                mode = "valid"
            data[mode] = read_infile(filepath, **kwargs)
        return data
@register('multitask_reader')
class MultiTaskReader(DatasetReader):
    """Class to read several datasets simultaneously."""

    def read(self, tasks: Dict[str, Dict[str, dict]], task_defaults: dict = None, **kwargs):
        """Creates dataset readers for tasks and returns what task dataset readers `read()` methods return.

        Args:
            tasks: dictionary which keys are task names and values are dictionaries with param name - value pairs for
                nested dataset readers initialization. If task has key-value pair ``'use_task_defaults': False``,
                task_defaults for this task dataset reader will be ignored. Every task config (after the defaults
                are applied) must contain ``class_name`` of the reader to use.
            task_defaults: default task parameters.

        Returns:
            dictionary which keys are task names and values are what task readers `read()` methods returned.
        """
        data = dict()
        if task_defaults is None:
            task_defaults = dict()
        for task_name, task_params in tasks.items():
            # work on a copy: popping service keys must not alter the caller's config
            task_params = dict(task_params)
            if task_params.pop('use_task_defaults', True) is True:
                task_config = copy.deepcopy(task_defaults)
                task_config.update(task_params)
            else:
                task_config = task_params
            reader = get_model(task_config.pop('class_name'))()
            data[task_name] = reader.read(**task_config)
        return data
@register('odqa_reader')
class ODQADataReader(DatasetReader):
    """Build a SQLite database from folder with txt files, json files or
    Wiki Extractor files.
    """

    def read(self, data_path: Union[Path, str], db_url: Optional[str] = None, *args,
             **kwargs) -> None:
        """Build a SQLite database from provided files, download SQLite database from a provided URL,
         or do nothing.

        Args:
            data_path: a directory/file with texts to create a database from
            db_url: path to a database url
            kwargs:
                save_path: a path where a database should be saved to, or path to a ready database
                dataset_format: initial data format; should be selected from ['txt', 'wiki', 'json']

        Returns:
            None

        Raises:
            ConfigError: if ``save_path`` or ``dataset_format`` is not provided

        """
        logger.info('Reading files...')
        try:
            save_path = expand_path(kwargs['save_path'])
        except KeyError:
            raise ConfigError(
                f'"save_path" attribute should be set for {self.__class__.__name__} in the JSON config.')
        # a database with a `.done` marker next to it is considered ready
        if save_path.exists() and save_path.with_suffix(f'{save_path.suffix}.done').exists():
            return
        try:
            dataset_format = kwargs['dataset_format']
        except KeyError:
            raise ConfigError(
                f'"dataset_format" attribute should be set for {self.__class__.__name__} in the JSON config.')

        save_path.parent.mkdir(parents=True, exist_ok=True)

        if db_url:
            download_dir = save_path.parent
            logger.info(f'Downloading database from {db_url} to {download_dir}')
            download(download_dir, db_url, force_download=False)
            return

        self._build_db(save_path, dataset_format, expand_path(data_path))

    def iter_files(self, path: Union[Path, str]) -> Generator[Path, Any, Any]:
        """Iterate over folder with files or a single file and generate file paths.

        Args:
            path: path to a folder or a file

        Raises:
            RuntimeError if the provided `path` doesn't exist

        Yields:
            file paths one by one (folders are walked recursively)

        Returns:
            None

        """
        path = Path(path)
        if path.is_file():
            yield path
        elif path.is_dir():
            for item in path.iterdir():
                yield from self.iter_files(item)
        else:
            raise RuntimeError("Path doesn't exist: {}".format(path))

    def _build_db(self, save_path: Union[Path, str], dataset_format: str,
                  data_path: Union[Path, str],
                  num_workers: int = 8) -> None:
        """Build a SQLite database in parallel and save it to a pointed path.

        An existing database at `save_path` (and its `.done` marker) is replaced.
        The `.done` marker is created only after the database is committed.

        Args:
            save_path: a path where the ready database should be saved
            dataset_format: a data format, should be selected from ['txt', 'json', 'wiki']
            data_path: path to a folder/file from which to build a database
            num_workers: a number of workers for parallel database building

        Raises:
            sqlite3.OperationalError if `save_path` doesn't exist.
            RuntimeError if dataset_format is not in ['txt', 'json', 'wiki']

        Returns:
            None

        """
        parsers = {'txt': self._get_file_contents,
                   'json': self._get_json_contents,
                   'wiki': self._get_wiki_contents}
        # validate the format before touching an existing database or spawning workers
        if dataset_format not in parsers:
            raise RuntimeError('Unknown dataset format.')
        fn = parsers[dataset_format]

        save_path = Path(save_path)
        done_path = save_path.with_suffix(f'{save_path.suffix}.done')

        if save_path.exists():
            save_path.unlink()
        if done_path.exists():
            done_path.unlink()

        logger.info('Building the database...')

        try:
            conn = sqlite3.connect(str(save_path))
        except sqlite3.OperationalError as e:
            e.args = e.args + ("Check that DB path exists.",)
            raise e

        try:
            c = conn.cursor()
            c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

            files = list(self.iter_files(data_path))
            # the pool is shut down and the progress bar closed even if parsing fails
            with Pool(num_workers) as workers, tqdm(total=len(files)) as pbar:
                for data in workers.imap_unordered(fn, files):
                    try:
                        c.executemany("INSERT INTO documents VALUES (?,?)", data)
                    except sqlite3.IntegrityError as e:
                        # duplicate document ids are reported and skipped
                        logger.warning(e)
                    pbar.update()
            conn.commit()
        finally:
            conn.close()
        done_path.touch()

    @staticmethod
    def _get_file_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:
        """Extract file contents from '.txt' file.

        Args:
            fpath: path to a '.txt' file.

        Returns:
             a list with a single tuple of the file name and NFD-normalized file contents

        """
        with open(fpath, encoding='utf-8') as fin:
            text = fin.read()
        normalized_text = unicodedata.normalize('NFD', text)
        return [(Path(fpath).name, normalized_text)]

    @staticmethod
    def _get_json_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:
        """Extract file contents from '.json' file. Every line of the file should be a JSON list
        of dicts which contain 'title' and 'text' keys.

        Args:
            fpath: path to a '.json' file.

        Returns:
            a list with tuples of document title and NFD-normalized document text

        """
        docs = []
        with open(fpath, encoding='utf-8') as fin:
            for line in fin:
                for doc in json.loads(line):
                    if not doc:
                        continue
                    normalized_text = unicodedata.normalize('NFD', doc['text'])
                    docs.append((doc['title'], normalized_text))
        return docs

    @staticmethod
    def _get_wiki_contents(fpath: Union[Path, str]) -> List[Tuple[str, str]]:
        """Extract file contents from wiki extractor formatted files (one JSON document per line).

        Args:
            fpath: path to a '.txt' file in wiki extractor format

        Returns:
            a list with tuples of document title and NFD-normalized document text

        """
        docs = []
        with open(fpath, encoding='utf-8') as fin:
            for line in fin:
                doc = json.loads(line)
                if not doc:
                    continue
                normalized_text = unicodedata.normalize('NFD', doc['text'])
                docs.append((doc['title'], normalized_text))
        return docs
@register('paraphraser_reader')
class ParaphraserReader(DatasetReader):
    """Reader for the paraphraser.ru corpus stored as XML files.

    See https://paraphraser.ru for the dataset description.
    """

    def read(self,
             data_path: str,
             do_lower_case: bool = True,
             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        """Load the train (`paraphrases.xml`) and test (`paraphrases_gold.xml`) parts.

        Args:
            data_path: A path to a folder with dataset files.
            do_lower_case: Whether to lowercase both texts of every pair

        Returns:
            A dictionary with ``train`` and ``test`` lists of ``((text_1, text_2), label)``
            items and an empty ``valid`` list.
        """
        folder = expand_path(data_path)
        return {
            "train": self._build_data(folder / 'paraphrases.xml', do_lower_case),
            "valid": [],
            "test": self._build_data(folder / 'paraphrases_gold.xml', do_lower_case),
        }

    @staticmethod
    def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:
        """Parse one XML file into ``((text_1, text_2), label)`` items.

        The label is ``1`` for a non-negative paraphrase class and ``0`` otherwise.
        A repeated text pair keeps its first position and takes the last seen label.
        """

        def field(node, name: str) -> str:
            return node.find('value[@name="{}"]'.format(name)).text

        corpus = ET.fromstring(data_path.read_text(encoding='utf8'))
        labeled_pairs = {}
        for node in corpus.findall('corpus/paraphrase'):
            pair = (field(node, "text_1"), field(node, "text_2"))
            if do_lower_case:
                pair = tuple(text.lower() for text in pair)
            if int(field(node, "class")) >= 0:
                labeled_pairs[pair] = 1
            else:
                labeled_pairs[pair] = 0
        return [(pair, label) for pair, label in labeled_pairs.items()]
@register('rel_ranking_reader')
class ParaphraserReader(DatasetReader):
    """Reader for XML files in the paraphraser.ru format that keeps the raw class value.

    See https://paraphraser.ru for the format description.
    """

    def read(self,
             data_path: str,
             do_lower_case: bool = True,
             *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        """Load the train (`paraphrases.xml`) and test (`paraphrases_gold.xml`) parts.

        Args:
            data_path: A path to a folder with dataset files.
            do_lower_case: Whether to lowercase both texts of every pair

        Returns:
            A dictionary with ``train`` and ``test`` lists of ``((text_1, text_2), class)``
            items and an empty ``valid`` list.
        """
        folder = expand_path(data_path)
        return {
            "train": self._build_data(folder / 'paraphrases.xml', do_lower_case),
            "valid": [],
            "test": self._build_data(folder / 'paraphrases_gold.xml', do_lower_case),
        }

    @staticmethod
    def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:
        """Parse one XML file into ``((text_1, text_2), class)`` items.

        Every `paraphrase` element produces one item, in document order; the class
        value is converted to ``int`` and otherwise returned as is.
        """

        def field(node, name: str) -> str:
            return node.find('value[@name="{}"]'.format(name)).text

        corpus = ET.fromstring(data_path.read_text(encoding='utf8'))
        samples = []
        for node in corpus.findall('corpus/paraphrase'):
            pair = (field(node, "text_1"), field(node, "text_2"))
            if do_lower_case:
                pair = tuple(text.lower() for text in pair)
            samples.append((pair, int(field(node, "class"))))
        return samples
@register('rured_reader')
class RuREDDatasetReader(DatasetReader):
    """ Class to read the datasets in RuRED format"""

    def read(self, data_path: str, rel2id: Dict = None) -> Dict[str, List[Tuple]]:
        """
        This class processes the RuRED relation extraction dataset
        (http://www.dialog-21.ru/media/5093/gordeevdiplusetal-031.pdf).

        Args:
            data_path: a path to a folder with dataset files (`train.json`, `dev.json`, `test.json`).
            rel2id: a dictionary mapping relation names to relation ids;
                the default RuRED relation dictionary is used if it is not given.
        Returns:
            RuRED output dictionary in the following format:
            {"data_type":
                List[
                    Tuple(
                        List[
                            List[all tokens of the document],
                            List[
                                List[Tuple(start position of entity 1, end position of entity 1)],
                                List[Tuple(start position of entity 2, end position of entity 2)]
                            ],
                            List[str(NER tag of entity 1), str(NER tag of entity 2)]
                        ],
                        List(int(one-hot encoded relation label))
                    )
                ]
            }
            where data_type is one of "train", "valid" and "test".
        """
        data_path = Path(data_path).resolve()

        self.rel2id = rel2id if rel2id else self.add_default_rel_dict()
        self.stat = {}
        self.ner_stat = {}

        raw_data = {}
        for split in ("train", "dev", "test"):
            with open(os.path.join(data_path, f"{split}.json"), encoding='utf-8') as file:
                raw_data[split] = json.load(file)

        train_data, self.stat["train"] = self.process_rured_file(raw_data["train"], num_neg_samples="twice")
        dev_data, self.stat["dev"] = self.process_rured_file(raw_data["dev"], num_neg_samples="equal")
        test_data, self.stat["test"] = self.process_rured_file(raw_data["test"], num_neg_samples="equal")

        return {"train": train_data, "valid": dev_data, "test": test_data}

    def process_rured_file(self, data: List[Dict], num_neg_samples: str) -> Tuple[List, Dict]:
        """
        Processes a RuRED data and returns a DeepPavlov relevant output

        Args:
            data: List of data units
            num_neg_samples: how many negative ("no_relation") samples are kept
                Possible values:
                    - equal: at most one negative sample per positive sample
                    - twice: at most twice as many negative samples as positive ones
                    - none: no negative samples are kept
                If there are fewer negative samples than requested, all of them are kept.
        Returns:
            a list of processed samples and a dictionary with the number of kept samples per relation id
        Raises:
            ValueError: if `num_neg_samples` is not one of the supported values
        """
        processed_samples = []
        neg_samples = []    # list of indices of negative samples
        pos_samples = 0     # counter of positive samples

        for sample in data:
            # record negative sample ids
            if sample["relation"] == "no_relation":
                neg_samples.append(len(processed_samples))
            else:
                pos_samples += 1

            # entity type statistics are accumulated over all processed files
            for ner_type in (sample["subj_type"], sample["obj_type"]):
                self.ner_stat[ner_type] = self.ner_stat.get(ner_type, 0) + 1

            processed_samples.append(
                (
                    [
                        sample["token"],
                        [[(sample["subj_start"], sample["subj_end"])], [(sample["obj_start"], sample["obj_end"])]],
                        [sample["subj_type"], sample["obj_type"]]
                    ],
                    self.label_to_one_hot(self.rel2id[sample["relation"]])
                )
            )

        # filter out some of negative samples if relevant
        if num_neg_samples == "equal":
            num_neg_to_keep = pos_samples
        elif num_neg_samples == "twice":
            num_neg_to_keep = 2 * pos_samples
        elif num_neg_samples == "none":
            num_neg_to_keep = 0
        else:
            raise ValueError("Unknown negative samples amount! Currently available are 'equal', 'twice' and 'none'")

        if num_neg_to_keep == 0:
            neg_to_eliminate = set(neg_samples)
        else:
            # clamp at zero: random.sample rejects a negative sample size
            num_to_eliminate = max(0, len(neg_samples) - num_neg_to_keep)
            neg_to_eliminate = set(random.sample(neg_samples, num_to_eliminate))
        processed_samples = [
            sample for sample_idx, sample in enumerate(processed_samples) if sample_idx not in neg_to_eliminate
        ]

        # collect statistics: number of kept samples per relation id
        stat = {}
        for sample in processed_samples:
            rel = sample[1].index(1)
            stat[rel] = stat.get(rel, 0) + 1

        return processed_samples, stat

    def label_to_one_hot(self, label: int) -> List[int]:
        """ Turn labels to one hot encodings of length len(self.rel2id) """
        relation = [0] * len(self.rel2id)
        relation[label] = 1
        return relation

    @staticmethod
    def add_default_rel_dict():
        """ Creates a default relation to relation id dictionary with RuRED relations """
        return dict(no_relation=0, MEMBER=1, WORKS_AS=2, WORKPLACE=3, OWNERSHIP=4, SUBORDINATE_OF=5,
                    TAKES_PLACE_IN=6, EVENT_TAKES_PART_IN=7, SELLS_TO=8, ALTERNATIVE_NAME=9, HEADQUARTERED_IN=10,
                    PRODUCES=11, ABBREVIATION=12, DATE_DEFUNCT_IN=13, SUBEVENT_OF=14, DATE_FOUNDED_IN=15,
                    DATE_TAKES_PLACE_ON=16, NUMBER_OF_EMPLOYEES_FIRED=17, ORIGINS_FROM=18, ACQUINTANCE_OF=19,
                    PARENT_OF=20, ORGANIZES=21, FOUNDED_BY=22, PLACE_RESIDES_IN=23, BORN_IN=24, AGE_IS=25,
                    RELATIVE=26, NUMBER_OF_EMPLOYEES=27, SIBLING=28, DATE_OF_BIRTH=29)
import json
import pickle
from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.common.file import load_pickle
from deeppavlov.core.common.file import read_json


def _as_type_list(question_types) -> List[str]:
    """Return ``question_types`` as a list, wrapping a bare string such as ``"all"``."""
    if isinstance(question_types, str):
        return [question_types]
    return list(question_types)


@register('sq_reader')
class SQReader(DatasetReader):
    """Class to read training datasets"""

    def read(self, data_path: str, valid_size: int = None):
        """Load a dataset from a ``.pickle`` or ``.json`` file.

        Args:
            data_path: path to the dataset file; the loader is chosen by the file suffix
            valid_size: if truthy, only the first ``valid_size`` items of ``dataset["valid"]`` are kept

        Returns:
            the loaded dataset

        Raises:
            TypeError: if ``data_path`` ends with neither ``.pickle`` nor ``.json``
        """
        path_str = str(data_path)
        if path_str.endswith(".pickle"):
            dataset = load_pickle(data_path)
        elif path_str.endswith(".json"):
            dataset = read_json(data_path)
        else:
            raise TypeError(f'Unsupported file type: {data_path}')
        if valid_size:
            dataset["valid"] = dataset["valid"][:valid_size]
        return dataset


@register('rubq_reader')
class RuBQReader(SQReader):
    """Class to read RuBQ datasets"""

    def read(self, data_path: str, version: str = "2.0", question_types: List[str] = ["all"],
             not_include_question_types: List[str] = None, num_samples: int = -1):
        """Load the dataset and filter/preprocess its ``valid`` and ``test`` parts.

        Args:
            data_path: path to a ``.pickle`` or ``.json`` dataset file
            version: samples whose ``RuBQ_version`` is greater than this value are dropped
            question_types: tags to keep; ``["all"]`` (or the bare string ``"all"``) keeps every tag
            not_include_question_types: samples carrying any of these tags are dropped
            num_samples: if positive, keep only the first ``num_samples`` preprocessed samples

        Returns:
            the dataset with ``valid`` and ``test`` replaced by lists of preprocessed samples
        """
        dataset = super().read(data_path)
        # The default list is only read, never mutated; a bare string is accepted too.
        question_types = _as_type_list(question_types)
        take_all = question_types == ["all"]
        max_version = float(version)
        for data_type in ["valid", "test"]:
            samples = [sample for sample in dataset[data_type]
                       if float(sample["RuBQ_version"]) <= max_version
                       and (take_all or any(tp in sample["tags"] for tp in question_types))]
            if not_include_question_types:
                samples = [sample for sample in samples
                           if all(tp not in sample["tags"] for tp in not_include_question_types)]
            samples = [self.preprocess(sample) for sample in samples]
            if num_samples > 0:
                samples = samples[:num_samples]
            dataset[data_type] = samples
        return dataset

    def preprocess(self, sample):
        """Turn a raw sample into ``[question, [answer_ids, answer_labels, query]]``."""
        question = sample.get("question_text", "")
        answers = sample.get("answers", [])
        # Only the last path component of each value/label is kept.
        answer_ids = [elem.get("value", "").split("/")[-1] for elem in answers]
        answer_labels = [elem.get("label", "").split("/")[-1] for elem in answers]
        query = sample.get("query", "")
        if query is None:
            query = ""
        else:
            # NOTE(review): the second pattern is reconstructed as a double space; the flattened
            # source shows a single space, which would make the call a no-op - confirm.
            query = query.replace("\n", " ").replace("  ", " ")
        return [question, [answer_ids, answer_labels, query]]


@register('lcquad_reader')
class LCQuADReader(SQReader):
    """Class to read LCQuAD dataset"""

    def read(self, data_path: str, question_types: List[str] = "all",
             not_include_question_types: List[str] = None, num_samples: int = -1):
        """Load the dataset and filter/preprocess its ``valid`` and ``test`` parts.

        Args:
            data_path: path to a ``.pickle`` or ``.json`` dataset file
            question_types: ``subgraph`` values to keep; ``"all"`` or ``["all"]`` keeps every sample
            not_include_question_types: samples whose ``subgraph`` is listed here are dropped
            num_samples: if positive, keep only the first ``num_samples`` preprocessed samples

        Returns:
            the dataset with ``valid`` and ``test`` replaced by lists of preprocessed samples
        """
        dataset = super().read(data_path)
        # The default is the bare string "all"; without normalisation it would be iterated
        # character by character and never match, silently dropping every sample.
        question_types = _as_type_list(question_types)
        take_all = question_types == ["all"]
        for data_type in ["valid", "test"]:
            samples = [sample for sample in dataset[data_type]
                       if take_all or sample["subgraph"] in question_types]
            if not_include_question_types:
                samples = [sample for sample in samples
                           if sample["subgraph"] not in not_include_question_types]
            samples = [self.preprocess(sample) for sample in samples]
            if num_samples > 0:
                samples = samples[:num_samples]
            dataset[data_type] = samples
        return dataset

    def preprocess(self, sample):
        """Turn a raw sample into ``[question, [answers, answer_labels, query]]``."""
        question = sample.get("question", "")
        answers = sample.get("answer", [])
        answer_labels = sample.get("answer_label", [])
        query = sample.get("sparql_wikidata", "")
        return [question, [answers, answer_labels, query]]


# ================================================
# FILE: deeppavlov/dataset_readers/squad_dataset_reader.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from pathlib import Path
from typing import Dict, Any, Optional

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress


@register('squad_dataset_reader')
class SquadDatasetReader(DatasetReader):
    """
    Downloads dataset files and prepares train/valid split.

    SQuAD:
    Stanford Question Answering Dataset
    https://rajpurkar.github.io/SQuAD-explorer/

    SQuAD2.0:
    Stanford Question Answering Dataset, version 2.0
    https://rajpurkar.github.io/SQuAD-explorer/

    SberSQuAD:
    Dataset from SDSJ Task B
    https://www.sdsj.ru/ru/contest.html

    MultiSQuAD:
    SQuAD dataset with additional contexts retrieved (by tfidf) from original Wikipedia article.

    MultiSQuADRetr:
    SQuAD dataset with additional contexts retrieved by tfidf document ranker from full Wikipedia.

    """

    url_squad = 'http://files.deeppavlov.ai/datasets/squad-v1.1.tar.gz'
    url_sber_squad = 'http://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz'
    url_multi_squad = 'http://files.deeppavlov.ai/datasets/multiparagraph_squad.tar.gz'
    url_squad2 = 'http://files.deeppavlov.ai/datasets/squad-v2.0.tar.gz'

    def read(self, data_path: str, dataset: Optional[str] = 'SQuAD', url: Optional[str] = None, *args, **kwargs) \
            -> Dict[str, Dict[str, Any]]:
        """

        Args:
            data_path: path to save data
            dataset: default dataset names: ``'SQuAD'``, ``'SQuAD2.0'``, ``'SberSQuAD'`` or ``'MultiSQuAD'``
            url: link to archive with dataset, use url argument if non-default dataset is used

        Returns:
            dataset split on train/valid

        Raises:
            RuntimeError: if ``url`` is not given and `dataset` is not one of these:
                ``'SQuAD'``, ``'SQuAD2.0'``, ``'SberSQuAD'``, ``'MultiSQuAD'``.
        """
        # An explicit url always wins over the built-in archive locations.
        if url is not None:
            self.url = url
        elif dataset == 'SQuAD':
            self.url = self.url_squad
        elif dataset == 'SberSQuAD':
            self.url = self.url_sber_squad
        elif dataset == 'MultiSQuAD':
            self.url = self.url_multi_squad
        elif dataset == 'SQuAD2.0':
            self.url = self.url_squad2
        else:
            raise RuntimeError(f'Dataset {dataset} is unknown')

        data_path = Path(data_path)
        # Only SQuAD2.0 ships v2.0 files; every other choice is expected to provide v1.1 names.
        file_version = 'v2.0' if dataset == "SQuAD2.0" else 'v1.1'
        required_files = [f'{dt}-{file_version}.json' for dt in ['train', 'dev']]
        data_path.mkdir(parents=True, exist_ok=True)

        if not all((data_path / f).exists() for f in required_files):
            download_decompress(self.url, data_path)

        splits = {}
        for f in required_files:
            with data_path.joinpath(f).open('r', encoding='utf8') as fp:
                data = json.load(fp)
            # The "dev" file is exposed as the validation split.
            if f in {'dev-v1.1.json', 'dev-v2.0.json'}:
                splits['valid'] = data
            else:
                splits['train'] = data

        return splits


@register('multi_squad_dataset_reader')
class MultiSquadDatasetReader(DatasetReader):
    """
    Downloads dataset files and prepares train/valid split.

    MultiSQuADRetr:
    Multiparagraph SQuAD dataset with additional contexts retrieved by tfidf document ranker from full En Wikipedia.

    MultiSQuADRuRetr:
    Multiparagraph SberSQuAD dataset with additional contexts retrieved by tfidf document ranker from Ru Wikipedia.

    """

    url_multi_squad_retr = 'http://files.deeppavlov.ai/datasets/multi_squad_retr_enwiki20161221.tar.gz'
    url_multi_squad_ru_retr = 'http://files.deeppavlov.ai/datasets/multi_squad_ru_retr.tar.gz'

    def read(self, data_path: str, dataset: Optional[str] = 'MultiSQuADRetr', url: Optional[str] = None, *args,
             **kwargs) -> Dict[str, Dict[str, Any]]:
        """

        Args:
            data_path: path to save data
            dataset: default dataset names: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``
            url: link to archive with dataset, use url argument if non-default dataset is used

        Returns:
            mapping of ``'train'`` / ``'valid'`` to the paths of the corresponding ``.jsonl`` files

        Raises:
            RuntimeError: if ``url`` is not given and `dataset` is not one of these:
                ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``.
        """
        if url is not None:
            self.url = url
        elif dataset == 'MultiSQuADRetr':
            self.url = self.url_multi_squad_retr
        elif dataset == 'MultiSQuADRuRetr':
            self.url = self.url_multi_squad_ru_retr
        else:
            raise RuntimeError(f'Dataset {dataset} is unknown')

        data_path = Path(data_path)
        required_files = [f'{dt}.jsonl' for dt in ['train', 'dev']]
        # exist_ok avoids the check-then-create race and matches SquadDatasetReader.
        data_path.mkdir(parents=True, exist_ok=True)

        if not all((data_path / f).exists() for f in required_files):
            download_decompress(self.url, data_path)

        splits = {}
        for f in required_files:
            # Files are not parsed here; only their locations are returned.
            if 'dev' in f:
                splits['valid'] = data_path.joinpath(f)
            else:
                splits['train'] = data_path.joinpath(f)

        return splits


# ================================================
# FILE: deeppavlov/dataset_readers/typos_reader.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from logging import getLogger
from pathlib import Path
from typing import Dict, List, Tuple

import requests
from lxml import html

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import is_done, download, mark_done

log = getLogger(__name__)


@register('typos_custom_reader')
class TyposCustom(DatasetReader):
    """Base class for reading spelling corrections dataset files
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Base method that interprets ``data_path`` argument.

        Args:
            data_path: path to the tsv-file containing erroneous and corrected words

        Returns:
            the same path as a :class:`~pathlib.Path` object
        """
        return Path(data_path)

    @classmethod
    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        Args:
            data_path: path that needs to be interpreted with
                :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`
        """
        fname = cls.build(data_path)
        with fname.open(newline='', encoding='utf8') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # The first row is a header; an empty file simply yields no pairs.
            next(reader, None)
            # csv yields [] for blank lines; skip them instead of failing to unpack.
            res = [(mistake, correct) for mistake, correct in (row for row in reader if row)]
        return {'train': res}


@register('typos_wikipedia_reader')
class TyposWikipedia(TyposCustom):
    """Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom`
    that works with English Wikipedia's list of common misspellings
    """

    @staticmethod
    def build(data_path: str) -> Path:
        """Download and parse the common misspellings list from Wikipedia

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file

        Raises:
            requests.HTTPError: if the Wikipedia page cannot be fetched successfully
        """
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            # Fail loudly on an error page instead of parsing it and marking the build as done.
            page.raise_for_status()
            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()
            data = []
            for pair in raw:
                typo, arrow, corrects = pair.strip().partition('->')
                if not arrow:
                    # Blank or otherwise malformed line: nothing to extract.
                    continue
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            log.info('Built')
        return fname


@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
    """Reader with the same ``build``/``read`` layout as
    :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom`
    that works with a Russian misspellings dataset from kartaslov
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Download the misspellings list from the kartaslov GitHub repository

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname

    @staticmethod
    def read(data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        Args:
            data_path: path that needs to be interpreted with
                :meth:`~deeppavlov.dataset_readers.typos_reader.TyposKartaslov.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`
        """
        fname = TyposKartaslov.build(data_path)
        with open(str(fname), newline='', encoding='utf8') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            next(reader, None)  # header row
            # Columns are (correct, mistake, weight); blank rows are skipped.
            res = [(mistake, correct) for correct, mistake, weight in (row for row in reader if row)]
        return {'train': res}
# ================================================
# FILE: deeppavlov/dataset_readers/ubuntu_v2_reader.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from pathlib import Path
from typing import List, Dict, Tuple, Union

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('ubuntu_v2_reader')
class UbuntuV2Reader(DatasetReader):
    """The class to read the Ubuntu V2 dataset from csv files.

    Please, see https://github.com/rkadlec/ubuntu-ranking-dataset-creator.
    """

    def read(self, data_path: str,
             positive_samples=False,
             *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the Ubuntu V2 dataset from csv files.

        Args:
            data_path: A path to a folder with dataset csv files.
            positive_samples: if `True`, only positive context-response pairs will be taken for train

        Returns:
            dict with ``'train'``, ``'valid'`` and ``'test'`` lists read from
            ``train.csv``, ``valid.csv`` and ``test.csv`` inside ``data_path``
        """
        data_path = expand_path(data_path)
        dataset = {'train': None, 'valid': None, 'test': None}
        train_fname = Path(data_path) / 'train.csv'
        valid_fname = Path(data_path) / 'valid.csv'
        test_fname = Path(data_path) / 'test.csv'
        self.positive_samples = positive_samples
        self.sen2int_vocab = {}
        self.classes_vocab_train = {}
        self.classes_vocab_valid = {}
        self.classes_vocab_test = {}
        dataset["train"] = self.preprocess_data_train(train_fname)
        dataset["valid"] = self.preprocess_data_validation(valid_fname)
        dataset["test"] = self.preprocess_data_validation(test_fname)
        return dataset

    def preprocess_data_train(self, train_fname: Union[Path, str]) -> List[Tuple[List[str], int]]:
        """Read the train csv into ``((context, response), label)`` items.

        The first row is treated as a header. With ``self.positive_samples`` set, only pairs
        labelled ``1`` are kept and each one is paired with its running index instead of the label.
        """
        # newline='' is what the csv module expects; the encoding is pinned so that reading
        # does not depend on the platform default.
        with open(train_fname, 'r', newline='', encoding='utf8') as f:
            reader = csv.reader(f)
            next(reader)
            data = [((row[0], row[1]), int(row[2])) for row in reader]
        if self.positive_samples:
            positive_pairs = [pair for pair, label in data if label == 1]
            data = list(zip(positive_pairs, range(len(positive_pairs))))
        return data

    def preprocess_data_validation(self, fname: Union[Path, str]) -> List[Tuple[List[str], int]]:
        """Read a valid/test csv into ``([context, response, ...], 1)`` items.

        The first row is treated as a header; every column after the context is kept as a
        response candidate and each item gets the constant label ``1``.
        """
        with open(fname, 'r', newline='', encoding='utf8') as f:
            reader = csv.reader(f)
            next(reader)
            data = [([row[0]] + row[1:], 1) for row in reader]
        return data


# ================================================
# FILE: deeppavlov/deep.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from logging import getLogger

from deeppavlov.core.commands.infer import interact_model, predict_on_stream
from deeppavlov.core.commands.train import train_evaluate_model_from_config
from deeppavlov.core.common.cross_validation import calc_cv_score
from deeppavlov.core.common.file import find_config
from deeppavlov.download import deep_download
from deeppavlov.utils.pip_wrapper import install_from_config
from deeppavlov.utils.server import start_model_server
from deeppavlov.utils.socket import start_socket_server

log = getLogger(__name__)

# Command line interface of the `deeppavlov` entry point.
parser = argparse.ArgumentParser()

# A tuple (not a set) keeps the order of the choices in usage, help and error messages
# identical between runs; set iteration order of strings is not stable across processes.
parser.add_argument("mode", help="select a mode, train or interact", type=str,
                    choices=('train', 'evaluate', 'interact', 'predict', 'riseapi', 'risesocket',
                             'download', 'install', 'crossval'))
parser.add_argument("config_path", help="path to a pipeline json config", type=str)

parser.add_argument("-e", "--start-epoch-num", dest="start_epoch_num", default=None,
                    help="Start epoch number", type=int)
parser.add_argument("--recursive", action="store_true", help="Train nested configs")

parser.add_argument("-b", "--batch-size", dest="batch_size", default=None, help="inference batch size", type=int)
parser.add_argument("-f", "--input-file", dest="file_path", default=None, help="Path to the input file", type=str)
parser.add_argument("-d", "--download", action="store_true", help="download model components")
parser.add_argument("-i", "--install", action="store_true", help="install model requirements")

parser.add_argument("--folds", help="number of folds", type=int, default=5)
parser.add_argument("--https", action="store_true", default=None, help="run model in https mode")
parser.add_argument("--key", default=None, help="ssl key", type=str)
parser.add_argument("--cert", default=None, help="ssl certificate", type=str)

parser.add_argument("-p", "--port", default=None, help="api port", type=int)

# A tuple keeps the order of the choices in usage/help output stable between runs.
parser.add_argument("--socket-type", default="TCP", type=str, choices=("TCP", "UNIX"))
parser.add_argument("--socket-file", default="/tmp/deeppavlov_socket.s", type=str)


def main():
    """Parse the command line and run the selected mode for the given pipeline config.

    Requirement installation and component download run first when requested, either through
    the ``-i`` / ``-d`` flags or through the ``install`` / ``download`` modes, and can therefore
    be combined with any other mode.
    """
    args = parser.parse_args()
    pipeline_config_path = find_config(args.config_path)

    if args.install or args.mode == 'install':
        install_from_config(pipeline_config_path)
    if args.download or args.mode == 'download':
        deep_download(pipeline_config_path)

    if args.mode == 'train':
        train_evaluate_model_from_config(pipeline_config_path,
                                         recursive=args.recursive,
                                         start_epoch_num=args.start_epoch_num)
    elif args.mode == 'evaluate':
        train_evaluate_model_from_config(pipeline_config_path, to_train=False, start_epoch_num=args.start_epoch_num)
    elif args.mode == 'interact':
        interact_model(pipeline_config_path)
    elif args.mode == 'riseapi':
        start_model_server(pipeline_config_path, args.https, args.key, args.cert, port=args.port)
    elif args.mode == 'risesocket':
        start_socket_server(pipeline_config_path, args.socket_type, port=args.port, socket_file=args.socket_file)
    elif args.mode == 'predict':
        predict_on_stream(pipeline_config_path, args.batch_size, args.file_path)
    elif args.mode == 'crossval':
        # Cross-validation needs at least two folds; fewer is reported and nothing is run.
        if args.folds < 2:
            log.error('Minimum number of Folds is 2')
        else:
            calc_cv_score(pipeline_config_path, n_folds=args.folds, is_loo=False)


if __name__ == "__main__":
    main()


# ================================================
# FILE: deeppavlov/download.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import secrets
import shutil
import sys
from argparse import ArgumentParser, Namespace
from collections import defaultdict
from logging import getLogger
from pathlib import Path
from typing import Union, Optional, Dict, Iterable, Set, Tuple, List
from urllib.parse import urlparse

import requests
from filelock import FileLock

import deeppavlov
from deeppavlov.core.commands.utils import expand_path, parse_config
from deeppavlov.core.data.utils import download, download_decompress, get_all_elems_from_json, file_md5, \
    set_query_parameter, path_set_md5, get_download_token

log = getLogger(__name__)

parser = ArgumentParser()

parser.add_argument('--config', '-c', help="path to a pipeline json config", type=str,
                    default=None)
parser.add_argument('-all', action='store_true',
                    help="Download everything. Warning! There should be at least 10 GB space"
                         " available on disk.")


def get_config_downloads(config: Union[str, Path, dict]) -> Set[Tuple[str, Path]]:
    """Collect ``(url, destination)`` pairs declared by a config and the configs it references.

    Args:
        config: pipeline config (or path to it), parsed with ``parse_config``

    Returns:
        set of ``(url, expanded destination path)`` pairs taken from ``metadata.download`` of
        this config and, recursively, of every config found under a ``config_path`` key
    """
    config = parse_config(config)

    downloads = set()
    if 'metadata' in config and 'download' in config['metadata']:
        for resource in config['metadata']['download']:
            # A bare string is shorthand for {'url': <string>}.
            if isinstance(resource, str):
                resource = {
                    'url': resource
                }

            url = resource['url']
            dest = expand_path(resource.get('subdir', ''))

            downloads.add((url, dest))

    config_references = [expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path')]

    downloads |= {(url, dest) for nested_config in config_references
                  for url, dest in get_config_downloads(nested_config)}

    return downloads


def get_configs_downloads(config: Optional[Union[str, Path, dict]] = None) -> Dict[str, Set[Path]]:
    """Group download destinations by url.

    Args:
        config: a single config to inspect; if falsy, every ``*.json`` under the package's
            ``configs`` directory is inspected

    Returns:
        mapping from url to the set of destination paths requested for it
    """
    all_downloads = defaultdict(set)

    if config:
        configs = [config]
    else:
        configs = list(Path(deeppavlov.__path__[0], 'configs').glob('**/*.json'))

    for config in configs:
        for url, dest in get_config_downloads(config):
            all_downloads[url].add(dest)

    return all_downloads


def check_md5(url: str, dest_paths: List[Path], headers: Optional[dict] = None) -> bool:
    """Check whether files described by the remote md5 listing are already present locally.

    The listing is fetched from ``path_set_md5(url)``. If at least one destination already holds
    every listed file with a matching hash, the files are copied from it into the destinations
    that do not, and ``True`` is returned.

    Args:
        url: url of the resource whose md5 listing should be checked
        dest_paths: candidate local directories
        headers: optional HTTP headers for the listing request

    Returns:
        ``True`` if all destinations hold verified data after the call, ``False`` if the listing
        is unavailable, empty or malformed, or if no destination matches it
    """
    url_md5 = path_set_md5(url)

    try:
        if url_md5.startswith('s3://'):
            import boto3

            s3 = boto3.resource('s3')
            bucket, key = url_md5[5:].split('/', maxsplit=1)
            obj = s3.Object(bucket, key)
            data = obj.get()['Body'].read().decode('utf8')
        else:
            r = requests.get(url_md5, headers=headers)
            if r.status_code != 200:
                return False
            data = r.text
    except Exception as e:
        # Best effort: any failure to obtain the listing just means "cannot verify".
        log.debug(f'Could not download {url_md5} because of an exception {type(e)}: {e}')
        return False

    expected = {}
    for line in data.splitlines():
        if not line.strip():
            continue
        _md5, _, fname = line.partition(' ')
        # The character right after the separating space is the md5sum mode marker:
        # '*' for binary mode, ' ' for text mode.
        if not fname or fname[0] != '*':
            if fname and fname[0] == ' ':
                log.warning(f'Hash generated in text mode for {fname}, comparison could be incorrect')
            else:
                log.error(f'Unknown hash content format in {url_md5}')
                return False
        expected[fname[1:]] = _md5

    if not expected:
        # Nothing to compare against: an empty listing must not count as a match.
        return False

    done = None
    not_done = []
    for base_path in dest_paths:
        if all(file_md5(base_path / p) == _md5 for p, _md5 in expected.items()):
            done = base_path
        else:
            not_done.append(base_path)

    if done is None:
        return False

    for base_path in not_done:
        log.info(f'Copying data from {done} to {base_path}')
        for p in expected.keys():
            target = base_path / p
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(done / p, target)
    return True


def download_resource(url: str, dest_paths: Iterable[Union[Path, str]], headers: Optional[dict] = None) -> None:
    """Download one resource into every destination unless matching data is already there.

    Args:
        url: url of the resource
        dest_paths: destination directories; the parent of the first one is used as the
            download directory and holds the lock file
        headers: optional HTTP headers passed to the md5 check and to the download helpers
    """
    dest_paths = [Path(dest) for dest in dest_paths]
    download_path = dest_paths[0].parent
    download_path.mkdir(parents=True, exist_ok=True)
    file_name = urlparse(url).path.split('/')[-1]
    lockfile = download_path / f'.{file_name}.lock'

    # The lock keeps concurrent processes from downloading the same file at the same time.
    # NOTE(review): `poll_intervall` is the keyword spelling of older filelock releases; newer
    # releases name it `poll_interval` - confirm against the pinned filelock version.
    with FileLock(lockfile).acquire(poll_intervall=10):
        if check_md5(url, dest_paths, headers):
            log.info(f'Skipped {url} download because of matching hashes')
        elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):
            download_decompress(url, download_path, dest_paths, headers=headers)
        else:
            dest_files = [dest_path / file_name for dest_path in dest_paths]
            download(dest_files, url, headers=headers)


def download_resources(args: Namespace) -> None:
    """Download everything requested by the parsed command line arguments.

    Exits with status 1 if neither a config path nor the ``-all`` flag was given.
    """
    if not args.all and not args.config:
        log.error('You should provide either model config path or -all flag')
        sys.exit(1)
    elif args.all:
        downloads = get_configs_downloads()
    else:
        config_path = Path(args.config).resolve()
        downloads = get_configs_downloads(config=config_path)

    for url, dest_paths in downloads.items():
        download_resource(url, dest_paths)


def deep_download(config: Union[str, Path, dict]) -> None:
    """Download every resource of a config, sending DeepPavlov request headers.

    Args:
        config: pipeline config or path to it
    """
    downloads = get_configs_downloads(config)
    last_id = len(downloads) - 1
    session_id = secrets.token_urlsafe(32)

    for file_id, (url, dest_paths) in enumerate(downloads.items()):
        headers = {
            'dp-token': get_download_token(),
            'dp-session': session_id,
            # Counts down, so the last file of the session carries id 0.
            'dp-file-id': str(last_id - file_id),
            'dp-version': deeppavlov.__version__
        }
        # The config name is attached as a query parameter for non-s3 urls when it is known.
        if not url.startswith('s3://') and not isinstance(config, dict):
            url = set_query_parameter(url, 'config', Path(config).stem)
        download_resource(url, dest_paths, headers)


def main(args: Optional[List[str]] = None) -> None:
    """Command line entry point: parse ``args`` and download the requested resources."""
    args = parser.parse_args(args)
    log.info("Downloading...")
    download_resources(args)
    log.info("\nDownload successful!")


if __name__ == "__main__":
    main()


# ================================================
# FILE: deeppavlov/metrics/__init__.py
# ================================================

# ================================================
# FILE: deeppavlov/metrics/accuracy.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import re
from logging import getLogger
from typing import List

import numpy as np

from deeppavlov.core.common.metrics_registry import register_metric

log = getLogger(__name__)


@register_metric('accuracy')
def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:
    """
    Calculate accuracy in terms of absolute coincidence

    Args:
        y_true: array of true values
        y_predicted: array of predicted values

    Returns:
        fraction of absolutely coincidental samples, ``0`` for empty ``y_true``
    """
    examples_len = len(y_true)

    # if y1 and y2 are both arrays, == can be erroneously interpreted as element-wise equality

    def _are_equal(y1, y2):
        answer = (y1 == y2)
        if isinstance(answer, np.ndarray):
            answer = answer.all()
        return answer

    equalities = [_are_equal(y1, y2) for y1, y2 in zip(y_true, y_predicted)]
    correct = sum(equalities)
    return correct / examples_len if examples_len else 0


@register_metric('kbqa_accuracy')
def kbqa_accuracy(questions_batch, pred_answer_labels_batch, pred_answer_ids_batch, pred_query_batch,
                  gold_answer_labels_batch, gold_answer_ids_batch, gold_query_batch) -> float:
    """Fraction of questions whose predicted answer is accepted as correct.

    A prediction is accepted when any of the following holds: the sets of predicted and gold
    answer ids are equal; the gold query is contained in the predicted query; the 3-4 digit
    groups of the first predicted and first gold id coincide (and are non-empty); the predicted
    label equals the single gold label; or the prediction is ``"Not Found"`` and there are no
    gold ids.

    Returns:
        accepted / number of predictions, ``0`` for an empty batch
    """
    num_samples = len(pred_answer_ids_batch)
    correct = 0
    for question, pred_answer_label, pred_answer_ids, pred_query, gold_answer_labels, gold_answer_ids, gold_query in \
            zip(questions_batch, pred_answer_labels_batch, pred_answer_ids_batch, pred_query_batch,
                gold_answer_labels_batch, gold_answer_ids_batch, gold_query_batch):
        found_date = False
        if pred_answer_ids and gold_answer_ids:
            pred_digits = re.findall(r"[\d]{3,4}", pred_answer_ids[0])
            if pred_digits and pred_digits == re.findall(r"[\d]{3,4}", gold_answer_ids[0]):
                found_date = True
        found_label = False
        if len(gold_answer_labels) == 1 and len(pred_answer_label) > 1 and pred_answer_label == gold_answer_labels[0]:
            found_label = True
        no_answer = False
        if pred_answer_label == "Not Found" and not gold_answer_ids:
            no_answer = True
        if set(pred_answer_ids) == set(gold_answer_ids) or gold_query in pred_query or found_date or found_label \
                or no_answer:
            correct += 1
        # NOTE(review): logged once per question; placement reconstructed from flattened source.
        log.debug(f"question: {question} -- gold_answer_ids: {gold_answer_ids} -- pred_answer_ids: {pred_answer_ids}")

    return correct / num_samples if num_samples else 0


@register_metric('multitask_accuracy')
def multitask_accuracy(*args) -> float:
    """
    Accuracy for multiple simultaneous tasks.

    Args:
        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks,
            and the last `n` are the predicted ones.

    Returns:
        The mean of the per-task accuracies, ``0`` when no tasks are given.
    """
    n = len(args)
    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]
    answers = [accuracy(true, pred) for true, pred in zip(y_true_by_tasks, y_predicted_by_tasks)]
    # Same empty-input convention as the other metrics instead of a ZeroDivisionError.
    return sum(answers) / len(answers) if answers else 0


@register_metric('multitask_sequence_accuracy')
def multitask_sequence_accuracy(*args) -> float:
    """
    Accuracy for multiple simultaneous sequence labeling (tagging) tasks.
    For each sequence the model checks whether all its elements
    are labeled correctly for all the individual taggers.

    Args:
        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks,
            and the last `n` are the predicted ones.

    Returns:
        The percentage of sequences where all the items has correct answers for all `n` tasks.

    """
    n = len(args)
    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]
    # Regroup from per-task lists of sentences to per-sentence lists of per-token label tuples.
    y_true_by_sents = list(zip(*y_true_by_tasks))
    y_predicted_by_sents = list(zip(*y_predicted_by_tasks))
    y_true = list(list(zip(*elem)) for elem in y_true_by_sents)
    y_predicted = list(list(zip(*elem)) for elem in y_predicted_by_sents)
    return accuracy(y_true, y_predicted)


@register_metric('multitask_token_accuracy')
def multitask_token_accuracy(*args) -> float:
    """
    Per-item accuracy for multiple simultaneous sequence labeling (tagging) tasks.

    Args:
        *args: a list of `2n` inputs. The first `n` inputs are the correct answers for `n` tasks
            and the last `n` are the predicted ones.

    Returns:
        The percentage of sequence elements for which the answers for all `n` tasks are correct.

    """
    n = len(args)
    y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]
    y_true_by_sents = list(zip(*y_true_by_tasks))
    y_predicted_by_sents = list(zip(*y_predicted_by_tasks))
    y_true = list(list(zip(*elem)) for elem in y_true_by_sents)
    y_predicted = list(list(zip(*elem)) for elem in y_predicted_by_sents)
    return per_token_accuracy(y_true, y_predicted)


@register_metric('sets_accuracy')
def sets_accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:
    """
    Calculate accuracy in terms of sets coincidence

    Args:
        y_true: true values
        y_predicted: predicted values

    Returns:
        portion of samples with absolutely coincidental sets of predicted values

    Alias:
        sets_accuracy
    """
    examples_len = len(y_true)
    correct = sum([set(y1) == set(y2) for y1, y2 in zip(y_true, y_predicted)])
    return correct / examples_len if examples_len else 0


@register_metric('slots_accuracy')
def slots_accuracy(y_true, y_predicted):
    """Accuracy over the sets of slot names per sample.

    True slot names are the parts after the last ``'-'`` of every tag other than ``'O'``;
    predicted slot names are the keys of each predicted mapping.
    """
    y_true = [{tag.split('-')[-1] for tag in s if tag != 'O'} for s in y_true]
    y_predicted = [set(s.keys()) for s in y_predicted]
    return accuracy(y_true, y_predicted)


@register_metric('per_token_accuracy')
def per_token_accuracy(y_true, y_predicted):
    """Accuracy over the flattened items of all sequences, ``0`` when there are no true items."""
    y_true = list(itertools.chain(*y_true))
    y_predicted = itertools.chain(*y_predicted)
    examples_len = len(y_true)
    correct = sum([y1 == y2 for y1, y2 in zip(y_true, y_predicted)])
    return correct / examples_len if examples_len else 0


# region go-bot metrics

@register_metric('per_item_dialog_accuracy')
def per_item_dialog_accuracy(y_true, y_predicted: List[List[str]]):
    """Case- and surrounding-whitespace-insensitive accuracy over all dialog turns.

    True texts are taken from the ``'text'`` field of every item of every dialog.
    """
    # todo metric classes???
    y_true = [y['text'] for dialog in y_true for y in dialog]
    y_predicted = itertools.chain(*y_predicted)
    examples_len = len(y_true)
    correct = sum([y1.strip().lower() == y2.strip().lower() for y1, y2 in zip(y_true, y_predicted)])
    return correct / examples_len if examples_len else 0


@register_metric('acc')
def round_accuracy(y_true, y_predicted):
    """
    Rounds predictions and calculates accuracy in terms of absolute coincidence.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values

    Returns:
        portion of absolutely coincidental samples, ``0`` when there is nothing to compare
    """
    # Without predictions nothing can match; also avoids indexing into an empty sequence below.
    if len(y_predicted) == 0:
        return 0
    if isinstance(y_predicted[0], np.ndarray):
        predictions = [np.round(x) for x in y_predicted]
    else:
        predictions = [round(x) for x in y_predicted]
    examples_len = len(y_true)
    correct = sum([y1 == y2 for y1, y2 in zip(y_true, predictions)])
    return correct / examples_len if examples_len else 0


# ================================================
# FILE: deeppavlov/metrics/bleu.py
# ================================================
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@register_metric('bleu_advanced')
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate sentence-level BLEU, optionally with the brevity penalty divided out.

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: if ``True`` the score is returned as computed; otherwise it is
            divided by the brevity penalty (unless that penalty is zero)

    Return:
        BLEU score
    """
    references = [y_true]
    score = sentence_bleu(references, y_predicted, weights, smoothing_function, auto_reweigh)

    hypothesis_len = len(y_predicted)
    reference_len = closest_ref_length(references, hypothesis_len)
    bp = brevity_penalty(reference_len, hypothesis_len)

    # A zero penalty cannot be divided out, so the raw score is kept in that case.
    keep_score_as_is = penalty is True or bp == 0
    if not keep_score_as_is:
        return score / bp
    return score
@register_metric('pearson_correlation')
def pearson_correlation(y_true, y_predicted) -> float:
    """
    Pearson correlation between predictions and true values.

    Args:
        y_true: true values
        y_predicted: predicted values

    Returns:
        first element of the ``scipy.stats.pearsonr`` result
    """
    pearson_result = pearsonr(y_predicted, y_true)
    return pearson_result[0]
@register_metric('elmo_loss2ppl')
def elmo_loss2ppl(losses: List[np.ndarray]) -> float:
    """ Converts model losses to perplexity: the exponent of the mean loss.

    Args:
        losses: list of numpy arrays of model losses

    Returns:
        perplexity : float
    """
    mean_loss = np.mean(losses)
    perplexity = np.exp(mean_loss)
    return float(perplexity)
@register_metric('ner_token_f1')
def ner_token_f1(y_true, y_predicted, print_results=False):
    """
    Token-level F1 for Named Entity Recognition that ignores BIO / BIOES markup.

    Every tag is reduced to the part after its last ``-`` and statistics are
    counted per token for each entity type except ``O``.

    Args:
        y_true: list of true tag sequences
        y_predicted: list of predicted tag sequences
        print_results: if True, the per-entity report is written to the debug log

    Returns:
        F1 score taken from the ``__total__`` entry of the statistics

    Alias:
        ner_token_f1
    """
    gold_tags = list(chain(*y_true))
    pred_tags = list(chain(*y_predicted))

    # Drop BIO or BIOES markup
    assert all(len(tag.split('-')) <= 2 for tag in gold_tags)
    gold_tags = [tag.split('-')[-1] for tag in gold_tags]
    pred_tags = [tag.split('-')[-1] for tag in pred_tags]

    # Map every tag to an integer id so the counts can be done on numpy arrays.
    tag_to_ind = {tag: n for n, tag in enumerate(set(gold_tags) | set(pred_tags))}
    gold_inds = np.array([tag_to_ind[tag] for tag in gold_tags])
    pred_inds = np.array([tag_to_ind[tag] for tag in pred_tags])

    results = {}
    for tag, tag_ind in tag_to_ind.items():
        if tag == 'O':
            continue
        is_gold = gold_inds == tag_ind
        is_pred = pred_inds == tag_ind

        tp = np.sum(is_gold & is_pred)
        fn = np.sum(is_gold & (pred_inds != tag_ind))
        fp = np.sum((gold_inds != tag_ind) & is_pred)
        n_pred = np.sum(is_pred)
        n_true = np.sum(is_gold)

        precision = tp / (tp + fp) * 100 if tp + fp > 0 else 0
        recall = tp / (tp + fn) * 100 if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        results[tag] = {'precision': precision, 'recall': recall,
                        'f1': f1, 'n_true': n_true, 'n_pred': n_pred,
                        'tp': tp, 'fp': fp, 'fn': fn}

    stats = _global_stats_f1(results)
    results['__total__'], accuracy, total_true_entities, total_predicted_entities, total_correct = stats
    n_tokens = len(gold_tags)
    if print_results:
        log.debug('TOKEN LEVEL F1')
        _print_conll_report(results, accuracy, total_true_entities, total_predicted_entities, n_tokens,
                            total_correct)
    return results['__total__']['f1']
@register_metric('f1')
def round_f1(y_true, y_predicted):
    """
    Calculates F1 (binary) measure.

    Numeric predictions are rounded first. If rounding fails with ``TypeError``
    and every label is the string ``"True"`` or ``"False"``, both sides are
    converted to booleans instead.

    Args:
        y_true: list of true values
        y_predicted: list of predicted values

    Returns:
        F1 score

    Raises:
        RuntimeError: if predictions cannot be rounded and the labels are not
            limited to the strings ``"True"`` / ``"False"``

    Alias:
        f1
    """
    try:
        predictions = [np.round(x) for x in y_predicted]
    except TypeError:
        if set(y_true) | set(y_predicted) in ({"True"}, {"False"}, {"False", "True"}):
            y_true = [y == "True" for y in y_true]
            predictions = [y == "True" for y in y_predicted]
        else:
            # `predictions` is not bound on this path, so report the raw input;
            # formatting `predictions` here raised UnboundLocalError instead.
            raise RuntimeError(f"Unexpectible type for {y_true} and {y_predicted}")

    return f1_score(y_true, predictions)
def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False, entity_of_interest=None):
    """Compute chunk-level precision, recall and F1 for every entity type.

    A chunk counts as correct only if its (start, end) token span is present in
    both the ground truth and the prediction for the same entity type.

    Args:
        y_true: flat sequence of true tags; every tag other than ``'O'`` has its
            first two characters stripped to obtain the entity type
        y_pred: flat sequence of predicted tags, indexed in parallel with ``y_true``
        print_results: if True, a textual report is written to the debug log
        short_report: if True, per-entity lines are left out of the report
        entity_of_interest: if given, restricts the per-entity lines of the report
            to entity types containing this string (or, in a short report, to
            exactly this entity type)

    Returns:
        OrderedDict mapping every entity type and ``'__total__'`` to its statistics
    """
    # Find all tags
    tags = set()
    for tag in itertools.chain(y_true, y_pred):
        if tag != 'O':
            tags.add(tag[2:])
    tags = sorted(tags)

    results = OrderedDict()
    for tag in tags:
        results[tag] = OrderedDict()
    results['__total__'] = OrderedDict()
    n_tokens = len(y_true)

    # The string conversion does not depend on the tag, so do it once.
    if tags:
        y_true = [str(y) for y in y_true]
        y_pred = [str(y) for y in y_pred]

    # Firstly we find all chunks in the ground truth and prediction
    # For each chunk we write starting and ending indices
    for tag in tags:
        true_chunk = []
        pred_chunk = []
        prev_tag_true = 'O'
        prev_tag_pred = 'O'
        for count in range(n_tokens):
            yt = y_true[count]
            yp = y_pred[count]

            create_chunk_true, pop_out_true = chunk_finder(yt, prev_tag_true, tag)
            if pop_out_true:
                true_chunk[-1] = (true_chunk[-1], count - 1)
            if create_chunk_true:
                true_chunk.append(count)

            create_chunk_pred, pop_out_pred = chunk_finder(yp, prev_tag_pred, tag)
            if pop_out_pred:
                pred_chunk[-1] = (pred_chunk[-1], count - 1)
            if create_chunk_pred:
                pred_chunk.append(count)

            prev_tag_true = yt
            prev_tag_pred = yp

        # A chunk still open at the end of the sequence is closed on the last token.
        if len(true_chunk) > 0 and not isinstance(true_chunk[-1], tuple):
            true_chunk[-1] = (true_chunk[-1], n_tokens - 1)
        if len(pred_chunk) > 0 and not isinstance(pred_chunk[-1], tuple):
            pred_chunk[-1] = (pred_chunk[-1], n_tokens - 1)

        # Then we find all correctly classified intervals
        # True positive results
        tp = len(set(pred_chunk).intersection(set(true_chunk)))
        # And then just calculate errors of the first and second kind
        # False negative
        fn = len(true_chunk) - tp
        # False positive
        fp = len(pred_chunk) - tp

        precision = tp / (tp + fp) * 100 if tp + fp > 0 else 0
        recall = tp / (tp + fn) * 100 if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        results[tag]['precision'] = precision
        results[tag]['recall'] = recall
        results[tag]['f1'] = f1
        results[tag]['n_pred'] = len(pred_chunk)
        results[tag]['n_true'] = len(true_chunk)
        results[tag]['tp'] = tp
        results[tag]['fn'] = fn
        results[tag]['fp'] = fp

    # The last returned value is the number of correct chunks; it used to be
    # unpacked into `accuracy` a second time, so the report always said 0.
    results['__total__'], _accuracy, total_true_entities, total_predicted_entities, total_correct = \
        _global_stats_f1(results)
    results['__total__']['n_pred'] = total_predicted_entities
    results['__total__']['n_true'] = total_true_entities

    if print_results:
        def tag_line(name):
            # One report line with the statistics stored under `name`.
            stats = results[name]
            return '\t' + name + ': precision: {tot_prec:.2f}%; ' \
                                 'recall: {tot_recall:.2f}%; ' \
                                 'F1: {tot_f1:.2f} ' \
                                 '{tot_predicted}\n\n'.format(tot_prec=stats['precision'],
                                                              tot_recall=stats['recall'],
                                                              tot_f1=stats['f1'],
                                                              tot_predicted=stats['n_pred'])

        s = 'processed {len} tokens ' \
            'with {tot_true} phrases; ' \
            'found: {tot_pred} phrases;' \
            ' correct: {tot_cor}.\n\n'.format(len=n_tokens,
                                              tot_true=total_true_entities,
                                              tot_pred=total_predicted_entities,
                                              tot_cor=total_correct)

        s += 'precision:  {tot_prec:.2f}%; ' \
             'recall:  {tot_recall:.2f}%; ' \
             'FB1:  {tot_f1:.2f}\n\n'.format(tot_prec=results['__total__']['precision'],
                                             tot_recall=results['__total__']['recall'],
                                             tot_f1=results['__total__']['f1'])

        if not short_report:
            for tag in tags:
                if entity_of_interest is not None:
                    if entity_of_interest in tag:
                        s += tag_line(tag)
                elif tag != '__total__':
                    s += tag_line(tag)
        elif entity_of_interest is not None:
            s += tag_line(entity_of_interest)
        log.debug(s)
    return results
def _get_ngrams(segment, max_order):
    """Extracts all n-grams upto a given maximum order from an input segment.

    Args:
        segment: text segment from which n-grams will be extracted.
        max_order: maximum length in tokens of the n-grams returned by this
            methods.

    Returns:
        The Counter containing all n-grams upto max_order in segment
        with a count of how many times each n-gram occurred.
    """
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i + order])
            ngram_counts[ngram] += 1
    return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
    """Computes BLEU score of translated segments against one or more references.

    Args:
        reference_corpus: list of lists of references for each translation. Each
            reference should be tokenized into a list of tokens.
        translation_corpus: list of translations to score. Each translation
            should be tokenized into a list of tokens.
        max_order: Maximum n-gram order to use when computing BLEU score.
        smooth: Whether or not to apply Lin et al. 2004 smoothing.

    Returns:
        6-Tuple ``(bleu, precisions, bp, ratio, translation_length,
        reference_length)``: the BLEU score, the list of n-gram precisions,
        the brevity penalty, the translation/reference length ratio and the
        two accumulated lengths. Empty translations or references yield a
        score instead of raising ``ZeroDivisionError``.
    """
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus,
                                         translation_corpus):
        # A translation without any reference contributes no reference tokens.
        reference_length += min((len(r) for r in references), default=0)
        translation_length += len(translation)

        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    if reference_length > 0:
        ratio = float(translation_length) / reference_length
    else:
        # Nothing to compare the length against: any output is "longer" than
        # the references, no output at all has ratio 0.
        ratio = float('inf') if translation_length > 0 else 0.0

    if ratio > 1.0:
        bp = 1.
    elif ratio > 0:
        bp = math.exp(1 - 1. / ratio)
    else:
        # Limit of exp(1 - 1 / ratio) for ratio -> 0.
        bp = 0.

    bleu = geo_mean * bp

    return (bleu, precisions, bp, ratio, translation_length, reference_length)
def recall_at_k(y_true: List[int], y_pred: List[List[np.ndarray]], k: int):
    """
    Calculates recall at k ranking metric.

    Args:
        y_true: Labels. Not used in the calculation of the metric.
        y_pred: Predictions. Each prediction contains ranking score of all ranking candidates for
            the particular data sample. It is supposed that the ranking score for the true candidate
            goes first in the prediction.
        k: number of top-scored candidates among which the true candidate is looked for.

    Returns:
        Recall at k; 0.0 if there are no predictions
    """
    num_examples = len(y_pred)
    if num_examples == 0:
        # Without samples there is nothing to rank (and no second axis to slice).
        return 0.0

    scores = np.array(y_pred)
    # Candidate indices ordered from the highest to the lowest score, cut to the top k.
    top_k = np.flip(np.argsort(scores, -1), -1)[:, :k]
    num_correct = sum(1 for candidates in top_k if 0 in candidates)
    return float(num_correct) / float(num_examples)
@register_metric("record_em_score")
def record_em_score(record_examples: List[RecordNestedExample]):
    """Calculate Exact Match score for given nested ReCoRD examples

    For every example the best exact-match value over its answers is taken;
    examples without answers are skipped.

    Args:
        record_examples: processed ReCoRD examples

    Returns:
        float: Exact Match score; 0. for an empty input and -1 if no example
        has any answer
    """
    if not record_examples:
        return 0.
    em_scores = []
    for example in record_examples:
        # Exact match must be scored with `exact_match_score`; the token-level
        # F1 helper was called here before, so this metric reported F1 values.
        example_ems = [exact_match_score(example.prediction, answer) for answer in example.answers]
        if example_ems:
            em_scores.append(max(example_ems))
    return np.mean(em_scores) if em_scores else -1
@register_metric('roc_auc')
def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray],
                  y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:
    """
    Macro-averaged Area Under the ROC Curve computed from prediction scores.

    Both inputs are converted to arrays and squeezed before being passed to
    ``sklearn.metrics.roc_auc_score``.

    Args:
        y_true: true binary labels
        y_pred: target scores, can either be probability estimates of the positive class

    Returns:
        Area Under the Curve (AUC) from prediction scores, or 0. if a
        ``ValueError`` is raised while computing it

    Alias:
        roc_auc
    """
    try:
        labels = np.squeeze(np.array(y_true))
        scores = np.squeeze(np.array(y_pred))
        auc = sklearn.metrics.roc_auc_score(labels, scores, average="macro")
    except ValueError:
        auc = 0.
    return auc
@register_metric('squad_v2_em')
def squad_v2_exact_match(y_true: List[List[str]], y_predicted: List[str]) -> float:
    """ Exact Match score between y_true and y_predicted, as in SQuAD-v2.0

    A prediction scores 1 if, after normalization, it equals at least one of
    the normalized correct answers of its example, and 0 otherwise.

    Args:
        y_true: list of correct answers (correct answers are represented by list of strings)
        y_predicted: list of predicted answers

    Returns:
        exact match score : float
    """
    total = len(y_true)
    if total == 0:
        return 0

    hits = 0
    for ground_truth, prediction in zip(y_true, y_predicted):
        normalized_prediction = normalize_answer(prediction)
        if any(normalize_answer(gt) == normalized_prediction for gt in ground_truth):
            hits += 1
    return 100 * hits / total
@register_metric('squad_v2_f1')
def squad_v2_f1(y_true: List[List[str]], y_predicted: List[str]) -> float:
    """ Calculates F-1 score between y_true and y_predicted
        F-1 score uses the best matching y_true answer

    The same as in SQuAD-v2.0

    Args:
        y_true: list of correct answers (correct answers are represented by list of strings)
        y_predicted: list of predicted answers

    Returns:
        F-1 score : float
    """
    f1_total = 0.0
    for ground_truth, prediction in zip(y_true, y_predicted):
        prediction_tokens = normalize_answer(prediction).split()
        prediction_counts = Counter(prediction_tokens)
        f1s = []
        for gt in ground_truth:
            gt_tokens = normalize_answer(gt).split()
            if len(gt_tokens) == 0 or len(prediction_tokens) == 0:
                # With an empty side the score is 1 only if both sides are empty.
                f1s.append(float(gt_tokens == prediction_tokens))
                continue
            common = prediction_counts & Counter(gt_tokens)
            num_same = sum(common.values())
            if num_same == 0:
                f1s.append(0.0)
                continue
            precision = 1.0 * num_same / len(prediction_tokens)
            recall = 1.0 * num_same / len(gt_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            f1s.append(f1)
        # An example with an empty answer list scores 0 instead of making
        # `max` raise ValueError on an empty sequence.
        f1_total += max(f1s, default=0.0)
    return 100 * f1_total / len(y_true) if len(y_true) > 0 else 0
def normalize_answer(s: str) -> str:
    """Lower-case the text, then strip punctuation, English articles and extra whitespace."""
    text = s.lower()

    # Drop every ASCII punctuation character.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)

    # Replace stand-alone articles with a space; the final split/join collapses it.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)

    return ' '.join(text.split())
@register('api_requester')
class ApiRequester(Component):
    """Component for forwarding parameters to APIs

    Args:
        url: url of the API.
        out: count of expected returned values or their names in a chainer.
        param_names: list of parameter names for API requests.
        debatchify: if ``True``, single instances will be sent to the API endpoint instead of batches.

    Attributes:
        url: url of the API.
        out_count: count of expected returned values.
        param_names: list of parameter names for API requests.
        debatchify: if True, single instances will be sent to the API endpoint instead of batches.
    """

    def __init__(self, url: str, out: [int, list], param_names: [list, tuple] = None, debatchify: bool = False,
                 *args, **kwargs):
        self.url = url
        if param_names is None:
            # fall back to the ``in`` keyword argument when no explicit names are given
            param_names = kwargs.get('in', ())
        self.param_names = param_names
        self.out_count = out if isinstance(out, int) else len(out)
        self.debatchify = debatchify

    def __call__(self, *args: List[Any], **kwargs: Dict[str, Any]):
        """

        Args:
            *args: list of parameters sent to the API endpoint. Parameter names are taken from self.param_names.
            **kwargs: named parameters to send to the API endpoint. If not empty, args are ignored

        Returns:
            result of the API request(s)

        Raises:
            ValueError: if ``debatchify`` is set and there is no data or the first parameter is an empty batch
        """
        data = kwargs or dict(zip(self.param_names, args))

        if self.debatchify:
            # the batch size is taken from the first parameter only
            batch_size = len(next(iter(data.values()))) if data else 0
            if batch_size <= 0:
                raise ValueError('ApiRequester with debatchify=True requires a non-empty batch of parameters')

            async def collect():
                return [j async for j in self.get_async_response(data, batch_size)]

            # A dedicated loop is used instead of asyncio.get_event_loop(): the latter raises in threads
            # that have no current event loop and is deprecated outside of a running loop.
            loop = asyncio.new_event_loop()
            try:
                response = loop.run_until_complete(collect())
            finally:
                loop.close()

            if self.out_count > 1:
                # regroup per-sample results into per-output sequences
                response = list(zip(*response))
        else:
            response = requests.post(self.url, json=data).json()

        return response

    async def get_async_response(self, data: dict, batch_size: int) -> AsyncIterable:
        """Helper function for sending requests asynchronously if the API endpoint does not support batching

        Args:
            data: data to be passed to the API endpoint
            batch_size: requests count

        Yields:
            requests results parsed as json
        """
        loop = asyncio.get_event_loop()
        # positional arguments of requests.post are (url, data, json): the payload is sent as json
        futures = [
            loop.run_in_executor(
                None,
                requests.post,
                self.url,
                None,
                {k: v[i] for k, v in data.items()}
            )
            for i in range(batch_size)
        ]
        for r in await asyncio.gather(*futures):
            yield r.json()
@register("api_router")
class ApiRouter(Component):
    """Runs several API requesters on the same arguments in parallel subprocesses

    Args:
        api_requesters: list of ApiRequester objects
        n_workers: The maximum number of subprocesses to run

    Attributes:
        api_requesters: list of ApiRequester objects
        n_workers: The maximum number of subprocesses to run
    """

    def __init__(self, api_requesters: List[ApiRequester], n_workers: int = 1, *args, **kwargs):
        self.n_workers = n_workers
        self.api_requesters = api_requesters

    def __call__(self, *args):
        """

        Args:
            *args: list of arguments to forward to the API requesters

        Returns:
            results of the requests, in the order of ``api_requesters``; a requester
            with ``out_count > 1`` contributes each of its outputs as a separate item
        """
        with ProcessPoolExecutor(self.n_workers) as pool:
            pending = []
            for requester in self.api_requesters:
                pending.append(pool.submit(requester, *args))

            concurrent.futures.wait(pending)

            collected = []
            for requester, finished in zip(self.api_requesters, pending):
                outcome = finished.result()
                if requester.out_count > 1:
                    collected.extend(outcome)
                else:
                    collected.append(outcome)

        return collected
@register("cos_sim_classifier")
class CosineSimilarityClassifier(Estimator, Serializable):
    """
    Classifier based on cosine similarity between vectorized sentences

    Parameters:
        top_n: number of best labels returned for every question
        save_path: path to save the model
        load_path: path to load the model
    """

    def __init__(self, top_n: int = 1, save_path: str = None, load_path: str = None, **kwargs) -> None:
        super().__init__(save_path=save_path, load_path=load_path, **kwargs)

        self.top_n = top_n

        self.x_train_features = self.y_train = None

        # a missing ``mode`` is treated like any non-train mode instead of raising KeyError
        if kwargs.get('mode') != 'train':
            self.load()

    def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[int]]:
        """Found most similar answer for input vectorized question

        Parameters:
            q_vects: vectorized questions

        Returns:
            Tuple of Answer and Score: both are flat lists holding ``top_n`` items per question,
            best label first; scores are label scores normalized per question and rounded to 2 digits

        Raises:
            NotImplementedError: if the vectors are neither sparse matrices, numpy arrays nor ``None``
        """

        if isinstance(q_vects[0], csr_matrix):
            q_norm = sparse_norm(q_vects)
            if q_norm == 0.0:
                cos_similarities = np.zeros((q_vects.shape[0], self.x_train_features.shape[0]))
            else:
                norm = q_norm * sparse_norm(self.x_train_features, axis=1)
                cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense())
                cos_similarities = cos_similarities / norm
        elif isinstance(q_vects[0], np.ndarray):
            q_vects = np.array(q_vects)
            self.x_train_features = np.array(self.x_train_features)
            norm = np.linalg.norm(q_vects) * np.linalg.norm(self.x_train_features, axis=1)
            cos_similarities = q_vects.dot(self.x_train_features.T) / norm
        elif q_vects[0] is None:
            # One row of zeros per question, one column per train sample: the code below indexes
            # the similarities as a 2-D (questions x train samples) array.
            cos_similarities = np.zeros((len(q_vects), len(self.y_train)))
        else:
            raise NotImplementedError('Not implemented this type of vectors')

        # get cosine similarity for each class
        y_labels = np.unique(self.y_train)
        labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
        for i, label in enumerate(y_labels):
            # score of a label is the best similarity among the train samples carrying that label
            labels_scores[:, i] = np.max([cos_similarities[:, j]
                                          for j, value in enumerate(self.y_train)
                                          if value == label], axis=0)

        # normalize scores per question; rows that sum to zero stay all-zero
        labels_scores_sum = labels_scores.sum(axis=1, keepdims=True)
        labels_scores = np.divide(labels_scores, labels_scores_sum,
                                  out=np.zeros_like(labels_scores), where=(labels_scores_sum != 0))

        answer_ids = np.argsort(labels_scores)[:, -self.top_n:]

        # generate top_n answers and scores
        answers = []
        scores = []
        for i in range(len(answer_ids)):
            best_first = answer_ids[i, ::-1]
            answers.extend([y_labels[idx] for idx in best_first])
            scores.extend([np.round(labels_scores[i, idx], 2) for idx in best_first])

        return answers, scores

    def fit(self, x_train_vects: Tuple[Union[csr_matrix, List]], y_train: Tuple[str]) -> None:
        """Train classifier

        Parameters:
            x_train_vects: vectorized question for train dataset
            y_train: answers for train dataset

        Returns:
            None

        Raises:
            ValueError: if an empty tuple of train vectors is given
            NotImplementedError: if the tuple holds neither sparse matrices nor numpy arrays
        """
        if not isinstance(x_train_vects, tuple):
            # anything that is not a tuple of batches is stored as is
            self.x_train_features = x_train_vects
        elif len(x_train_vects) == 0:
            raise ValueError("Train vectors can't be empty")
        elif isinstance(x_train_vects[0], csr_matrix):
            self.x_train_features = vstack(list(x_train_vects))
        elif isinstance(x_train_vects[0], np.ndarray):
            self.x_train_features = np.vstack(list(x_train_vects))
        else:
            raise NotImplementedError('Not implemented this type of vectors')

        self.y_train = list(y_train)

    def save(self) -> None:
        """Save classifier parameters"""
        logger.info("Saving faq_model to {}".format(self.save_path))
        save_pickle((self.x_train_features, self.y_train), self.save_path)

    def load(self) -> None:
        """Load classifier parameters"""
        logger.debug("Loading faq_model from {}".format(self.load_path))
        # NOTE(review): load_pickle presumably unpickles the file; only load models from trusted sources
        self.x_train_features, self.y_train = load_pickle(self.load_path)
@register('dnnc_proba2labels')
class Proba2Labels(Component):
    """
    Converts pairwise similarity scores into class label

    Args:
        confidence_threshold: used to determine whether example belongs to one of the classes in 'y_support' or not
        pooling: strategy for pooling similarity scores of each label, either ``'avg'`` or ``'max'``
        is_binary: determines whether the similarity is a number or a probability vector
    """

    def __init__(self,
                 confidence_threshold: float = 0.0,
                 pooling: str = 'max',
                 is_binary: bool = True,
                 **kwargs) -> None:

        self.confidence_threshold = confidence_threshold
        self.pooling = pooling
        self.is_binary = is_binary

    def __call__(self,
                 simmilarity_scores: List[float],
                 x: List[str],
                 x_populated: List[str],
                 x_support: List[str],
                 y_support: List[str]
                 ) -> List[str]:
        """Predict a label for every example of ``x``.

        Args:
            simmilarity_scores: one score (or probability vector if ``is_binary`` is False) per pair
            x: examples to classify
            x_populated: first elements of the scored pairs
            x_support: second elements of the scored pairs
            y_support: labels of the scored pairs

        Returns:
            for every example the label with the highest pooled score,
            or ``"oos"`` if that score is below ``confidence_threshold``

        Raises:
            ValueError: if ``pooling`` is neither ``'avg'`` nor ``'max'``
        """
        # resolve the pooling function once; an unknown strategy used to leave the pooled score unset
        if self.pooling == 'avg':
            pool = np.mean
        elif self.pooling == 'max':
            pool = np.max
        else:
            raise ValueError(f"Unsupported pooling strategy {self.pooling!r}: expected 'avg' or 'max'")

        y_pred = []

        simmilarity_scores = np.array(simmilarity_scores)
        x_populated = np.array(x_populated)
        x_support = np.array(x_support)
        y_support = np.array(y_support)

        unique_labels = np.unique(y_support)

        # Transform probability vector into a similarity score (second column is used)
        if not self.is_binary:
            simmilarity_scores = simmilarity_scores[:, 1]

        for example in x:
            # pairs in which the example appears on exactly one side
            example_mask = np.where(np.logical_xor(x_populated == example, x_support == example))
            example_simmilarity_scores = simmilarity_scores[example_mask]
            example_y_support = y_support[example_mask]

            probability_by_label = np.array([
                pool(example_simmilarity_scores[np.where(example_y_support == label)])
                for label in unique_labels
            ])

            max_probability = max(probability_by_label)
            max_probability_label = unique_labels[np.argmax(probability_by_label)]
            prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label

            y_pred.append(prediction)

        return y_pred
max_proba: bool = None, confidence_threshold: float = None, top_n: int = None, is_binary: bool = False, **kwargs) -> None: """ Initialize class with given parameters""" self.max_proba = max_proba self.confidence_threshold = confidence_threshold self.top_n = top_n self.is_binary = is_binary def __call__(self, *args, **kwargs): """ Process probabilities to labels Args: Every argument is a list of vectors with probability distribution Returns: list of labels (only label classification) or list of lists of labels (multi-label classification), or list of the following lists (in multitask setting) for every argument """ answer = [] log.debug(f'input {args}') for data in args: if all([k is None for k in data]): answer.append([]) elif self.confidence_threshold: if self.is_binary: answer.append([int(el > self.confidence_threshold) for el in data]) else: answer.append([list(np.where(np.array(d) > self.confidence_threshold)[0]) for d in data]) elif self.max_proba: answer.append([np.argmax(d) for d in data]) elif self.top_n: answer.append([np.argsort(d)[::-1][:self.top_n] for d in data]) else: raise ConfigError("Proba2Labels requires one of three arguments: bool `max_proba` or " "float `confidence_threshold` for multi-label classification or" "integer `top_n` for choosing several labels with the highest probabilities") if len(args) == 1: # only one argument answer = answer[0] log.debug(f'output {answer}') return answer ================================================ FILE: deeppavlov/models/classifiers/re_bert.py ================================================ import logging from pathlib import Path from typing import Tuple, Union, Any, List import torch from torch import Tensor import torch.nn as nn from opt_einsum import contract from transformers import AutoConfig, BertModel, BertTokenizer from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.errors import ConfigError from deeppavlov.models.relation_extraction.losses import ATLoss log = 
class BertWithAdaThresholdLocContextPooling(nn.Module):
    """BERT-based relation classifier for a pair of entities.

    For every sample the module takes embeddings of the first two entities, an attention-weighted
    context vector built from the product of their attention maps, appends NER tag vectors, and scores
    the pair with a block-wise bilinear layer. Labels and the loss are produced by ``ATLoss``.

    Args:
        n_classes: number of relation classes
        pretrained_bert: name or path of the pretrained BERT model
        bert_tokenizer_config_file: path to a vocabulary file for the tokenizer; if it is not an existing file,
            the tokenizer is loaded from ``pretrained_bert``
        bert_config_file: path to a BERT json config, used only when ``pretrained_bert`` is not given
        emb_size: size of the head / tail entity representations
        block_size: block size of the bilinear layer
        num_ner_tags: length of the NER tag vector appended to every entity representation
        threshold: threshold passed to ``ATLoss.get_label`` when no labels are given
        device: ``"gpu"`` to use CUDA when it is available, anything else for CPU
    """

    def __init__(
            self,
            n_classes: int = 97,
            pretrained_bert: str = None,
            bert_tokenizer_config_file: str = None,
            bert_config_file: str = None,
            emb_size: int = 768,
            block_size: int = 8,  # 64
            num_ner_tags: int = 6,  # number of ner tags
            threshold: float = None,
            device: str = "gpu"
    ):
        super().__init__()
        self.n_classes = n_classes
        self.pretrained_bert = pretrained_bert
        self.bert_config_file = bert_config_file
        self.num_ner_tags = num_ner_tags
        self.emb_size = emb_size
        self.block_size = block_size
        self.threshold = threshold
        self.loss_fnt = ATLoss()
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")

        # initialize parameters that would be filled later
        self.model, self.config, self.bert_config = None, None, None
        self.load()

        # initialize tokenizer to call resize_token_embeddings function for model with increased tokenizer size (due to
        # the additional token) and get CLS and SEP token ids
        # The tokenizer is bound in both branches; a missing (None) vocab path falls back to the pretrained one.
        if bert_tokenizer_config_file and Path(bert_tokenizer_config_file).is_file():
            vocab_file = str(expand_path(bert_tokenizer_config_file))
            tokenizer = BertTokenizer(vocab_file=vocab_file)
        else:
            tokenizer = BertTokenizer.from_pretrained(pretrained_bert)
        self.tokenizer = tokenizer
        self.model.resize_token_embeddings(len(tokenizer) + 1)
        self.cls_token_id = tokenizer.cls_token_id
        self.sep_token_id = tokenizer.sep_token_id

        self.hidden_size = self.config.hidden_size

        self.head_extractor = nn.Linear(2 * self.hidden_size + self.num_ner_tags, self.emb_size)
        self.tail_extractor = nn.Linear(2 * self.hidden_size + self.num_ner_tags, self.emb_size)
        self.bilinear = nn.Linear(self.emb_size * self.block_size, self.n_classes)

    def forward(
            self, input_ids: Tensor, attention_mask: Tensor, entity_pos: List, ner_tags: List, labels: List = None
    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:
        """Score entity pairs and, if labels are given, compute the loss.

        Args:
            input_ids: token ids of the batch
            attention_mask: attention mask of the batch
            entity_pos: for every sample, a list of entities, each a list of (start, end) mention positions
            ner_tags: for every sample, NER tag vectors of the first and of the second entity
            labels: gold labels; when given, the loss is computed and no fixed threshold is used

        Returns:
            ``(predicted_labels, logits)`` or, when labels are given, ``(loss, predicted_labels, logits)``
        """
        has_labels = labels is not None
        if has_labels:
            curr_threshold = None  # for training: no set threshold but adaptive one
        else:
            curr_threshold = self.threshold  # for development and test: threshold set in config

        output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        sequence_output = output[0]  # Tensor (batch_size x input_length x 768)
        attention = output[-1][-1]  # Tensor (batch_size x 12 x input_length x input_length)

        hs, rs, ts = self.get_hrt(sequence_output, attention, entity_pos)  # Tensors (batch_size x 768)

        # get ner tags of entities
        hs_ner_tags, ts_ner_tags = torch.Tensor([list(ele) for ele in list(zip(*ner_tags))]).to(self.device)

        hs_inp = torch.cat([hs, rs, hs_ner_tags], dim=1)
        ts_inp = torch.cat([ts, rs, ts_ner_tags], dim=1)

        hs = torch.tanh(self.head_extractor(hs_inp))
        ts = torch.tanh(self.tail_extractor(ts_inp))

        # block-wise outer product of head and tail representations
        b1 = hs.view(-1, self.emb_size // self.block_size, self.block_size)
        b2 = ts.view(-1, self.emb_size // self.block_size, self.block_size)
        bl = (b1.unsqueeze(3) * b2.unsqueeze(2)).view(-1, self.emb_size * self.block_size)
        logits = self.bilinear(bl)

        output = (self.loss_fnt.get_label(logits, num_labels=self.n_classes, threshold=curr_threshold), logits)

        if has_labels:
            labels_tensors = [torch.tensor(label) for label in labels]
            labels_tensors = torch.stack(labels_tensors).to(logits)
            loss = self.loss_fnt(logits.float(), labels_tensors.float())
            output = (loss.to(sequence_output),) + output

        return output

    def get_hrt(self, sequence_output: Tensor, attention: Tensor, entity_pos: List) -> Tuple[Tensor, Tensor, Tensor]:
        """Build head, context and tail representations for every sample.

        Args:
            sequence_output: token embeddings returned by the encoder
            attention: attention maps of the last encoder layer
            entity_pos: for every sample, a list of entities, each a list of (start, end) mention positions

        Returns:
            head embeddings, context embeddings and tail embeddings, concatenated over samples
        """
        _, h, _, max_sequence_length = attention.size()
        first_entity = torch.tensor([0]).to(self.device)
        second_entity = torch.tensor([1]).to(self.device)
        hss, tss, rss = [], [], []
        for i, sample_entities in enumerate(entity_pos):  # for each training sample (= doc)
            entity_embs, entity_atts = [], []
            for e in sample_entities:  # for each entity (= list of entity mentions)
                if len(e) == 0:
                    continue
                if len(e) > 1:
                    e_emb, e_att = [], []
                    for start, _end in e:  # for start and end position of each mention
                        # skip the entity pair if the entity mention is truncated due to limited max seq length.
                        if start + 1 < max_sequence_length:
                            e_emb.append(sequence_output[i, start + 1])
                            e_att.append(attention[i, :, start + 1])
                    if len(e_emb) > 0:
                        # pool mentions: log-sum-exp over embeddings, mean over attention maps
                        e_emb = torch.logsumexp(torch.stack(e_emb, dim=0), dim=0)
                        e_att = torch.stack(e_att, dim=0).mean(0)
                    else:
                        e_emb = torch.zeros(self.hidden_size).to(sequence_output)
                        e_att = torch.zeros(h, max_sequence_length).to(attention)
                else:
                    start, _end = e[0]
                    if start + 1 < max_sequence_length:
                        e_emb = sequence_output[i, start + 1]
                        e_att = attention[i, :, start + 1]
                    else:
                        e_emb = torch.zeros(self.hidden_size).to(sequence_output)
                        e_att = torch.zeros(h, max_sequence_length).to(attention)
                entity_embs.append(e_emb)  # get an embedding of an entity
                entity_atts.append(e_att)  # get attention of an entity

            entity_embs = torch.stack(entity_embs, dim=0)  # [n_e, d]       # entity embeddings for each document
            entity_atts = torch.stack(entity_atts, dim=0)  # [n_e, h, seq_len]

            hs = torch.index_select(entity_embs, 0, first_entity)  # embeddings of the first entity
            ts = torch.index_select(entity_embs, 0, second_entity)  # embeddings of the second entity

            h_att = torch.index_select(entity_atts, 0, first_entity)
            t_att = torch.index_select(entity_atts, 0, second_entity)
            ht_att = (h_att * t_att).mean(1)
            ht_att = ht_att / (ht_att.sum(1, keepdim=True) + 1e-5)
            rs = contract("ld,rl->rd", sequence_output[i], ht_att)  # ht_i.shape[0] x sequence_output.shape[2]
            hss.append(hs)
            tss.append(ts)
            rss.append(rs)
        hss = torch.cat(hss, dim=0)
        tss = torch.cat(tss, dim=0)
        rss = torch.cat(rss, dim=0)
        return hss, rss, tss

    def load(self) -> None:
        """Create the BERT encoder from a pretrained model or from a json config and move it to the device.

        Raises:
            ConfigError: if neither a pretrained model nor an existing config file is given
        """
        if self.pretrained_bert:
            log.debug(f"From pretrained {self.pretrained_bert}.")
            self.config = AutoConfig.from_pretrained(
                self.pretrained_bert, num_labels=self.n_classes, output_attentions=True, output_hidden_states=True
            )
            self.model = BertModel.from_pretrained(self.pretrained_bert, config=self.config)

        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file)))
            # forward() reads attention maps from the encoder output, so they are requested
            # here the same way as in the pretrained branch
            self.config.output_attentions = True
            self.config.output_hidden_states = True
            # the model is built from the config loaded above (``self.bert_config`` is never assigned)
            self.model = BertModel(config=self.config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)
Args: n_classes: number of classes kernel_sizes_cnn: list of kernel sizes of convolutions filters_cnn: number of filters for convolutions dense_size: number of units for dense layer dropout_rate: dropout rate, after convolutions and between dense embedding_size: size of vector representation of words multilabel: is multi-label classification (if so, `sigmoid` activation will be used, otherwise, softmax) criterion: criterion name from `torch.nn` embedded_tokens: True, if input contains embedded tokenized texts; False, if input containes indices of words in the vocabulary vocab_size: vocabulary size in case of `embedded_tokens=False`, and embedding is a layer in the Network return_probas: whether to return probabilities or index of classes (only for `multilabel=False`) Attributes: model: torch model itself epochs_done: number of epochs that were done criterion: torch criterion instance """ def __init__(self, n_classes: int, kernel_sizes_cnn: List[int], filters_cnn: int, dense_size: int, dropout_rate: float = 0.0, embedding_size: Optional[int] = None, multilabel: bool = False, criterion: str = "CrossEntropyLoss", embedded_tokens: bool = True, vocab_size: Optional[int] = None, return_probas: bool = True, **kwargs): if n_classes == 0: raise ConfigError("Please, provide vocabulary with considered classes or number of classes.") if multilabel and not return_probas: raise RuntimeError('Set return_probas to True for multilabel classification!') self.multilabel = multilabel self.return_probas = return_probas model = ShallowAndWideCnn( n_classes=n_classes, embedding_size=embedding_size, kernel_sizes_cnn=kernel_sizes_cnn, filters_cnn=filters_cnn, dense_size=dense_size, dropout_rate=dropout_rate, embedded_tokens=embedded_tokens, vocab_size=vocab_size ) self.criterion = getattr(torch.nn, criterion)() super().__init__(model, **kwargs) def __call__(self, texts: List[np.ndarray], *args) -> Union[List[List[float]], List[int]]: """Infer on the given data. 
Args: texts: list of tokenized text samples labels: labels *args: additional arguments Returns: for each sentence: vector of probabilities to belong with each class or list of labels sentence belongs with """ with torch.no_grad(): features = np.array(texts) inputs = torch.from_numpy(features) inputs = inputs.to(self.device) outputs = self.model(inputs) if self.multilabel: outputs = torch.nn.functional.sigmoid(outputs) else: outputs = torch.nn.functional.softmax(outputs, dim=-1) outputs = outputs.cpu().detach().numpy() if self.return_probas: return outputs.tolist() else: return np.argmax(outputs, axis=-1).tolist() def train_on_batch(self, texts: List[List[np.ndarray]], labels: list) -> Union[float, List[float]]: """Train the model on the given batch. Args: texts: vectorized texts labels: list of labels Returns: metrics values on the given batch """ features, labels = np.array(texts), np.array(labels) inputs, labels = torch.from_numpy(features), torch.from_numpy(labels) inputs, labels = inputs.to(self.device), labels.to(self.device) # zero the parameter gradients self.optimizer.zero_grad() # forward + backward + optimize outputs = self.model(inputs) labels = labels.view(-1).long() loss = self.criterion(outputs, labels) self._make_step(loss) return loss.item() ================================================ FILE: deeppavlov/models/classifiers/torch_nets.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class ShallowAndWideCnn(nn.Module):
    """Text classification network: parallel 1-d convolutions followed by two dense layers.

    Every convolution is followed by batch normalization, ReLU and max pooling over the sequence;
    pooled outputs of all convolutions are concatenated and passed through
    dropout -> dense -> ReLU -> dropout -> final dense.

    Args:
        n_classes: size of the output vector
        embedding_size: size of token embeddings (number of input channels of the convolutions)
        kernel_sizes_cnn: kernel sizes of the convolutions; each is also used as the padding of its convolution
        filters_cnn: number of filters, one value for all convolutions or one value per convolution
        dense_size: number of units of the hidden dense layer
        dropout_rate: dropout probability
        embedded_tokens: True if the input holds token embeddings, False if it holds token indices
        vocab_size: size of the vocabulary, required when ``embedded_tokens`` is False

    Raises:
        ValueError: if ``embedded_tokens`` is False and ``vocab_size`` is not given
    """

    def __init__(self, n_classes: int, embedding_size: int, kernel_sizes_cnn: List[int],
                 filters_cnn: Union[int, List[int]], dense_size: int, dropout_rate: float = 0.0,
                 embedded_tokens: bool = True, vocab_size: Optional[int] = None, **kwargs):
        super().__init__()
        self.embedded_tokens = embedded_tokens
        self.kernel_sizes_cnn = kernel_sizes_cnn

        if not embedded_tokens:
            # without an embedding layer forward() cannot process token indices, so fail right away
            if not vocab_size:
                raise ValueError("`vocab_size` must be provided when `embedded_tokens` is False")
            self.embedding = nn.Embedding(vocab_size, embedding_size)
        if isinstance(filters_cnn, int):
            filters_cnn = len(kernel_sizes_cnn) * [filters_cnn]

        # submodule names (conv_i, bn_i, relu_i, pool_i) are part of the state dict and must stay as they are
        for i, (kernel_size, n_filters) in enumerate(zip(kernel_sizes_cnn, filters_cnn)):
            setattr(self, "conv_" + str(i), nn.Conv1d(embedding_size, n_filters, kernel_size,
                                                      padding=kernel_size))
            setattr(self, "bn_" + str(i), nn.BatchNorm1d(n_filters))
            setattr(self, "relu_" + str(i), nn.ReLU())
            setattr(self, "pool_" + str(i), nn.AdaptiveMaxPool1d(1))

        self.dropout = nn.Dropout(dropout_rate)
        self.dense = nn.Linear(sum(filters_cnn), dense_size)
        self.relu_dense = nn.ReLU()
        self.final_dense = nn.Linear(dense_size, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute class scores for a batch.

        Args:
            x: token embeddings of shape [batch_size, number of tokens, embedding_size] or,
                if ``embedded_tokens`` is False, token indices of shape [batch_size, number of tokens]

        Returns:
            tensor of shape [batch_size, n_classes]; no softmax or sigmoid is applied here
        """
        # number of tokens is variable
        if not self.embedded_tokens:
            # x of shape [batch_size, number of tokens]
            features = self.embedding(x)
        else:
            # x of shape [batch_size, number of tokens, embedding_size]
            features = x
        # features of [batch size, embedding size, number of tokens]
        features = features.permute(0, 2, 1)

        pooled = []
        for i in range(len(self.kernel_sizes_cnn)):
            # convolutional input should be of shape [batch_size, embedding_size, number of tokens]
            branch = getattr(self, "conv_" + str(i))(features)
            branch = getattr(self, "bn_" + str(i))(branch)
            branch = getattr(self, "relu_" + str(i))(branch)
            branch = getattr(self, "pool_" + str(i))(branch)
            # branch of shape [batch_size, out]
            pooled.append(branch.squeeze(-1))

        output = torch.cat(pooled, dim=-1)
        output = self.dropout(output)
        output = self.dense(output)
        output = self.relu_dense(output)
        output = self.dropout(output)
        output = self.final_dense(output)
        return output
def labels2onehot(labels: [List[str], List[List[str]], np.ndarray], classes: [list, np.ndarray]) -> np.ndarray:
    """
    Convert labels to one-hot vectors for multi-class multi-label classification

    Args:
        labels: list of samples where each sample is a class or a list of classes which sample belongs with
        classes: array of classes' names

    Returns:
        2d array with one-hot representation of given samples
    """
    # Built once: the original rebuilt this array for every label of every sample.
    classes_arr = np.asarray(classes)
    n_classes = len(classes)
    y = []
    for sample in labels:
        curr = np.zeros(n_classes)
        if isinstance(sample, list):
            for intent in sample:
                if intent not in classes:
                    log.warning('Unknown label {} detected. Assigning no class'.format(intent))
                else:
                    curr[np.where(classes_arr == intent)[0]] = 1
        else:
            # A single label; an unknown one matches nothing and leaves the row all zeros.
            curr[np.where(classes_arr == sample)[0]] = 1
        y.append(curr)
    return np.asarray(y)


def proba2labels(proba: [list, np.ndarray], confidence_threshold: float, classes: [list, np.ndarray]) -> List[List]:
    """
    Convert vectors of probabilities to labels using a confidence threshold.

    A sample is assigned every class whose probability is strictly greater than
    ``confidence_threshold``; if no class passes the threshold, the sample is assigned
    the single class with the highest probability.

    Args:
        proba: list of samples where each sample is a vector of probabilities to belong with given classes;
            samples may be numpy arrays or plain Python sequences
        confidence_threshold (float): boundary of probability to belong with a class
        classes: array of classes' names

    Returns:
        list of lists of labels for each sample
    """
    classes_arr = np.asarray(classes)
    y = []
    for sample in proba:
        # Plain lists do not support ``list > float``; convert so both input kinds work.
        sample = np.asarray(sample)
        to_add = np.where(sample > confidence_threshold)[0]
        if len(to_add) > 0:
            y.append(classes_arr[to_add].tolist())
        else:
            y.append(classes_arr[[np.argmax(sample)]].tolist())
    return y


def proba2onehot(proba: [list, np.ndarray], confidence_threshold: float, classes: [list, np.ndarray]) -> np.ndarray:
    """
    Convert vectors of probabilities to one-hot representations using a confidence threshold

    Args:
        proba: samples where each sample is a vector of probabilities to belong with given classes
        confidence_threshold: boundary of probability to belong with a class
        classes: array of classes' names

    Returns:
        2d array with one-hot representation of given samples
    """
    return labels2onehot(proba2labels(proba, confidence_threshold, classes), classes)
class FaissBinaryIndex:
    """Two-stage search over a binary faiss index: binary candidate retrieval, then dense re-scoring.

    Args:
        index: faiss index object. ``search`` reads its ``index`` attribute (used for the binary
            search and for ``reconstruct``) and its ``id_map`` attribute (used to translate
            internal positions into stored ids).
            NOTE(review): presumably an ID-map wrapper around a binary index - confirm.
    """

    def __init__(self, index: faiss.Index):
        self.index = index

    def search(self, query_embs: np.ndarray, k: int, binary_k=1000, rerank=True) -> Tuple[np.ndarray, np.ndarray]:
        """Return scores and ids of the ``k`` best candidates for every query.

        Args:
            query_embs: 2d float array of query embeddings, one row per query
            k: number of results kept per query
            binary_k: number of candidates fetched from the binary index before re-scoring
            rerank: accepted for interface compatibility; it is not read, re-scoring is always done

        Returns:
            tuple ``(scores, ids)``, each with one row per query and at most ``k`` columns,
            sorted by decreasing score
        """
        # The thread count is hard-coded and changes faiss' global OpenMP setting.
        faiss.omp_set_num_threads(12)
        num_queries = query_embs.shape[0]
        # Binarise queries by sign and pack 8 bits per byte, as the binary index expects.
        bin_query_embs = np.packbits(np.where(query_embs > 0, 1, 0)).reshape(num_queries, -1)

        raw_index = self.index.index
        _, ids_arr = raw_index.search(bin_query_embs, binary_k)
        # Rebuild the candidates' bit vectors and map {0, 1} -> {-1, +1} for the dot product.
        psg_embs = np.vstack([np.unpackbits(raw_index.reconstruct(int(id_))) for id_ in ids_arr.reshape(-1)])
        psg_embs = psg_embs.reshape(query_embs.shape[0], binary_k, query_embs.shape[1])
        psg_embs = psg_embs.astype(np.float32)
        psg_embs = psg_embs * 2 - 1

        scores_arr = np.einsum("ijk,ik->ij", psg_embs, query_embs)
        sorted_indices = np.argsort(-scores_arr, axis=1)
        row_selector = np.arange(num_queries)[:, None]

        ids_arr = ids_arr[row_selector, sorted_indices]
        # ``np.int`` was an alias of the builtin ``int`` and no longer exists in NumPy >= 1.24.
        ids_arr = np.array([self.index.id_map.at(int(id_)) for id_ in ids_arr.reshape(-1)], dtype=int)
        ids_arr = ids_arr.reshape(num_queries, -1)
        scores_arr = scores_arr[row_selector, sorted_indices]

        return scores_arr[:, :k], ids_arr[:, :k]
@register('bpr')
class BPR(Component, Serializable):
    """Retrieve document ids for queries with a BERT query encoder and a binary faiss index.

    Args:
        pretrained_model: name or path passed to ``AutoTokenizer`` / ``BertModel.from_pretrained``
        load_path: directory containing the query encoder checkpoint and the index file
        bpr_index: file name of the binary faiss index inside ``load_path``
        query_encoder_file: file name of the query encoder checkpoint inside ``load_path``
        max_query_length: length every tokenized query is padded / truncated to
        top_n: number of ids returned per query
        device: ``"gpu"`` to use CUDA when it is available, anything else for CPU
    """

    def __init__(self, pretrained_model: str,
                 load_path: str,
                 bpr_index: str,
                 query_encoder_file: str,
                 max_query_length: int = 256,
                 top_n: int = 100,
                 device: str = "gpu",
                 *args, **kwargs
                 ):
        super().__init__(save_path=None, load_path=load_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")
        self.bpr_index = bpr_index
        self.top_n = top_n
        self.max_query_length = max_query_length
        self.query_encoder_file = query_encoder_file
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
        self.q_encoder = BertModel.from_pretrained(pretrained_model).to(self.device)
        self.load()
        self.index = FaissBinaryIndex(self.base_index)

    def load(self):
        """Load the query encoder weights and read the binary index from ``load_path``."""
        checkpoint = torch.load(str(self.load_path / self.query_encoder_file), map_location=self.device)
        # strict=False: keys missing from / unexpected in the checkpoint are tolerated.
        self.q_encoder.load_state_dict(checkpoint["model_state_dict"], strict=False)
        self.base_index = faiss.read_index_binary(str(self.load_path / self.bpr_index))

    def save(self) -> None:
        """Do nothing: the component is not trained, so there is nothing to save."""
        pass

    def encode_queries(self, queries, batch_size: int = 256) -> np.ndarray:
        """Embed queries with the query encoder.

        Args:
            queries: list of query strings
            batch_size: number of queries encoded per forward pass

        Returns:
            2d array with one row per query: the encoder output at the first position
        """
        embeddings = []
        with torch.no_grad():
            for start in trange(0, len(queries), batch_size):
                model_inputs = self.tokenizer.batch_encode_plus(
                    queries[start: start + batch_size],
                    return_tensors="pt",
                    max_length=self.max_query_length,
                    padding="max_length",
                    # Without explicit truncation an over-long query is not cut to
                    # ``max_query_length`` and the batch cannot be stacked into one tensor.
                    truncation=True,
                )

                model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
                sequence_output = self.q_encoder(**model_inputs)[0]
                emb = sequence_output[:, 0, :].contiguous().cpu().numpy()
                embeddings.append(emb)

        return np.vstack(embeddings)

    def __call__(self, queries):
        """Return, for every query, the list of ``top_n`` ids found in the index.

        Queries are lower-cased before encoding; scores are computed but not returned.
        """
        queries = [query.lower() for query in queries]
        query_embeddings = self.encode_queries(queries)
        _, ids_batch = self.index.search(query_embeddings, self.top_n)
        return ids_batch.tolist()
@register("logit_ranker")
class LogitRanker(Component):
    """Select best answer using squad model logits. Make several batches for a single batch, send each batch
     to the squad model separately and get a single best answer for each batch.

     Args:
        squad_model: a loaded squad model
        batch_size: batch size to use with squad model
        sort_noans: whether to downgrade noans tokens in the most possible answers
        top_n: number of answers to return
        return_answer_sentence: whether to additionally return the sentence containing each answer

     Attributes:
        squad_model: a loaded squad model
        batch_size: batch size to use with squad model
        sort_noans: whether empty answers are ranked below non-empty ones
        top_n: number of answers to return
        return_answer_sentence: whether answer sentences are returned

    """

    def __init__(self, squad_model: Union[Chainer, Component], batch_size: int = 50,
                 sort_noans: bool = False, top_n: int = 1, return_answer_sentence: bool = False, **kwargs):
        self.squad_model = squad_model
        self.batch_size = batch_size
        self.sort_noans = sort_noans
        self.top_n = top_n
        self.return_answer_sentence = return_answer_sentence

    def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[str]],
                 doc_ids_batch: Optional[List[List[str]]] = None) -> \
            Union[
                Tuple[List[str], List[float], List[int], List[str]],
                Tuple[List[List[str]], List[List[float]], List[List[int]], List[List[str]]],
                Tuple[List[str], List[float], List[int]],
                Tuple[List[List[str]], List[List[float]], List[List[int]]]
            ]:
        """
        Sort obtained results from squad reader by logits and get the answer with a maximum logit.

        Args:
            contexts_batch: a batch of contexts which should be treated as a single batch in the outer JSON config
            questions_batch: a batch of questions which should be treated as a single batch in the outer JSON config
            doc_ids_batch (optional): names of the documents from which the contexts_batch was derived

        Returns:
            a batch of best answers, their scores, places in contexts and doc_ids for this answers if doc_ids_batch
            were passed; answer sentences are appended when ``return_answer_sentence`` is set.
            With ``top_n == 1`` every per-question list is replaced by its single element.
        """
        if doc_ids_batch is None:
            logger.warning("you didn't pass tfidf_doc_ids as input in logit_ranker config so "
                           "batch_best_answers_doc_ids can't be compute")

        batch_best_answers = []
        batch_best_answers_score = []
        batch_best_answers_place = []
        batch_best_answers_doc_ids = []
        batch_best_answers_sentences = []
        for quest_ind, (contexts, questions) in enumerate(zip(contexts_batch, questions_batch)):
            # Each result is (answer, place, score, context).
            results = []
            for i in range(0, len(contexts), self.batch_size):
                c_batch = contexts[i: i + self.batch_size]
                q_batch = questions[i: i + self.batch_size]
                results.extend(zip(*self.squad_model(c_batch, q_batch), c_batch))

            # Sort positions rather than tuples: the position of a result is also the position of
            # its document id, which stays correct even when two results compare equal.
            order = self._rank(results)[:self.top_n]
            best_results = [results[ind] for ind in order]

            best_answers = [res[0] for res in best_results]
            best_answers_place = [res[1] for res in best_results]
            batch_best_answers.append(best_answers)
            batch_best_answers_place.append(best_answers_place)
            batch_best_answers_score.append([res[2] for res in best_results])

            # Sentence splitting is only worth doing when the caller asked for sentences.
            if self.return_answer_sentence:
                batch_best_answers_sentences.append(
                    [find_answer_sentence(res[1], res[3]) for res in best_results])

            if doc_ids_batch is not None:
                batch_best_answers_doc_ids.append([doc_ids_batch[quest_ind][ind] for ind in order])

        if self.top_n == 1:
            batch_best_answers = [x[0] for x in batch_best_answers]
            batch_best_answers_place = [x[0] for x in batch_best_answers_place]
            batch_best_answers_score = [x[0] for x in batch_best_answers_score]
            batch_best_answers_doc_ids = [x[0] for x in batch_best_answers_doc_ids]
            batch_best_answers_sentences = [x[0] for x in batch_best_answers_sentences]

        if doc_ids_batch is None:
            if self.return_answer_sentence:
                return batch_best_answers, batch_best_answers_score, batch_best_answers_place, \
                       batch_best_answers_sentences
            return batch_best_answers, batch_best_answers_score, batch_best_answers_place

        if self.return_answer_sentence:
            return batch_best_answers, batch_best_answers_score, batch_best_answers_place, batch_best_answers_doc_ids, \
                   batch_best_answers_sentences
        return batch_best_answers, batch_best_answers_score, batch_best_answers_place, batch_best_answers_doc_ids

    def _rank(self, results: list) -> List[int]:
        """Return positions of ``results`` ordered from best to worst.

        Results are ordered by decreasing score; with ``sort_noans`` every non-empty answer is
        placed before every empty one. Ties keep their original relative order.
        """
        if self.sort_noans:
            def sort_key(ind):
                return results[ind][0] != '', results[ind][2]
        else:
            def sort_key(ind):
                return results[ind][2]
        return sorted(range(len(results)), key=sort_key, reverse=True)
@register('pop_ranker')
class PopRanker(Component):
    """Re-rank documents using their tfidf scores together with article popularity.

    This ranker is not standalone: it consumes the ids and scores produced by a TF-IDF ranker.
    A loaded logistic regression classifier scores each document from 3 features:

    * tfidf score of the article
    * popularity of the article obtained via Wikimedia REST API as a mean number of views for the period since
      2017/11/05 to 2018/11/05
    * product of the two features above

    Args:
        pop_dict_path: a path to json file with article title to article popularity map
        load_path: a path to saved logistic regression classifier
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n` (``True``) or all ids
         (``False``)

    Attributes:
        pop_dict: a map of article titles to their popularity
        mean_pop: mean of all popularities in :attr:`pop_dict`, used when an article is not in the map
        clf: a loaded logistic regression classifier
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n` or all ids
    """

    def __init__(self, pop_dict_path: str, load_path: str, top_n: int = 3, active: bool = True,
                 **kwargs) -> None:
        pop_dict_path = expand_path(pop_dict_path)
        logger.debug(f"Reading popularity dictionary from {pop_dict_path}")
        self.pop_dict = read_json(pop_dict_path)
        self.mean_pop = np.mean(list(self.pop_dict.values()))

        load_path = expand_path(load_path)
        logger.debug(f"Loading popularity ranker from {load_path}")
        self.clf = joblib.load(load_path)

        self.top_n = top_n
        self.active = active

    def __call__(self, input_doc_ids: List[List[Any]], input_doc_scores: List[List[float]]) -> \
            Tuple[List[List], List[List]]:
        """Re-rank tfidf results with the classifier and return the new ids and scores.

        Args:
            input_doc_ids: top input doc ids of tfidf ranker
            input_doc_scores: top input doc scores of tfidf ranker corresponding to doc ids

        Returns:
            top doc ids of pop ranker and their corresponding scores
        """
        batch_ids, batch_scores = [], []

        for doc_ids, tfidf_scores in zip(input_doc_ids, input_doc_scores):
            probas = []
            for doc_id, tfidf_score in zip(doc_ids, tfidf_scores):
                popularity = self.pop_dict.get(doc_id, self.mean_pop)
                row = [tfidf_score, popularity, tfidf_score * popularity]
                # probability of the second class reported by the classifier
                probas.append(self.clf.predict_proba([row])[0][1])

            # stable descending sort of positions by probability
            order = sorted(range(len(probas)), key=probas.__getitem__, reverse=True)
            if self.active:
                order = order[:self.top_n]

            batch_ids.append([doc_ids[pos] for pos in order])
            batch_scores.append([probas[pos] for pos in order])

        return batch_ids, batch_scores
@register("tfidf_ranker")
class TfidfRanker(Component):
    """Rank documents according to input strings.

    Args:
        vectorizer: a vectorizer class
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n` (``True``) or all ids
         (``False``)

    Attributes:
        top_n: a number of doc ids to return
        vectorizer: an instance of vectorizer class
        active: whether to return a number specified by :attr:`top_n` or all ids

    """

    def __init__(self, vectorizer: HashingTfIdfVectorizer, top_n=5, active: bool = True, **kwargs):
        self.top_n = top_n
        self.vectorizer = vectorizer
        self.active = active

    def __call__(self, questions: List[str]) -> Tuple[List[Any], List[float]]:
        """Rank documents and return top n document titles with scores.

        Args:
            questions: list of queries used in ranking

        Returns:
            a tuple of selected doc ids and their scores, one entry per question
        """

        batch_doc_ids, batch_docs_scores = [], []

        q_tfidfs = self.vectorizer(questions)

        for q_tfidf in q_tfidfs:
            scores = q_tfidf * self.vectorizer.tfidf_matrix
            # ravel() instead of squeeze(): squeeze turns the scores of a single-document index
            # into a 0-d array, on which len() fails.
            scores = scores.toarray().ravel() + 0.0001  # add a small value to eliminate zero scores

            thresh = self.top_n if self.active else len(self.vectorizer.doc_index)

            # argpartition needs kth < len(scores); asking for everything degenerates to len - 1.
            kth = min(thresh, len(scores) - 1)
            o = np.argpartition(-scores, kth)[0:thresh]
            o_sort = o[np.argsort(-scores[o])]

            doc_scores = scores[o_sort]
            # Indices without a known document fall back to the integer index itself.
            doc_ids = [self.vectorizer.index2doc.get(i, int(i)) for i in o_sort]

            batch_doc_ids.append(doc_ids)
            batch_docs_scores.append(doc_scores)

        return batch_doc_ids, batch_docs_scores
@register('concat_lists')
def concat_lists(list_a: List[List[Any]], list_b: List[List[Any]]):
    """Concatenate two batches element-wise.

    Args:
        list_a: batch of lists
        list_b: batch of lists, paired with ``list_a`` by position

    Returns:
        list whose i-th element is ``list_a[i] + list_b[i]``; extra elements of the longer
        batch are dropped
    """
    return [element_a + element_b for element_a, element_b in zip(list_a, list_b)]


def find_answer_sentence(answer_pos: int, context: str) -> str:
    """Return the sentence of ``context`` that contains the character position ``answer_pos``.

    Args:
        answer_pos: character offset of the answer start inside ``context``
        context: text the answer was extracted from

    Returns:
        the sentence covering ``answer_pos`` or an empty string if no sentence covers it
    """
    search_from = 0
    for sentence in nltk.sent_tokenize(context):
        # Locate the sentence in the original text instead of assuming that sentences are
        # separated by exactly one character.
        start_offset = context.find(sentence, search_from)
        if start_offset == -1:
            # Sentence text was altered by the tokenizer: fall back to the running estimate.
            start_offset = search_from
        end_offset = start_offset + len(sentence)
        # Inclusive lower bound: an answer may begin exactly at the start of a sentence.
        if start_offset <= answer_pos < end_offset:
            return sentence
        search_from = end_offset
    return ""
class Embedder(Component, Serializable, metaclass=ABCMeta):
    """
    Abstract base for token embedders backed by a pre-trained model.

    Subclasses provide ``load``, ``__iter__`` and ``_get_word_vector``; this class adds
    per-token caching, optional mean pooling and optional zero padding.

    Args:
        load_path: path where to load pre-trained embedding model from
        pad_zero: whether to pad samples or not
        mean: whether to return one mean embedding vector per sample

    Attributes:
        model: model instance
        tok2emb: dictionary with already embedded tokens
        dim: dimension of embeddings
        pad_zero: whether to pad sequence of tokens with zeros or not
        mean: whether to return one mean embedding vector per sample
        load_path: path with pre-trained model
    """

    def __init__(self, load_path: Union[str, Path], pad_zero: bool = False, mean: bool = False, **kwargs) -> None:
        """
        Store the settings and load the model via ``self.load()``
        """
        super().__init__(save_path=None, load_path=load_path)
        self.tok2emb = {}
        self.pad_zero = pad_zero
        self.mean = mean
        # both are expected to be filled in by ``load``
        self.dim = None
        self.model = None
        self.load()

    def save(self) -> None:
        """
        Not supported: the loaded model is not trained during usage
        """
        raise NotImplementedError

    def __call__(self, batch: List[List[str]], mean: bool = None) -> List[Union[list, np.ndarray]]:
        """
        Embed sentences from batch

        Args:
            batch: list of tokenized text samples
            mean: whether to return mean embedding of tokens per sample; ``None`` means use ``self.mean``

        Returns:
            embedded batch
        """
        embedded = [self._encode(sample, mean) for sample in batch]
        return zero_pad(embedded) if self.pad_zero else embedded

    @abstractmethod
    def __iter__(self) -> Iterator[str]:
        """
        Iterate over all words from the model vocabulary

        Returns:
            iterator
        """

    @abstractmethod
    def _get_word_vector(self, w: str) -> np.ndarray:
        """
        Embed a word using ``self.model``

        Args:
            w: a word

        Returns:
            embedding vector
        """

    def _encode(self, tokens: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed one text sample

        Args:
            tokens: tokenized text sample
            mean: whether to return mean embedding of tokens per sample; ``None`` means use ``self.mean``

        Returns:
            list of embedded tokens or array of mean values
        """
        vectors = []
        for token in tokens:
            if token in self.tok2emb:
                vector = self.tok2emb[token]
            else:
                try:
                    vector = self._get_word_vector(token)
                except KeyError:
                    # unknown word: represent it with a zero vector
                    vector = np.zeros(self.dim, dtype=np.float32)
                self.tok2emb[token] = vector
            vectors.append(vector)

        use_mean = self.mean if mean is None else mean
        if not use_mean:
            return vectors

        # all-zero vectors are left out of the average
        non_zero = [vector for vector in vectors if np.any(vector)]
        if non_zero:
            return np.mean(non_zero, axis=0)
        return np.zeros(self.dim, dtype=np.float32)
@register('fasttext')
class FasttextEmbedder(Embedder):
    """
    Embedder backed by a fastText binary model

    Args:
        load_path: path where to load pre-trained embedding model from
        pad_zero: whether to pad samples or not

    Attributes:
        model: fastText model instance
        tok2emb: dictionary with already embedded tokens
        dim: dimension of embeddings
        pad_zero: whether to pad sequence of tokens with zeros or not
        load_path: path with pre-trained fastText binary model
    """

    def _get_word_vector(self, w: str) -> np.ndarray:
        """Return the vector the fastText model produces for the word ``w``"""
        vector = self.model.get_word_vector(w)
        return vector

    def load(self) -> None:
        """
        Read the fastText binary model from ``self.load_path`` and remember its dimension
        """
        log.debug(f"[loading fastText embeddings from `{self.load_path}`]")
        model_file = str(self.load_path)
        self.model = fasttext.load_model(model_file)
        self.dim = self.model.get_dimension()

    def __iter__(self) -> Iterator[str]:
        """
        Iterate over all words from fastText model vocabulary

        Returns:
            iterator
        """
        for word in self.model.get_words():
            yield word
@register('tfidf_weighted')
class TfidfWeightedEmbedder(Component):
    """
    The class implements the functionality of embedding the sentence \
    as a weighted average by special coefficients of tokens embeddings. \
    Coefficients can be taken from the given TFIDF-vectorizer in ``vectorizer`` or \
    calculated as TFIDF from counter vocabulary given in ``counter_vocab_path``.
    Also one can give ``tags_vocab_path`` to the vocabulary with weights of tags. \
    In this case, batch with tags should be given as a second input in ``__call__`` method.

    Args:
        embedder: embedder instance
        tokenizer: tokenizer instance, should be able to detokenize sentence
        pad_zero: whether to pad samples or not
        mean: whether to return mean token embedding
        tags_vocab_path: optional path to vocabulary with tags weights
        vectorizer: vectorizer instance should be trained with ``analyzer="word"``
        counter_vocab_path: path to counter vocabulary
        idf_base_count: minimal idf value (tokens that occurred fewer times are not counted)
        log_base: logarithm base for TFIDF-coefficient calculation from counter vocabulary
        min_idf_weight: minimal idf weight

    Attributes:
        embedder: embedder instance
        tokenizer: tokenizer instance, should be able to detokenize sentence
        dim: dimension of embeddings
        pad_zero: whether to pad samples or not
        mean: whether to return mean token embedding
        tags_vocab: vocabulary with weights for tags
        vectorizer: vectorizer instance
        counter_vocab_path: path to counter vocabulary
        counter_vocab: counter vocabulary
        idf_base_count: minimal idf value (tokens that occurred fewer times are not counted)
        log_base: logarithm base for TFIDF-coefficient calculation from counter vocabulary
        min_idf_weight: minimal idf weight

    Examples:
        >>> from deeppavlov.models.embedders.tfidf_weighted_embedder import TfidfWeightedEmbedder
        >>> from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder
        >>> fasttext_embedder = FasttextEmbedder('/data/embeddings/wiki.ru.bin')
        >>> fastTextTfidf = TfidfWeightedEmbedder(embedder=fasttext_embedder,
                counter_vocab_path='/data/vocabs/counts_wiki_lenta.txt')
        >>> fastTextTfidf([['большой', 'и', 'розовый', 'бегемот']])
        [array([ 1.99135890e-01, -7.14746421e-02,  8.01428872e-02, -5.32840924e-02,
                 5.05212297e-02,  2.76053832e-01, -2.53270134e-01, -9.34443950e-02,
                 ...
                 1.18385439e-02,  1.05643446e-01, -1.21904516e-03,  7.70555378e-02])]
    """

    def __init__(self,
                 embedder: Component,
                 tokenizer: Component = None,
                 pad_zero: bool = False,
                 mean: bool = False,
                 tags_vocab_path: str = None,
                 vectorizer: Component = None,
                 counter_vocab_path: str = None,
                 idf_base_count: int = 100,
                 log_base: int = 10,
                 min_idf_weight=0.0, **kwargs) -> None:
        self.embedder = embedder
        self.dim = self.embedder.dim
        self.mean = mean
        self.pad_zero = pad_zero
        self.tokenizer = tokenizer or self.space_detokenizer
        self.vectorizer = vectorizer

        if vectorizer and counter_vocab_path:
            raise ConfigError("TfidfWeightedEmbedder got vectorizer and counter_vocab_path simultaneously."
                              " Remove one of them, please")
        elif vectorizer:
            self.vectorizer = vectorizer
            # ``get_feature_names`` was removed from newer scikit-learn releases in favour of
            # ``get_feature_names_out``; use whichever the wrapped model provides.
            model = self.vectorizer.model
            get_names = getattr(model, "get_feature_names_out", None) or model.get_feature_names
            self.vocabulary = np.array(get_names())
        elif counter_vocab_path:
            self.counter_vocab_path = expand_path(counter_vocab_path)
            self.counter_vocab, self.min_count = self.load_counter_vocab(self.counter_vocab_path)
            self.idf_base_count = idf_base_count
            self.log_base = log_base
            self.min_idf_weight = min_idf_weight
        else:
            raise ConfigError("TfidfWeightedEmbedder did not get vectorizer or counter_vocab_path."
                              " Set one of them, please")

        if tags_vocab_path:
            self.tags_vocab = self.load_tags_vocab(expand_path(tags_vocab_path))
        else:
            self.tags_vocab = None

    @staticmethod
    def load_tags_vocab(load_path: str) -> dict:
        """
        Load tag vocabulary from the given path, each key of the vocabulary is a tag, \
        and the corresponding value of the item is a coefficient of words with such tags to be multiplied for.

        Each non-empty line holds ``<tag> <weight>`` separated by a single space.

        Args:
            load_path: path to the vocabulary to be load from

        Returns:
            vocabulary mapping a tag to its float weight
        """
        tags_vocab = dict()
        with open(load_path, 'r') as f:
            for line in f:
                # Strip only the line terminator: slicing off the last character would eat a
                # digit when the final line has no trailing newline.
                line = line.rstrip('\n')
                if not line:
                    continue
                key, val = line.split(' ')
                # Weights are multiplied with numeric tf-idf weights, so they must be numbers.
                tags_vocab[key] = float(val)

        return tags_vocab

    @staticmethod
    def load_counter_vocab(load_path: str) -> Tuple[dict, int]:
        """
        Load counter vocabulary from the given path

        Each non-empty line holds ``<token>\\t<count>``.

        Args:
            load_path: path to the vocabulary to be load from

        Returns:
            tuple of the vocabulary and the smallest count met in the file
            (``np.inf`` if the file has no entries)
        """
        counter_vocab = dict()
        min_val = np.inf
        with open(load_path, 'r') as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue
                key, val = line.split('\t')
                val = int(val)
                counter_vocab[key] = val
                if val < min_val:
                    min_val = val

        return counter_vocab, min_val

    @staticmethod
    def space_detokenizer(batch: List[List[str]]) -> List[str]:
        """
        Detokenizer by default. Linking tokens by space symbol

        Args:
            batch: batch of tokenized texts

        Returns:
            batch of detokenized texts
        """
        return [" ".join(tokens) for tokens in batch]

    def __call__(self, batch: List[List[str]], tags_batch: Optional[List[List[str]]] = None, mean: bool = None,
                 *args, **kwargs) -> List[Union[list, np.ndarray]]:
        """
        Infer on the given data

        Args:
            batch: tokenized text samples
            tags_batch: optional batch of corresponding tags
            mean: whether to return mean token embedding; ``None`` means use ``self.mean``
            *args: additional arguments
            **kwargs: additional arguments

        Returns:
            embedded batch, zero-padded when ``pad_zero`` is set

        Raises:
            ConfigError: if a tags vocabulary is configured but ``tags_batch`` is missing, or
                ``tags_batch`` is given without a configured tags vocabulary
        """

        if self.tags_vocab:
            if tags_batch is None:
                raise ConfigError("TfidfWeightedEmbedder got 'tags_vocab_path' but __call__ did not get tags_batch.")
            batch = [self._tags_encode(sample, tags_sample, mean=mean) for sample, tags_sample in
                     zip(batch, tags_batch)]
        else:
            if tags_batch:
                raise ConfigError("TfidfWeightedEmbedder got tags batch, but 'tags_vocab_path' is empty.")
            batch = [self._encode(sample, mean=mean) for sample in batch]

        if self.pad_zero:
            batch = zero_pad(batch)

        return batch

    def _token_weights(self, tokens: List[str]) -> np.ndarray:
        """Return one weight per token, from the vectorizer if it is set, else from the counter vocabulary."""
        if self.vectorizer:
            detokenized_sample = self.tokenizer([tokens])[0]  # str
            vectorized_sample = self.vectorizer([detokenized_sample])  # (voc_size,)
            weights = []
            for token in tokens:
                positions = np.where(self.vocabulary == token)[0]
                # Tokens absent from the vectorizer vocabulary get zero weight.
                weights.append(vectorized_sample[0, positions[0]] if len(positions) else 0.)
            return np.array(weights)

        return np.array([self.get_weight(max(self.counter_vocab.get(token, 0), self.idf_base_count))
                         for token in tokens])

    def _combine(self, embedded_tokens: np.ndarray, weights: np.ndarray, n_tokens: int,
                 mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """Apply ``weights`` to ``embedded_tokens``: weighted average if ``mean`` else per-token scaling."""
        if sum(weights) == 0:
            # A zero total cannot be used for averaging; fall back to uniform weights.
            weights = np.ones(n_tokens)

        if mean is None:
            mean = self.mean

        if mean:
            return np.average(embedded_tokens, weights=weights, axis=0)
        return np.array([weights[i] * embedded_tokens[i] for i in range(n_tokens)])

    def _encode(self, tokens: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed one text sample

        Args:
            tokens: tokenized text sample
            mean: whether to return mean token embedding; ``None`` means use ``self.mean``

        Returns:
            list of embedded tokens or array of mean values
        """
        weights = self._token_weights(tokens)
        embedded_tokens = np.array(self.embedder([tokens]))[0, :, :]
        return self._combine(embedded_tokens, weights, len(tokens), mean)

    def get_weight(self, count: int) -> float:
        """
        Calculate the weight corresponding to the given count

        Args:
            count: the number of occurrences of particular token

        Returns:
            weight
        """
        log_count = np.log(count) / np.log(self.log_base)
        log_base_count = np.log(self.idf_base_count) / np.log(self.log_base)
        weight = max(1.0 / (1.0 + log_count - log_base_count), self.min_idf_weight)
        return weight

    def _tags_encode(self, tokens: List[str], tags: List[str], mean: bool) -> Union[List[np.ndarray], np.ndarray]:
        """
        Embed one text sample, additionally scaling every token weight by the weight of its tag

        Args:
            tokens: tokenized text sample
            tags: tokenized tags sample
            mean: whether to return mean token embedding; ``None`` means use ``self.mean``

        Returns:
            list of embedded tokens or array of mean values
        """
        embedded_tokens = np.array(self.embedder([tokens]))[0, :, :]

        # Tags missing from the vocabulary leave the token weight unchanged.
        tags_weights = np.array([self.tags_vocab.get(tag, 1.0) for tag in tags])

        # The vectorizer is only touched inside _token_weights when it is actually configured.
        weights = np.multiply(self._token_weights(tokens), tags_weights)
        return self._combine(embedded_tokens, weights, len(tokens), mean)
@register('transformers_bert_embedder')
class TransformersBertEmbedder(Serializable):
    """Transformers-based BERT model for embeddings tokens, subtokens and sentences

    Args:
        load_path: path to a pretrained BERT pytorch checkpoint
        bert_config_path: path to a BERT configuration file
        truncate: whether to remove zero-paddings from returned data

    """
    model: transformers.BertModel
    dim: int

    def __init__(self, load_path: Union[str, Path], bert_config_path: Union[str, Path] = None,
                 truncate: bool = False, **kwargs):
        super().__init__(save_path=None, load_path=load_path, **kwargs)
        if bert_config_path is not None:
            bert_config_path = expand_path(bert_config_path)
        self.config = bert_config_path
        self.truncate = truncate
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.load()

    def save(self, *args, **kwargs):
        """Saving is not supported: the component only loads a pretrained model."""
        raise NotImplementedError

    def load(self):
        """Load the pretrained model in evaluation mode and remember its hidden size in ``self.dim``."""
        self.model = transformers.BertModel.from_pretrained(self.load_path, config=self.config).eval().to(self.device)
        self.dim = self.model.config.hidden_size

    def __call__(self,
                 subtoken_ids_batch: Collection[Collection[int]],
                 startofwords_batch: Collection[Collection[int]],
                 attention_batch: Collection[Collection[int]]) -> Tuple[Collection[Collection[Collection[float]]],
                                                                        Collection[Collection[Collection[float]]],
                                                                        Collection[Collection[float]],
                                                                        Collection[Collection[float]],
                                                                        Collection[Collection[float]]]:
        """Predict embeddings values for a given batch

        Args:
            subtoken_ids_batch: padded indexes for every subtoken
            startofwords_batch: a mask matrix with ``1`` for every first subtoken init in a token and ``0``
                for every other subtoken
            attention_batch: a mask matrix with ``1`` for every significant subtoken and ``0`` for paddings

        Returns:
            a tuple of word embeddings, subword embeddings, max-pooled embeddings, mean-pooled embeddings
            and the model's pooler output
        """
        ids_tensor = torch.tensor(subtoken_ids_batch, device=self.device, dtype=torch.long)
        startofwords_tensor = torch.tensor(startofwords_batch, device=self.device).bool()
        attention_tensor = torch.tensor(attention_batch, device=self.device)
        # Per-sample number of significant subtokens, taken from the tensor so that
        # truncation below works for plain nested lists as well as for numpy masks.
        subtokens_lengths = attention_tensor.sum(dim=1).long().tolist()

        with torch.no_grad():
            output = self.model(ids_tensor, attention_tensor)
            last_hidden = output.last_hidden_state
            pooler_output = output.pooler_output
            attention_tensor = attention_tensor.unsqueeze(-1)
            # push padded positions far below any real activation before max-pooling
            max_emb = torch.max(last_hidden - 1e9 * (1 - attention_tensor), dim=1)[0]
            subword_emb = last_hidden * attention_tensor
            mean_emb = torch.sum(subword_emb, dim=1) / torch.sum(attention_tensor, dim=1)

            tokens_lengths = startofwords_tensor.sum(dim=1)
            word_emb = torch.zeros((subword_emb.shape[0], tokens_lengths.max(), subword_emb.shape[2]),
                                   device=self.device, dtype=subword_emb.dtype)
            # boolean mask selecting, in every row, as many leading slots as the sample has words
            target_indexes = (torch.arange(word_emb.shape[1], device=self.device).expand(word_emb.shape[:-1]) <
                              tokens_lengths.unsqueeze(-1))
            word_emb[target_indexes] = subword_emb[startofwords_tensor]

        words_lengths = tokens_lengths.tolist()

        subword_emb = subword_emb.cpu().numpy()
        word_emb = word_emb.cpu().numpy()
        pooler_output = pooler_output.cpu().numpy()
        max_emb = max_emb.cpu().numpy()
        mean_emb = mean_emb.cpu().numpy()

        if self.truncate:
            subword_emb = [item[:length] for item, length in zip(subword_emb, subtokens_lengths)]
            word_emb = [item[:length] for item, length in zip(word_emb, words_lengths)]

        return word_emb, subword_emb, max_emb, mean_emb, pooler_output
@register('question_sign_checker')
class QuestionSignChecker:
    """Normalizes questions: guarantees a trailing question mark and cleans up quotes and spacing.

    Args:
        delete_brackets: whether to remove substrings enclosed in round brackets
    """

    def __init__(self, delete_brackets: bool = False, **kwargs):
        self.delete_brackets = delete_brackets
        # The last pair collapses double spaces (e.g. those left after bracket removal);
        # replacing a single space with a single space would do nothing.
        self.replace_tokens = [(" '", ' "'), ("' ", '" '), (" ?", "?"), ("  ", " ")]

    def __call__(self, questions: List[str]) -> List[str]:
        """Adds question sign if it is absent or replaces dots in the end with question sign."""
        questions_clean = []
        for question in questions:
            question = question if question.endswith('?') else f'{question.rstrip(".")}?'
            if self.delete_brackets:
                brackets_text = re.findall(r"(\(.*?\))", question)
                for elem in brackets_text:
                    question = question.replace(elem, " ")
            for old_tok, new_tok in self.replace_tokens:
                question = question.replace(old_tok, new_tok)
            questions_clean.append(question)
        return questions_clean


@register('entity_type_split')
def entity_type_split(entities_batch: List[List[str]], tags_batch: List[List[str]]) -> Tuple[
        List[List[str]], List[List[str]], List[List[str]]]:
    """Separates entity substrings from type substrings using their tags.

    Args:
        entities_batch: substrings found in every sample
        tags_batch: tags of these substrings; the tag ``"T"`` marks a type substring

    Returns:
        batch of entity substrings, batch of their lowercased tags, batch of type substrings
    """
    f_entities_batch, f_types_batch, f_tags_batch = [], [], []
    for entities_list, tags_list in zip(entities_batch, tags_batch):
        f_entities_list, f_types_list, f_tags_list = [], [], []
        for entity, tag in zip(entities_list, tags_list):
            if tag != "T":
                f_entities_list.append(entity)
                f_tags_list.append(tag.lower())
            else:
                f_types_list.append(entity)
        f_entities_batch.append(f_entities_list)
        f_tags_batch.append(f_tags_list)
        f_types_batch.append(f_types_list)
    return f_entities_batch, f_tags_batch, f_types_batch
def __call__(self, question_tokens_batch: List[List[str]],
             tokens_info_batch: List[List[List[float]]],
             tokens_probas_batch: np.ndarray) -> \
        Tuple[List[dict], List[dict], List[dict]]:
    """
    Extract entities from every sample of the batch

    Args:
        question_tokens_batch: tokenized questions
        tokens_info_batch: list of tags of question tokens
        tokens_probas_batch: list of probabilities of question tokens
    Returns:
        Batch of the first values returned by ``entities_from_tags`` (entities)
        Batch of the second values returned by ``entities_from_tags`` (positions)
        Batch of the third values returned by ``entities_from_tags`` (probabilities)
    """
    all_entities, all_positions, all_probas = [], [], []

    samples = zip(question_tokens_batch, tokens_info_batch, tokens_probas_batch)
    for sample_tokens, sample_tags, sample_probas in samples:
        if self.make_tags_from_probas:
            # the tags supplied with the batch are replaced by thresholded ones
            sample_tags, _ = self.tags_from_probas(sample_tokens, sample_probas)
        sample_tags = self.correct_quotes(sample_tokens, sample_tags, sample_probas)
        sample_tags = self.correct_tags(sample_tokens, sample_tags)

        extracted = self.entities_from_tags(sample_tokens, sample_tags, sample_probas)
        entities, positions, entities_probas = extracted
        all_entities.append(entities)
        all_positions.append(positions)
        all_probas.append(entities_probas)

    return all_entities, all_positions, all_probas


def tags_from_probas(self, tokens: List[str], probas: np.array) -> Tuple[List[Union[str, List[str]]], List[Any]]:
    """
    Turn per-token tag probabilities into tags

    A token gets the tag with index 0 when the probability at index 0 is at least
    ``self.thres_proba``; otherwise it gets the most probable of the remaining tags.

    Args:
        tokens: text tokens list
        probas: probabilities for tokens to belong to particular tags

    Returns:
        list of tags for tokens
        list of probabilities of these tags
    """
    chosen_tags, chosen_probas = [], []
    # zipping with tokens limits the result to the shorter of the two sequences
    for _, token_probas in zip(tokens, probas):
        below_threshold = token_probas[0] < self.thres_proba
        tag_index = np.argmax(token_probas[1:]) + 1 if below_threshold else 0
        chosen_tags.append(self.tags[tag_index])
        chosen_probas.append(token_probas[tag_index])
    return chosen_tags, chosen_probas
def correct_quotes(self, tokens: List[str], tags: List[str], probas: np.array) -> List[str]:
    """
    Tag the text enclosed in quotes as a single entity

    For every pair of matching quotes the probabilities of the enclosed tokens are summed per
    entity tag, and the enclosed tokens are re-tagged with the best tag other than ``PERSON``.

    Args:
        tokens: text tokens list
        tags: tags of the tokens (modified in place)
        probas: probabilities for tokens to belong to particular tags

    Returns:
        the corrected list of tags
    """
    closing_quotes = {"«": "»", '"': '"'}
    i = 0
    while i < len(tokens):
        closing = closing_quotes.get(tokens[i])
        if closing is None:
            i += 1
            continue

        end_pos = next((j for j in range(i + 1, len(tokens)) if tokens[j] == closing), None)
        if end_pos is None:
            # unmatched opening quote
            i += 1
            continue

        if end_pos > i + 1:
            probas_sum = np.sum(probas[i + 1:end_pos], axis=0)
            tags_probas = {tag: sum(probas_sum[ind] for ind in indices)
                           for tag, indices in self.entity_prob_ind.items() if indices}
            candidates = [(tag, score) for tag, score in tags_probas.items() if tag != "PERSON"]
            found_tag = max(candidates, key=lambda item: item[1])[0] if candidates else ""
            if found_tag:
                tags[i + 1] = f"B-{found_tag}"
                for j in range(i + 2, end_pos):
                    tags[j] = f"I-{found_tag}"

        # Resume after the closing quote. Continuing from i + 1 would treat a closing '"'
        # as the next opening quote and tag the text between two quoted spans as an entity.
        i = end_pos + 1
    return tags


def add_entity(self, entity: List[str], c_tag: str) -> None:
    """
    Finish the entity accumulated for the tag and reset the accumulators of this tag

    A leading or trailing punctuation (or guillemet) token is dropped together with its position;
    the remaining tokens are joined, cleaned up and, unless the result is empty or a stopword,
    stored in ``self.ents_pos_probas_dict`` with the token positions and the mean probability.

    Args:
        entity: tokens of the entity
        c_tag: tag of the entity
    """
    # ('  ', ' ') collapses the double spaces produced by the removals before it
    replace_tokens = [(' - ', '-'), ("'s", ''), (' .', '.'), ('{', ''), ('}', ''), ('  ', ' '), ('"', "'"),
                      ('(', ''), (')', ''), (' +', '+')]
    if entity and (entity[-1] in punctuation or entity[-1] == "»"):
        entity = entity[:-1]
        self.ent_pos_dict[c_tag] = self.ent_pos_dict[c_tag][:-1]
    if entity and (entity[0] in punctuation or entity[0] == "«"):
        entity = entity[1:]
        self.ent_pos_dict[c_tag] = self.ent_pos_dict[c_tag][1:]
    entity = ' '.join(entity)
    for old, new in replace_tokens:
        entity = entity.replace(old, new)
    if entity and entity.lower() not in self.stopwords:
        cur_probas = self.ent_probas_dict[c_tag]
        self.ents_pos_probas_dict[c_tag].append((entity, self.ent_pos_dict[c_tag],
                                                 round(sum(cur_probas) / len(cur_probas), 4)))
    self.ent_dict[c_tag] = []
    self.ent_pos_dict[c_tag] = []
    self.ent_probas_dict[c_tag] = []
List[List[float]]) -> Tuple[dict, dict, dict]: """ This method makes lists of substrings corresponding to entities and entity types and a list of indices of tokens which correspond to entities Args: tokens: list of tokens of the text tags: list of tags for tokens tag_probas: list of probabilities of tags Returns: list of entity substrings (or a dict of tags (keys) and entity substrings (values)) list of substrings for entity types list of indices of tokens which correspond to entities (or a dict of tags (keys) and list of indices of entity tokens) """ self.ent_dict = defaultdict(list) self.ent_pos_dict = defaultdict(list) self.ent_probas_dict = defaultdict(list) self.ents_pos_probas_dict = defaultdict(list) cnt = 0 for n, (tok, tag, probas) in enumerate(zip(tokens, tags, tag_probas)): if tag.split('-')[-1] in self.entity_tags: f_tag = tag.split("-")[-1] if tag.startswith("B-") and any(self.ent_dict.values()): for c_tag, entity in self.ent_dict.items(): self.add_entity(entity, c_tag) self.ent_dict[f_tag].append(tok) self.ent_pos_dict[f_tag].append(cnt) self.ent_probas_dict[f_tag].append(probas[self.tags_ind[tag]]) elif any(self.ent_dict.values()): for tag, entity in self.ent_dict.items(): c_tag = tag.split("-")[-1] self.add_entity(entity, c_tag) cnt += 1 if any(self.ent_dict.values()): for tag, entity in self.ent_dict.items(): c_tag = tag.split("-")[-1] self.add_entity(entity, c_tag) self.ents_pos_probas_dict = {tag: elements for tag, elements in self.ents_pos_probas_dict.items() if tag not in self.ignored_tags} for tag in self.ents_pos_probas_dict: ents_pos_proba = self.ents_pos_probas_dict[tag] entities_dict = {tag: [ent[0] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()} entities_positions_dict = {tag: [ent[1] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()} entities_probas_dict = {tag: [ent[2] for ent in ents] for tag, ents in self.ents_pos_probas_dict.items()} log.debug(f"entities_dict {entities_dict}") return 
def __init__(
        self,
        load_path: str,
        entity_ranker=None,
        entities_database_filename: str = None,
        words_dict_filename: str = None,
        ngrams_matrix_filename: str = None,
        num_entities_for_bert_ranking: int = 50,
        num_entities_for_conn_ranking: int = 5,
        num_entities_to_return: int = 10,
        max_text_len: int = 300,
        max_paragraph_len: int = 150,
        lang: str = "ru",
        use_descriptions: bool = True,
        alias_coef: float = 1.1,
        use_tags: bool = False,
        lemmatize: bool = False,
        full_paragraph: bool = False,
        use_connections: bool = False,
        kb_filename: str = None,
        prefixes: Dict[str, Any] = None,
        **kwargs,
) -> None:
    """
    Args:
        load_path: path to folder with inverted index files
        entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert
        entities_database_filename: filename with database with entities index
        words_dict_filename: filename with words and corresponding tags
        ngrams_matrix_filename: filename with char tfidf matrix
        num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description and context
        num_entities_for_conn_ranking: number of candidate entities for ranking using connections in the knowledge
            graph
        num_entities_to_return: number of candidate entities for the substring which are returned
        max_text_len: maximal length of entity context
        max_paragraph_len: maximal length of context paragraphs
        lang: language of the texts, "ru" or "en"
        use_descriptions: whether to perform entity ranking by context and description
        alias_coef: coefficient which is multiplied by the substring matching confidence if the substring is the
            title of the entity
        use_tags: whether to filter candidate entities by tags
        lemmatize: whether to lemmatize tokens
        full_paragraph: whether to use full paragraph for entity context
        use_connections: whether to rank entities by connections in the knowledge graph
        kb_filename: filename with the knowledge base in HDT format
        prefixes: entity and title prefixes
        **kwargs:

    Raises:
        ValueError: if ``lang`` is neither "en" nor "ru"
    """
    super().__init__(save_path=None, load_path=load_path)
    self.lemmatize = lemmatize
    self.num_entities_for_bert_ranking = num_entities_for_bert_ranking
    self.num_entities_for_conn_ranking = num_entities_for_conn_ranking
    self.entity_ranker = entity_ranker
    self.entities_database_filename = entities_database_filename
    self.num_entities_to_return = num_entities_to_return
    self.max_text_len = max_text_len
    self.max_paragraph_len = max_paragraph_len
    self.lang = f"@{lang}"
    if self.lang == "@en":
        self.stopwords = set(stopwords.words("english"))
        self.nlp = spacy.load("en_core_web_sm")
    elif self.lang == "@ru":
        self.stopwords = set(stopwords.words("russian"))
        self.nlp = spacy.load("ru_core_news_sm")
    else:
        # Fail at construction time: without this branch self.stopwords and self.nlp stay
        # undefined and the component breaks later, while linking entities.
        raise ValueError(f'Unsupported lang value: "{lang}". Only "en" and "ru" are allowed.')
    self.alias_coef = alias_coef
    self.use_descriptions = use_descriptions
    self.use_connections = use_connections
    self.use_tags = use_tags
    self.full_paragraph = full_paragraph
    self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
    # tags which are tried in addition to the detected one when looking for candidate entities
    self.related_tags = {
        "loc": ["gpe", "country", "city", "us_state", "river"],
        "gpe": ["loc", "country", "city", "us_state"],
        "work_of_art": ["product", "law"],
        "product": ["work_of_art"],
        "law": ["work_of_art"],
        "org": ["fac", "business"],
        "business": ["org"]
    }
    self.word_searcher = None
    if words_dict_filename:
        self.word_searcher = WordSearcher(words_dict_filename, ngrams_matrix_filename, self.lang)
    self.kb_filename = kb_filename
    self.prefixes = prefixes

    self.load()


def load(self) -> None:
    """Open the sqlite database with the entities index and, if configured, the HDT knowledge base."""
    self.conn = sqlite3.connect(str(self.load_path / self.entities_database_filename))
    self.cur = self.conn.cursor()
    self.kb = None
    if self.kb_filename:
        self.kb = HDTDocument(str(expand_path(self.kb_filename)))


def save(self) -> None:
    """Nothing to save: the component does not change the files it loads."""
    pass
entities_to_link_batch or entities_to_link_batch[0] is None: entities_to_link_batch = [[1 for _ in substr_list] for substr_list in substr_batch] log.debug(f"substr: {substr_batch} --- sentences_batch: {sentences_batch} --- offsets: {offsets_batch}") if (not offsets_batch or offsets_batch[0] is None) and sentences_batch: offsets_batch = [] for substr_list, sentences_list in zip(substr_batch, sentences_batch): text = " ".join(sentences_list).lower() log.debug(f"text {text}") offsets_list = [] for substr in substr_list: st_offset = text.find(substr.lower()) end_offset = st_offset + len(substr) offsets_list.append([st_offset, end_offset]) offsets_batch.append(offsets_list) ids_batch, conf_batch, pages_batch, labels_batch = [], [], [], [] for substr_list, offsets_list, tags_list, probas_list, sentences_list, sentences_offsets_list, \ entities_to_link in zip(substr_batch, offsets_batch, tags_batch, probas_batch, sentences_batch, sentences_offsets_batch, entities_to_link_batch): ids_list, conf_list, pages_list, labels_list = \ self.link_entities(substr_list, offsets_list, tags_list, probas_list, sentences_list, sentences_offsets_list, entities_to_link) log.debug(f"ids_list {ids_list} conf_list {conf_list}") if self.num_entities_to_return == 1: pages_list = [pages[0] for pages in pages_list] else: pages_list = [pages[: len(ids)] for pages, ids in zip(pages_list, ids_list)] ids_batch.append(ids_list) conf_batch.append(conf_list) pages_batch.append(pages_list) labels_batch.append(labels_list) return ids_batch, conf_batch, pages_batch, labels_batch def link_entities( self, substr_list: List[str], offsets_list: List[List[int]], tags_list: List[str], probas_list: List[float], sentences_list: List[str], sentences_offsets_list: List[List[int]], entities_to_link: List[int] ) -> Tuple[List[Any], List[Any], List[List[Union[str, Any]]], List[List[Union[str, Any]]]]: log.debug(f"substr_list {substr_list} tags_list {tags_list} probas {probas_list} offsets_list {offsets_list}") 
ids_list, conf_list, pages_list, label_list, descr_list = [], [], [], [], [] if substr_list: entities_scores_list = [] cand_ent_scores_list = [] for substr, tags, proba in zip(substr_list, tags_list, probas_list): for old_symb, new_symb in [("'s", ""), ("@", ""), (" ", " "), (".", ""), (",", ""), ("-", " "), ("'", " "), ("!", ""), (":", ""), ("&", ""), ("/", " "), ('"', ""), (" ", " ")]: substr = substr.replace(old_symb, new_symb) substr = substr.strip() cand_ent_init = defaultdict(set) if len(substr) > 1: if isinstance(tags, str): tags = [tags] tags = [tag.lower() for tag in tags] if tags and not isinstance(tags[0], (list, tuple)): tags = [(tag, 1.0) for tag in tags] if tags and tags[0][0] == "e": use_tags_flag = False else: use_tags_flag = True cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag) new_substr = re.sub(r"\b([a-z]{1}) ([a-z]{1})\b", r"\1\2", substr) if substr != new_substr: new_cand_ent_init = self.find_exact_match(new_substr, tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) init_substr_split = substr.lower().split(" ") if tags[0][0] in {"person", "work_of_art"}: substr_split = [word for word in substr.lower().split(" ") if len(word) > 0] else: substr_split = [word for word in substr.lower().split(" ") if word not in self.stopwords and len(word) > 0] substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in substr_split] substr_lemm = " ".join(substr_split_lemm) if substr_split != substr_split_lemm \ or (tags[0][0] == "work_of_art" and len(substr_split) != len(init_substr_split)): new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) if substr_split != substr_split_lemm: new_cand_ent_init = self.find_exact_match(substr_lemm, tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags, 
use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) all_low_conf = self.define_all_low_conf(cand_ent_init, 1.0) clean_tags, corr_tags, corr_clean_tags = self.correct_tags(tags) log.debug(f"substr: {substr} --- lemm: {substr_split_lemm} --- tags: {tags} --- corr_tags: " f"{corr_tags} --- all_low_conf: {all_low_conf} --- cand_ent_init: {len(cand_ent_init)}") if (not cand_ent_init or all_low_conf) and corr_tags: corr_cand_ent_init = self.find_exact_match(substr, corr_tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, corr_cand_ent_init) if substr_split != substr_split_lemm: new_cand_ent_init = self.find_exact_match(substr_lemm, corr_tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, corr_tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) if not cand_ent_init and len(substr_split) == 1 and self.word_searcher: corr_words = self.word_searcher(substr_split[0], set(clean_tags + corr_clean_tags)) if corr_words: cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags, use_tags=use_tags_flag) if not cand_ent_init and len(substr_split) > 1: cand_ent_init = self.find_fuzzy_match(substr_split, tags) all_low_conf = self.define_all_low_conf(cand_ent_init, 0.85) if (not cand_ent_init or all_low_conf) and tags[0][0] != "t": use_tags_flag = False new_cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) if substr_split != substr_split_lemm and (tags[0][0] == "e" or not cand_ent_init): new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag) cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init) new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags, use_tags=use_tags_flag) cand_ent_init = 
self.unite_dicts(cand_ent_init, new_cand_ent_init) cand_ent_scores = [] for entity in cand_ent_init: entities_scores = list(cand_ent_init[entity]) entities_scores = sorted(entities_scores, key=lambda x: (x[0], x[2], x[1]), reverse=True) cand_ent_scores.append(([entity] + list(entities_scores[0]))) cand_ent_scores = sorted(cand_ent_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True) cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking] cand_ent_scores_list.append(cand_ent_scores) entity_ids = [elem[0] for elem in cand_ent_scores] scores = [elem[1:4] for elem in cand_ent_scores] conf_list.append(scores) entities_scores_list.append( {entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)} ) ids_list.append(entity_ids) pages = [elem[4] for elem in cand_ent_scores] entity_labels = [elem[5] for elem in cand_ent_scores] pages_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)}) label_list.append( {entity_id: entity_label for entity_id, entity_label in zip(entity_ids, entity_labels)}) descr_list.append([elem[6] for elem in cand_ent_scores]) scores_dict = {} if self.use_connections and self.kb: scores_dict = self.rank_by_connections(ids_list) substr_lens = [len(entity_substr.split()) for entity_substr in substr_list] ids_list, conf_list = self.rank_by_description(substr_list, tags_list, offsets_list, ids_list, descr_list, entities_scores_list, sentences_list, sentences_offsets_list, substr_lens, scores_dict) label_list = [[label_dict.get(entity_id, "") for entity_id in entity_ids] for entity_ids, label_dict in zip(ids_list, label_list)] pages_list = [[pages_dict.get(entity_id, "") for entity_id in entity_ids] for entity_ids, pages_dict in zip(ids_list, pages_list)] f_ids_list, f_conf_list, f_pages_list, f_label_list = [], [], [], [] for ids, confs, pages, labels, add_flag in \ zip(ids_list, conf_list, pages_list, label_list, entities_to_link): if add_flag: f_ids_list.append(ids) 
def define_all_low_conf(self, cand_ent_init, thres):
    """Return True when no candidate has a substring matching score (first element) of at least ``thres``."""
    return not any(entity_info[0] >= thres
                   for entity_info_set in cand_ent_init.values()
                   for entity_info in entity_info_set)


def correct_tags(self, tags):
    """Collect the tags related (via ``self.related_tags``) to the given ones.

    Args:
        tags: pairs of (tag, confidence)

    Returns:
        the given tags without confidences,
        related tags not already present, as [tag, confidence of the source tag] pairs,
        the same related tags without confidences
    """
    clean_tags = [tag for tag, _ in tags]
    known_tags = set(clean_tags)
    corr_tags, corr_clean_tags = [], []
    for tag, conf in tags:
        for related_tag in self.related_tags.get(tag, ()):
            if related_tag in known_tags:
                continue
            known_tags.add(related_tag)
            corr_tags.append([related_tag, conf])
            corr_clean_tags.append(related_tag)
    return clean_tags, corr_tags, corr_clean_tags


def unite_dicts(self, cand_ent_init, new_cand_ent_init):
    """Merge ``new_cand_ent_init`` into ``cand_ent_init`` in place and return the latter."""
    for entity_id, entity_info_set in new_cand_ent_init.items():
        if entity_id in cand_ent_init:
            cand_ent_init[entity_id].update(entity_info_set)
        else:
            # the set object itself is shared with the new dict, not copied
            cand_ent_init[entity_id] = entity_info_set
    return cand_ent_init


def process_cand_ent(self, cand_ent_init, entities_and_ids, substr_split, tag, tag_conf, use_tags):
    """Score the database rows against the substring and add them to the candidates.

    When ``use_tags`` is set, rows whose tag differs from ``tag`` are skipped.
    """
    for title, entity_id, rels, ent_tag, page, label, descr in entities_and_ids:
        if use_tags and ent_tag != tag:
            continue
        score = self.calc_substr_score(title, substr_split, tag, ent_tag, label)
        cand_ent_init[entity_id].add((score, rels, tag_conf, page, label, descr))
    return cand_ent_init


def sanitize_substr(self, entity_substr, tag):
    """For a "person" substring ending with a one-character word followed by a longer word, keep the last word."""
    if tag != "person":
        return entity_substr
    words = entity_substr.split()
    if len(words) > 1 and len(words[-1]) > 1 and len(words[-2]) == 1:
        return words[-1]
    return entity_substr
def find_fuzzy_match(self, entity_substr_split, tags, use_tags=True):
    """Find candidate entities whose titles match separate words of the substring.

    Substrings longer than three words are searched by pairs of adjacent words instead of
    single words. One-character words and stopwords are not searched.

    Args:
        entity_substr_split: words of the entity substring
        tags: pairs of (tag, confidence) of the substring
        use_tags: whether to keep only the entities with the same tag

    Returns:
        dict mapping entity ids to sets of candidate info tuples
    """
    cand_ent_init = defaultdict(set)

    # Build the search terms once. Doing it inside the loop over tags turned the
    # bigrams of a long substring into bigrams of bigrams from the second tag on.
    if len(entity_substr_split) > 3:
        entity_substr_split = [" ".join(entity_substr_split[i:i + 2])
                               for i in range(len(entity_substr_split) - 1)]

    query = "SELECT * FROM inverted_index WHERE title MATCH ?;"
    # the query result does not depend on the tag, so every word is looked up only once
    found_by_word = {}
    for tag, tag_conf in tags:
        for word in entity_substr_split:
            if len(word) <= 1 or word in self.stopwords:
                continue
            if word not in found_by_word:
                part_entities_and_ids = []
                try:
                    res = self.cur.execute(query, (word,))
                    part_entities_and_ids = res.fetchall()
                except sqlite3.Error:
                    # best effort: a failed lookup just yields no candidates for this word
                    log.info(f"error in query execute {query}")
                found_by_word[word] = part_entities_and_ids
            part_entities_and_ids = found_by_word[word]
            if part_entities_and_ids:
                cand_ent_init = self.process_cand_ent(
                    cand_ent_init, part_entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)
    return cand_ent_init
(fuzz_score >= 75.0 or c_long_toks or c_shrt_toks) and not found: cnt += fuzz_score * 0.01 break substr_score = round(cnt / max(len(label_tokens), len(entity_substr_split)), 3) if len(label_tokens) == 2 and len(entity_substr_split) == 1: if entity_substr_split[0] == label_tokens[1]: substr_score = 0.5 elif entity_substr_split[0] == label_tokens[0]: substr_score = 0.3 return substr_score def correct_substr_score(self, entity_substr_split, label_tokens, substr_score): if sum([len(tok) == 1 for tok in entity_substr_split]) == 2 and len(label_tokens) >= 2 \ and any([(len(tok) == 2 and re.findall(r"[a-z]{2}", tok)) for tok in label_tokens]): new_label_tokens = [] for tok in label_tokens: if len(tok) == 2 and re.findall(r"[a-z]{2}", tok): new_label_tokens.append(tok[0]) new_label_tokens.append(tok[1]) else: new_label_tokens.append(tok) label_tokens = new_label_tokens if any([re.findall(r"[\d]{4}", tok) for tok in entity_substr_split]) \ and any([re.findall(r"[\d]{4}–[\d]{2}", tok) for tok in label_tokens]): new_label_tokens = [] for tok in label_tokens: if re.findall(r"[\d]{4}–[\d]{2}", tok): new_label_tokens.append(tok[:4]) new_label_tokens.append(tok[5:]) else: new_label_tokens.append(tok) label_tokens = new_label_tokens new_substr_score = self.match_tokens(entity_substr_split, label_tokens) substr_score = max(substr_score, new_substr_score) return substr_score def calc_substr_score(self, entity_title, entity_substr_split, tag, ent_tag, entity_label): if self.lang == "@ru": entity_title = entity_title.replace("ё", "е") label_tokens = entity_title.split() substr_score = self.match_tokens(entity_substr_split, label_tokens) substr_score = self.correct_substr_score(entity_substr_split, label_tokens, substr_score) if re.findall(r" \(.*\)", entity_label): entity_label_split = entity_label.replace("(", "").replace(")", "").lower().split() lbl_substr_score = self.match_tokens(entity_substr_split, entity_label_split) substr_score = max(substr_score, lbl_substr_score) if tag == 
ent_tag and tag.lower() == "person" and len(entity_substr_split) > 1 \ and len(entity_substr_split[-1]) > 1 and len(entity_substr_split[-2]) == 1 \ and len(label_tokens) == len(entity_substr_split): cnt = 0.0 for j in range(len(label_tokens) - 1): if label_tokens[j][0] == entity_substr_split[j][0]: cnt += 1.0 if label_tokens[-1] == entity_substr_split[-1]: cnt += 1.0 new_substr_score = cnt / len(label_tokens) substr_score = max(substr_score, new_substr_score) if entity_title.lower() == entity_label.lower() and substr_score == 1.0: substr_score = substr_score * self.alias_coef return substr_score def rank_by_description( self, entity_substr_list: List[str], tags_list: List[str], entity_offsets_list: List[List[int]], cand_ent_list: List[List[str]], cand_ent_descr_list: List[List[str]], entities_scores_list: List[Dict[str, Tuple[int, float]]], sentences_list: List[str], sentences_offsets_list: List[Tuple[int, int]], substr_lens: List[int], scores_dict: Dict[str, int] = None ) -> Tuple[List[Union[Union[float, List[Any], List[Union[float, Any]]], Any]], List[ Union[Union[tuple, List[tuple], List[Any], List[Tuple[Union[float, Any], ...]]], Any]]]: entity_ids_list = [] conf_list = [] contexts = [] for entity_offset in entity_offsets_list: context, sentence = "", "" if len(entity_offset) == 2: entity_start_offset, entity_end_offset = entity_offset rel_start_offset = 0 rel_end_offset = 0 found_sentence_num = 0 for num, (sent, (sent_start_offset, sent_end_offset)) in enumerate( zip(sentences_list, sentences_offsets_list) ): if entity_start_offset >= sent_start_offset and entity_end_offset <= sent_end_offset: sentence = sent found_sentence_num = num rel_start_offset = entity_start_offset - sent_start_offset rel_end_offset = entity_end_offset - sent_start_offset break if sentence: start_of_sentence = 0 end_of_sentence = len(sentence) if len(sentence) > self.max_text_len: start_of_sentence = max(rel_start_offset - self.max_text_len // 2, 0) end_of_sentence = min(rel_end_offset 
+ self.max_text_len // 2, len(sentence)) text_before = sentence[start_of_sentence:rel_start_offset] text_after = sentence[rel_end_offset:end_of_sentence] context = text_before + "[ENT]" + text_after if self.full_paragraph: cur_sent_len = len(re.findall(self.re_tokenizer, context)) first_sentence_num = found_sentence_num last_sentence_num = found_sentence_num context = [context] while True: added = False if last_sentence_num < len(sentences_list) - 1: sentence_tokens = re.findall(self.re_tokenizer, sentences_list[last_sentence_num + 1]) last_sentence_len = len(sentence_tokens) if cur_sent_len + last_sentence_len < self.max_paragraph_len: context.append(sentences_list[last_sentence_num + 1]) cur_sent_len += last_sentence_len last_sentence_num += 1 added = True if first_sentence_num > 0: sentence_tokens = re.findall(self.re_tokenizer, sentences_list[first_sentence_num - 1]) first_sentence_len = len(sentence_tokens) if cur_sent_len + first_sentence_len < self.max_paragraph_len: context = [sentences_list[first_sentence_num - 1]] + context cur_sent_len += first_sentence_len first_sentence_num -= 1 added = True if not added: break context = " ".join(context) log.debug(f"rank, context: {context}") contexts.append(context) if self.use_descriptions: scores_list = self.entity_ranker(contexts, cand_ent_list, cand_ent_descr_list) else: scores_list = [[(entity_id, 1.0) for entity_id in cand_ent] for cand_ent in cand_ent_list] for entity_substr, tag, context, candidate_entities, substr_len, entities_scores, scores in zip( entity_substr_list, tags_list, contexts, cand_ent_list, substr_lens, entities_scores_list, scores_list ): entities_with_scores = [] max_conn_score = 0 if scores_dict and scores: max_conn_score = max([scores_dict.get(entity, 0) for entity, _ in scores]) for entity, score in scores: substr_score = round(entities_scores.get(entity, (0.0, 0))[0], 2) num_rels = entities_scores.get(entity, (0.0, 0))[1] if len(context.split()) < 4: score = 0.95 elif scores_dict and 0 < 
max_conn_score == scores_dict.get(entity, 0): score = 1.0 num_rels = 200 entities_with_scores.append((entity, substr_score, num_rels, float(score))) if tag == "t": entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True) else: entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True) log.debug(f"{entity_substr} --- tag: {tag} --- entities_with_scores: {entities_with_scores}") if not entities_with_scores: top_entities = [] top_conf = [] elif entities_with_scores and substr_len == 1 and entities_with_scores[0][1] < 1.0: top_entities = [] top_conf = [] elif entities_with_scores and ( entities_with_scores[0][1] < 0.3 or (entities_with_scores[0][3] < 0.13 and entities_with_scores[0][2] < 20) or (entities_with_scores[0][3] < 0.3 and entities_with_scores[0][2] < 4) or entities_with_scores[0][1] < 0.6 ): top_entities = [] top_conf = [] else: top_entities = [score[0] for score in entities_with_scores] top_conf = [score[1:] for score in entities_with_scores] high_conf_entities = [] high_conf_nums = [] for elem_num, (entity, conf) in enumerate(zip(top_entities, top_conf)): if len(conf) == 3 and conf[0] >= 1.0 and conf[1] > 50 and conf[2] > 0.3: new_conf = list(conf) if new_conf[1] > 55: new_conf[2] = 1.0 new_conf = tuple(new_conf) high_conf_entities.append((entity,) + new_conf) high_conf_nums.append(elem_num) high_conf_entities = sorted(high_conf_entities, key=lambda x: (x[1], x[3], x[2]), reverse=True) log.debug(f"high_conf_entities: {high_conf_entities}") for n, elem_num in enumerate(high_conf_nums): if 0 <= elem_num - n < len(top_entities): del top_entities[elem_num - n] del top_conf[elem_num - n] top_entities = [elem[0] for elem in high_conf_entities] + top_entities top_conf = [elem[1:] for elem in high_conf_entities] + top_conf if not top_entities: entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True) top_entities = [score[0] for score in 
entities_with_scores] top_conf = [score[1:] for score in entities_with_scores] if self.num_entities_to_return == 1 and top_entities: entity_ids_list.append(top_entities[0]) conf_list.append([round(cnf, 2) for cnf in top_conf[0]]) elif self.num_entities_to_return == "max": if top_conf: max_conf = top_conf[0][0] max_rank_conf = top_conf[0][2] entity_ids, confs = [], [] for entity_id, conf in zip(top_entities, top_conf): if (conf[0] >= max_conf * 0.9 and max_rank_conf <= 1.0) \ or (max_rank_conf == 1.0 and conf[2] == 1.0): entity_ids.append(entity_id) confs.append([round(cnf, 2) for cnf in conf]) entity_ids_list.append(entity_ids) conf_list.append(confs) else: entity_ids_list.append([]) conf_list.append([]) else: entity_ids_list.append(top_entities[: self.num_entities_to_return]) conf_list.append([[round(cnf, 2) for cnf in conf] for conf in top_conf[: self.num_entities_to_return]]) log.debug(f"{entity_substr} --- top entities {entity_ids_list[-1]} --- top_conf {conf_list[-1]}") return entity_ids_list, conf_list def sort_out_low_conf(self, entity_substr, top_entities, top_conf): if len(entity_substr.split()) > 1 and top_conf: f_top_entities, f_top_conf = [], [] for top_conf_thres, conf_thres in [(1.0, 0.9), (0.9, 0.8)]: if top_conf[0][0] >= top_conf_thres: for ent, conf in zip(top_entities, top_conf): if conf[0] > conf_thres: f_top_entities.append(ent) f_top_conf.append(conf) return f_top_entities, f_top_conf return top_entities, top_conf def rank_by_connections(self, ids_list): objects_sets_dict, scores_dict, conn_dict = {}, {}, {} for ids in ids_list: for entity_id in ids: scores_dict[entity_id] = 0 conn_dict[entity_id] = set() for ids in ids_list: for entity_id in ids[:self.num_entities_for_conn_ranking]: objects = set() for prefix in self.prefixes["entity"]: tr, _ = self.kb.search_triples(f"{prefix}/{entity_id}", "", "") for subj, rel, obj in tr: if rel.split("/")[-1] not in {"P31", "P279"}: if any([obj.startswith(pr) for pr in self.prefixes["entity"]]): 
objects.add(obj.split("/")[-1]) if rel.startswith(self.prefixes["rels"]["no_type"]): tr2, _ = self.kb.search_triples(obj, "", "") for _, rel2, obj2 in tr2: if rel2.startswith(self.prefixes["rels"]["statement"]) \ or rel2.startswith(self.prefixes["rels"]["qualifier"]): if any([obj2.startswith(pr) for pr in self.prefixes["entity"]]): objects.add(obj2.split("/")[-1]) objects_sets_dict[entity_id] = objects for obj in objects: if obj not in objects_sets_dict: objects_sets_dict[obj] = set() objects_sets_dict[obj].add(entity_id) for i in range(len(ids_list)): for j in range(len(ids_list)): if i != j: for entity_id1 in ids_list[i][:self.num_entities_for_conn_ranking]: for entity_id2 in ids_list[j][:self.num_entities_for_conn_ranking]: if entity_id1 in objects_sets_dict[entity_id2]: conn_dict[entity_id1].add(entity_id2) conn_dict[entity_id2].add(entity_id1) for entity_id in conn_dict: scores_dict[entity_id] = len(conn_dict[entity_id]) return scores_dict ================================================ FILE: deeppavlov/models/entity_extraction/find_word.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import itertools import pickle from collections import Counter import numpy as np import scipy as sp from deeppavlov.core.commands.utils import expand_path Sparse = sp.sparse.csr_matrix class WordSearcher: def __init__(self, words_dict_filename: str, ngrams_matrix_filename: str, lang: str = "@en", thresh: int = 1000): self.words_dict_filename = words_dict_filename self.ngrams_matrix_filename = ngrams_matrix_filename if lang == "@en": self.letters = "abcdefghijklmnopqrstuvwxyz" elif lang == "@ru": self.letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя" else: raise ValueError(f'Unexpected lang value: "{lang}"') self.thresh = thresh self.load() self.make_ngrams_dicts() def load(self): with open(str(expand_path(self.words_dict_filename)), "rb") as fl: self.words_dict = pickle.load(fl) words_list = list(self.words_dict.keys()) self.words_list = sorted(words_list) loader = np.load(str(expand_path(self.ngrams_matrix_filename)), allow_pickle=True) self.count_matrix = Sparse((loader["data"], loader["indices"], loader["indptr"]), shape=loader["shape"]) def make_ngrams_dicts(self): self.bigrams_dict, self.trigrams_dict = {}, {} bigram_combs = list(itertools.product(self.letters, self.letters)) bigram_combs = ["".join(comb) for comb in bigram_combs] trigram_combs = list(itertools.product(self.letters, self.letters, self.letters)) trigram_combs = ["".join(comb) for comb in trigram_combs] for cnt, bigram in enumerate(bigram_combs): self.bigrams_dict[bigram] = cnt for cnt, trigram in enumerate(trigram_combs): self.trigrams_dict[trigram] = cnt + len(bigram_combs) def __call__(self, query, tags): ngrams_list = [] for i in range(len(query) - 1): ngram = query[i : i + 2].lower() if ngram in self.bigrams_dict: ngram_id = self.bigrams_dict[ngram] ngrams_list.append(ngram_id) for i in range(len(query) - 2): ngram = query[i : i + 3].lower() if ngram in self.trigrams_dict: ngram_id = self.trigrams_dict[ngram] ngrams_list.append(ngram_id) ngrams_with_cnts = Counter(ngrams_list).most_common() 
ngram_ids = [elem[0] for elem in ngrams_with_cnts] ngram_cnts = [1 for _ in ngrams_with_cnts] indptr = np.array([0, len(ngram_cnts)]) query_matrix = Sparse( (ngram_cnts, ngram_ids, indptr), shape=(1, len(self.bigrams_dict) + len(self.trigrams_dict)) ) scores = query_matrix * self.count_matrix scores = np.squeeze(scores.toarray()) if self.thresh >= len(scores): o = np.argpartition(-scores, len(scores) - 1)[0:self.thresh] else: o = np.argpartition(-scores, self.thresh)[0:self.thresh] o_sort = o[np.argsort(-scores[o])] o_sort = o_sort.tolist() found_words = [self.words_list[n] for n in o_sort] found_words = [ word for word in found_words if ( word.startswith(query[0]) and abs(len(word) - len(query)) < 3 and self.words_dict[word].intersection(tags) ) ] return found_words ================================================ FILE: deeppavlov/models/entity_extraction/ner_chunker.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import re from logging import getLogger from string import punctuation from typing import List, Tuple, Union, Any from nltk import sent_tokenize from transformers import AutoTokenizer from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.core.common.chainer import Chainer from deeppavlov.models.entity_extraction.entity_detection_parser import EntityDetectionParser log = getLogger(__name__) @register('ner_chunker') class NerChunker(Component): """ Class to split documents into chunks of max_seq_len symbols so that the length will not exceed maximal sequence length to feed into BERT """ def __init__(self, vocab_file: str, max_seq_len: int = 400, lowercase: bool = False, batch_size: int = 2, **kwargs): """ Args: vocab_file: vocab file of pretrained transformer model max_seq_len: maximal length of chunks into which the document is split lowercase: whether to lowercase text batch_size: how many chunks are in batch """ self.max_seq_len = max_seq_len self.batch_size = batch_size self.re_tokenizer = re.compile(r"[\w']+|[^\w ]") self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=True) self.punct_ext = punctuation + " " + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" self.russian_letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя" self.lowercase = lowercase def __call__(self, docs_batch: List[str]) -> Tuple[List[List[str]], List[List[int]], List[List[Union[ List[Union[Tuple[int, int], Tuple[Union[int, Any], Union[int, Any]]]], List[ Tuple[Union[int, Any], Union[int, Any]]], List[Tuple[int, int]]]]], List[List[Union[List[Any], List[str]]]], List[List[str]]]: """ This method splits each document in the batch into chunks wuth the maximal length of max_seq_len Args: docs_batch: batch of documents Returns: batch of lists of document chunks for each document batch of lists of numbers of documents which correspond to chunks """ text_batch_list, nums_batch_list, 
sentences_offsets_batch_list, sentences_batch_list = [], [], [], [] text_batch, nums_batch, sentences_offsets_batch, sentences_batch = [], [], [], [] for n, doc in enumerate(docs_batch): if self.lowercase: doc = doc.lower() start = 0 text = "" sentences_list = [] sentences_offsets_list = [] cur_len = 0 doc_pieces = doc.split("\n") doc_pieces = [self.sanitize(doc_piece) for doc_piece in doc_pieces] doc_pieces = [doc_piece for doc_piece in doc_pieces if len(doc_piece) > 1] if doc_pieces: sentences = [] for doc_piece in doc_pieces: sentences += sent_tokenize(doc_piece) for sentence in sentences: sentence_tokens = re.findall(self.re_tokenizer, sentence) sentence_len = sum([len(self.tokenizer.encode_plus(token, add_special_tokens=False)["input_ids"]) for token in sentence_tokens]) if cur_len + sentence_len < self.max_seq_len: text += f"{sentence} " cur_len += sentence_len end = start + len(sentence) sentences_offsets_list.append((start, end)) sentences_list.append(sentence) start = end + 1 else: text = text.strip() if text: text_batch.append(text) sentences_offsets_batch.append(sentences_offsets_list) sentences_batch.append(sentences_list) nums_batch.append(n) if sentence_len < self.max_seq_len: text = f"{sentence} " cur_len = sentence_len start = 0 end = start + len(sentence) sentences_offsets_list = [(start, end)] sentences_list = [sentence] start = end + 1 else: text = "" sentence_chunks = sentence.split(" ") for chunk in sentence_chunks: chunk_tokens = re.findall(self.re_tokenizer, chunk) chunk_len = sum([len(self.tokenizer.encode_plus(token, add_special_tokens=False)["input_ids"]) for token in chunk_tokens]) if cur_len + chunk_len < self.max_seq_len: text += f"{chunk} " cur_len += chunk_len + 1 end = start + len(chunk) sentences_offsets_list.append((start, end)) sentences_list.append(chunk) start = end + 1 else: text = text.strip() if text: text_batch.append(text) sentences_offsets_batch.append(sentences_offsets_list) sentences_batch.append(sentences_list) 
nums_batch.append(n) text = f"{chunk} " cur_len = chunk_len start = 0 end = start + len(chunk) sentences_offsets_list = [(start, end)] sentences_list = [chunk] start = end + 1 text = text.strip().strip(",") if text: text_batch.append(text) nums_batch.append(n) sentences_offsets_batch.append(sentences_offsets_list) sentences_batch.append(sentences_list) else: text_batch.append("а") nums_batch.append(n) sentences_offsets_batch.append([(0, len(doc))]) sentences_batch.append([doc]) num_batches = len(text_batch) // self.batch_size + int(len(text_batch) % self.batch_size > 0) for jj in range(num_batches): text_batch_list.append(text_batch[jj * self.batch_size:(jj + 1) * self.batch_size]) nums_batch_list.append(nums_batch[jj * self.batch_size:(jj + 1) * self.batch_size]) sentences_offsets_batch_list.append( sentences_offsets_batch[jj * self.batch_size:(jj + 1) * self.batch_size]) sentences_batch_list.append(sentences_batch[jj * self.batch_size:(jj + 1) * self.batch_size]) return text_batch_list, nums_batch_list, sentences_offsets_batch_list, sentences_batch_list def sanitize(self, text): text_len = len(text) if text_len > 0 and text[text_len - 1] not in {'.', '!', '?'}: i = text_len - 1 while text[i] in self.punct_ext and i > 0: i -= 1 if (text[i] in {'.', '!', '?'} and text[i - 1].lower() in self.russian_letters) or \ (i > 1 and text[i] in {'.', '!', '?'} and text[i - 1] in '"' and text[ i - 2].lower() in self.russian_letters): break text = text[:i + 1] text = re.sub(r'\s+', ' ', text) return text @register('ner_chunk_model') class NerChunkModel(Component): """ Class for linking of entity substrings in the document to entities in Wikidata """ def __init__(self, ner: Chainer, ner_parser: EntityDetectionParser, ner2: Chainer = None, ner_parser2: EntityDetectionParser = None, **kwargs) -> None: """ Args: ner: config for entity detection ner_parser: component deeppavlov.models.entity_extraction.entity_detection_parser ner2: config of additional entity detection model 
(ensemble of ner and ner2 models gives better entity detection quality than single ner model) ner_parser2: component deeppavlov.models.entity_extraction.entity_detection_parser **kwargs: """ self.ner = ner self.ner_parser = ner_parser self.ner2 = ner2 self.ner_parser2 = ner_parser2 def __call__(self, text_batch_list: List[List[str]], nums_batch_list: List[List[int]], sentences_offsets_batch_list: List[List[List[Tuple[int, int]]]], sentences_batch_list: List[List[List[str]]] ): """ Args: text_batch_list: list of document chunks nums_batch_list: nums of documents sentences_offsets_batch_list: indices of start and end symbols of sentences in text sentences_batch_list: list of sentences from texts Returns: doc_entity_substr_batch: entity substrings doc_entity_offsets_batch: indices of start and end symbols of entities in text doc_tags_batch: entity tags (PER, LOC, ORG) doc_sentences_offsets_batch: indices of start and end symbols of sentences in text doc_sentences_batch: list of sentences from texts """ entity_substr_batch_list, entity_offsets_batch_list, entity_positions_batch_list, tags_batch_list, \ entity_probas_batch_list, text_len_batch_list, text_tokens_len_batch_list = [], [], [], [], [], [], [] for text_batch, sentences_offsets_batch, sentences_batch in \ zip(text_batch_list, sentences_offsets_batch_list, sentences_batch_list): text_batch = [text.replace("\xad", " ") for text in text_batch] ner_tokens_batch, ner_tokens_offsets_batch, ner_probas_batch, probas_batch = self.ner(text_batch) entity_substr_batch, entity_positions_batch, entity_probas_batch = \ self.ner_parser(ner_tokens_batch, ner_probas_batch, probas_batch) if self.ner2: ner_tokens_batch2, ner_tokens_offsets_batch2, ner_probas_batch2, probas_batch2 = self.ner2(text_batch) entity_substr_batch2, entity_positions_batch2, entity_probas_batch2 = \ self.ner_parser2(ner_tokens_batch2, ner_probas_batch2, probas_batch2) entity_substr_batch, entity_positions_batch, entity_probas_batch = \ 
self.merge_annotations(entity_substr_batch, entity_positions_batch, entity_probas_batch, entity_substr_batch2, entity_positions_batch2, entity_probas_batch2) entity_pos_tags_probas_batch = [[(entity_substr.lower(), entity_substr_positions, tag, entity_proba) for tag, entity_substr_list in entity_substr_dict.items() for entity_substr, entity_substr_positions, entity_proba in zip(entity_substr_list, entity_positions_dict[tag], entity_probas_dict[tag])] for entity_substr_dict, entity_positions_dict, entity_probas_dict in zip(entity_substr_batch, entity_positions_batch, entity_probas_batch)] entity_substr_batch, entity_offsets_batch, entity_positions_batch, tags_batch, \ probas_batch = [], [], [], [], [] for entity_pos_tags_probas, ner_tokens_offsets_list in \ zip(entity_pos_tags_probas_batch, ner_tokens_offsets_batch): if entity_pos_tags_probas: entity_offsets_list = [] entity_substr_list, entity_positions_list, tags_list, probas_list = zip(*entity_pos_tags_probas) for entity_positions in entity_positions_list: start_offset = ner_tokens_offsets_list[entity_positions[0]][0] end_offset = ner_tokens_offsets_list[entity_positions[-1]][1] entity_offsets_list.append((start_offset, end_offset)) else: entity_substr_list, entity_offsets_list, entity_positions_list = [], [], [] tags_list, probas_list = [], [] entity_substr_batch.append(list(entity_substr_list)) entity_offsets_batch.append(list(entity_offsets_list)) entity_positions_batch.append(list(entity_positions_list)) tags_batch.append(list(tags_list)) probas_batch.append(list(probas_list)) entity_substr_batch_list.append(entity_substr_batch) tags_batch_list.append(tags_batch) entity_offsets_batch_list.append(entity_offsets_batch) entity_positions_batch_list.append(entity_positions_batch) entity_probas_batch_list.append(probas_batch) text_len_batch_list.append([len(text) for text in text_batch]) text_tokens_len_batch_list.append([len(ner_tokens) for ner_tokens in ner_tokens_batch]) doc_entity_substr_batch, doc_tags_batch, 
doc_entity_offsets_batch, doc_probas_batch = [], [], [], [] doc_entity_positions_batch, doc_sentences_offsets_batch, doc_sentences_batch = [], [], [] doc_entity_substr, doc_tags, doc_probas, doc_entity_offsets, doc_entity_positions = [], [], [], [], [] doc_sentences_offsets, doc_sentences = [], [] cur_doc_num = 0 text_len_sum = 0 text_tokens_len_sum = 0 for entity_substr_batch, tags_batch, probas_batch, entity_offsets_batch, entity_positions_batch, \ sentences_offsets_batch, sentences_batch, text_len_batch, text_tokens_len_batch, nums_batch in \ zip(entity_substr_batch_list, tags_batch_list, entity_probas_batch_list, entity_offsets_batch_list, entity_positions_batch_list, sentences_offsets_batch_list, sentences_batch_list, text_len_batch_list, text_tokens_len_batch_list, nums_batch_list): for entity_substr_list, tag_list, probas_list, entity_offsets_list, entity_positions_list, \ sentences_offsets_list, sentences_list, text_len, text_tokens_len, doc_num in \ zip(entity_substr_batch, tags_batch, probas_batch, entity_offsets_batch, entity_positions_batch, sentences_offsets_batch, sentences_batch, text_len_batch, text_tokens_len_batch, nums_batch): if doc_num == cur_doc_num: doc_entity_substr += entity_substr_list doc_tags += tag_list doc_probas += probas_list doc_entity_offsets += [(start_offset + text_len_sum, end_offset + text_len_sum) for start_offset, end_offset in entity_offsets_list] doc_sentences_offsets += [(start_offset + text_len_sum, end_offset + text_len_sum) for start_offset, end_offset in sentences_offsets_list] doc_entity_positions += [[pos + text_tokens_len_sum for pos in positions] for positions in entity_positions_list] doc_sentences += sentences_list text_len_sum += text_len + 1 text_tokens_len_sum += text_tokens_len else: doc_entity_substr_batch.append(doc_entity_substr) doc_tags_batch.append(doc_tags) doc_probas_batch.append(doc_probas) doc_entity_offsets_batch.append(doc_entity_offsets) doc_entity_positions_batch.append(doc_entity_positions) 
doc_sentences_offsets_batch.append(doc_sentences_offsets) doc_sentences_batch.append(doc_sentences) doc_entity_substr = entity_substr_list doc_tags = tag_list doc_probas = probas_list doc_entity_offsets = entity_offsets_list doc_sentences_offsets = sentences_offsets_list doc_sentences = sentences_list cur_doc_num = doc_num text_len_sum = text_len + 1 text_tokens_len_sum = text_tokens_len doc_entity_substr_batch.append(doc_entity_substr) doc_tags_batch.append(doc_tags) doc_probas_batch.append(doc_probas) doc_entity_offsets_batch.append(doc_entity_offsets) doc_entity_positions_batch.append(doc_entity_positions) doc_sentences_offsets_batch.append(doc_sentences_offsets) doc_sentences_batch.append(doc_sentences) return doc_entity_substr_batch, doc_entity_offsets_batch, doc_entity_positions_batch, doc_tags_batch, \ doc_sentences_offsets_batch, doc_sentences_batch, doc_probas_batch def merge_annotations(self, substr_batch, pos_batch, probas_batch, substr_batch2, pos_batch2, probas_batch2): log.debug(f"ner_chunker, substr2: {substr_batch2} --- pos2: {pos_batch2} --- probas2: {probas_batch2} --- " f"substr: {substr_batch} --- pos: {pos_batch} --- probas: {probas_batch}") for i in range(len(substr_batch)): for key2 in substr_batch2[i]: substr_list2 = substr_batch2[i][key2] pos_list2 = pos_batch2[i][key2] probas_list2 = probas_batch2[i][key2] for substr2, pos2, probas2 in zip(substr_list2, pos_list2, probas_list2): found = False for key in substr_batch[i]: pos_list = pos_batch[i][key] for pos in pos_list: if pos[0] <= pos2[0] <= pos[-1] or pos[0] <= pos2[-1] <= pos[-1]: found = True if not found: if key2 not in substr_batch[i]: substr_batch[i][key2] = [] pos_batch[i][key2] = [] probas_batch[i][key2] = [] substr_batch[i][key2].append(substr2) pos_batch[i][key2].append(pos2) probas_batch[i][key2].append(probas2) for i in range(len(substr_batch)): for key2 in substr_batch2[i]: substr_list2 = substr_batch2[i][key2] pos_list2 = pos_batch2[i][key2] probas_list2 = 
probas_batch2[i][key2] for substr2, pos2, probas2 in zip(substr_list2, pos_list2, probas_list2): for key in substr_batch[i]: inds = [] substr_list = substr_batch[i][key] pos_list = pos_batch[i][key] probas_list = probas_batch[i][key] for n, (substr, pos, probas) in enumerate(zip(substr_list, pos_list, probas_list)): if (pos[0] == pos2[0] and pos[-1] < pos2[-1]) or (pos[0] > pos2[0] and pos[-1] == pos2[-1]): inds.append(n) elif key == "EVENT" and ((pos[0] >= pos2[0] and pos[-1] <= pos2[-1]) or (len(substr.split()) == 1 and pos2[0] <= pos[0])): inds.append(n) if (len(inds) > 1 or (len(inds) == 1 and key in {"WORK_OF_ART", "EVENT"})) \ and not (key == "PERSON" and " и " in substr2): inds = sorted(inds, reverse=True) for ind in inds: del substr_batch[i][key][ind] del pos_batch[i][key][ind] del probas_batch[i][key][ind] substr_batch[i][key].append(substr2) pos_batch[i][key].append(pos2) probas_batch[i][key].append(probas2) return substr_batch, pos_batch, probas_batch ================================================ FILE: deeppavlov/models/kbqa/__init__.py ================================================ ================================================ FILE: deeppavlov/models/kbqa/query_generator.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import copy import itertools import re from collections import defaultdict from logging import getLogger from typing import Tuple, List, Optional, Union, Dict, Any, Set import nltk import numpy as np from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.models.kbqa.query_generator_base import QueryGeneratorBase from deeppavlov.models.kbqa.rel_ranking_infer import RelRankerInfer from deeppavlov.models.kbqa.utils import extract_year, extract_number, make_combs, fill_query, find_query_features, \ make_sparql_query, merge_sparql_query from deeppavlov.models.kbqa.wiki_parser import WikiParser log = getLogger(__name__) @register('query_generator') class QueryGenerator(QueryGeneratorBase): """ Class for query generation using Wikidata hdt file """ def __init__(self, wiki_parser: WikiParser, rel_ranker: RelRankerInfer, entities_to_leave: int = 5, types_to_leave: int = 2, rels_to_leave: int = 7, max_comb_num: int = 10000, gold_query_info: Dict[str, str] = None, map_query_str_to_kb: List[Tuple[str, str]] = None, return_answers: bool = True, *args, **kwargs) -> None: """ Args: wiki_parser: component deeppavlov.models.kbqa.wiki_parser rel_ranker: component deeppavlov.models.kbqa.rel_ranking_infer entities_to_leave: how many entities to leave after entity linking types_to_leave: how many types to leave after entity linking rels_to_leave: how many relations to leave after relation ranking max_comb_num: the maximum number of combinations of candidate entities and relations gold_query_info: dict of variable names used for formatting output sparql queries map_query_str_to_kb: mapping of knowledge base prefixes to full https return_answers: whether to return answers or candidate relations and answers for further ranking **kwargs: """ self.wiki_parser = wiki_parser self.rel_ranker = rel_ranker self.entities_to_leave = entities_to_leave self.types_to_leave = types_to_leave self.rels_to_leave = rels_to_leave 
def __call__(self, question_batch: List[str],
             question_san_batch: List[str],
             template_type_batch: Union[List[List[str]], List[str]],
             entities_from_ner_batch: List[List[str]],
             types_from_ner_batch: List[List[str]],
             entity_tags_batch: List[List[str]],
             probas_batch: List[List[float]],
             answer_types_batch: List[Set[str]] = None,
             entities_to_link_batch: List[List[int]] = None) -> Tuple[List[Any], List[Any]]:
    """Find candidate answers for every question of the batch.

    Args:
        question_batch: original questions
        question_san_batch: sanitized questions
        template_type_batch: query template type(s) for each question; the
            type "-1" is replaced by "7" before the candidates are searched
        entities_from_ner_batch: entity substrings for each question
        types_from_ner_batch: type substrings for each question
        entity_tags_batch: tags of the entity substrings
        probas_batch: probabilities of the entity substrings
        answer_types_batch: expected answer types; when missing (or when the
            first element is None) an empty list is used for every question
        entities_to_link_batch: per-entity flags passed on to
            ``find_candidate_answers``; when missing (or when the first
            element is None) every entity gets the flag 1

    Returns:
        If ``self.return_answers`` is set: the output of ``self.rel_ranker``,
        or "Not Found" for every question when that output is empty.
        Otherwise: the tuple (candidate outputs batch, template answers batch).
    """
    if not answer_types_batch or answer_types_batch[0] is None:
        answer_types_batch = [[] for _ in question_batch]
    if not entities_to_link_batch or entities_to_link_batch[0] is None:
        entities_to_link_batch = [[1 for _ in substr_list] for substr_list in entities_from_ner_batch]

    log.debug(f"kbqa inputs {question_batch} {question_san_batch} template_type_batch: {template_type_batch} --- "
              f"entities_from_ner: {entities_from_ner_batch} --- types_from_ner: {types_from_ner_batch} --- "
              f"entity_tags_batch: {entity_tags_batch} --- answer_types_batch: "
              f"{[list(elem)[:3] for elem in answer_types_batch]}")

    candidate_outputs_batch, template_answers_batch = [], []
    per_question_inputs = zip(question_batch, question_san_batch, template_type_batch, entities_from_ner_batch,
                              types_from_ner_batch, entity_tags_batch, probas_batch, entities_to_link_batch,
                              answer_types_batch)
    # ``ner_inputs`` holds, in order: entities, types, entity tags, probas,
    # entities_to_link and answer_types of the current question.
    for question, question_sanitized, template_type, *ner_inputs in per_question_inputs:
        template_type = "7" if template_type == "-1" else template_type
        candidate_outputs, template_answer = self.find_candidate_answers(
            question, question_sanitized, template_type, *ner_inputs)
        candidate_outputs_batch.append(candidate_outputs)
        template_answers_batch.append(template_answer)

    if not self.return_answers:
        return candidate_outputs_batch, template_answers_batch

    answers = self.rel_ranker(question_batch, template_type_batch, candidate_outputs_batch,
                              entities_from_ner_batch, template_answers_batch)
    log.debug(f"(__call__)answers: {answers}")
    if answers:
        return answers
    return ["Not Found" for _ in question_batch]
') log.debug(f"(query_parser)query_triplets: {query_triplets}") query_triplets_split = [triplet.split(' ')[:3] for triplet in query_triplets] property_types = {} for rel_type, query_triplet in zip(rel_types, query_triplets_split): if query_triplet[1].startswith("?") and rel_type == "qualifier": property_types[query_triplet[1]] = rel_type query_sequence_dict = {num + 1: triplet for num, triplet in enumerate(query_triplets_split)} query_sequence = [] for i in query_seq_num: query_sequence.append(query_sequence_dict[i]) triplet_info_list = [("forw" if triplet[2].startswith('?') else "backw", search_source, rel_type, n_hop) for search_source, triplet, rel_type, n_hop in \ zip(rels_for_search, query_sequence, rel_types, n_hops) if search_source != "do_not_rank"] log.debug(f"(query_parser)query_sequence_dict: {query_sequence_dict} --- rel_directions: " f"{triplet_info_list} --- query_sequence: {query_sequence}") entity_ids = [entity[:self.entities_to_leave] for entity in entity_ids] rels, entities_rel_conn = [], set() if rels_from_template is not None: rels = [[(rel, 1.0) for rel in rel_list] for rel_list in rels_from_template] elif not rels: for triplet_info in triplet_info_list: ex_rels, cur_rels_scores_dict, entity_rel_conn = self.find_top_rels(question, entity_ids, triplet_info) rels.append(ex_rels) rels_scores_dict = {**rels_scores_dict, **cur_rels_scores_dict} entities_rel_conn = entities_rel_conn.union(entity_rel_conn) log.debug(f"(query_parser)rels: {rels}") rels_from_query = [triplet[1] for triplet in query_triplets_split if triplet[1].startswith('?')] qualifier_rels = [triplet[1] for triplet in query_triplets_split if triplet[1].startswith("pq:P")] answer_ent, order_info, filter_from_query = find_query_features(query, qualifier_rels, question) log.debug(f"(query_parser) filter_from_query: {filter_from_query} --- order_info: {order_info}") year = extract_year(question_tokens, question) number = extract_number(question_tokens, question) log.debug(f"year {year}, 
number {number}") if year: filter_info = [(elem[0], elem[1].replace("n", year)) for elem in filter_from_query] elif number: filter_info = [(elem[0], elem[1].replace("n", number)) for elem in filter_from_query] else: filter_info = [elem for elem in filter_from_query if elem[1] != "n"] for unk_prop, prop_type in property_types.items(): filter_info.append((unk_prop, prop_type)) log.debug(f"(query_parser)filter_from_query: {filter_from_query}") rel_combs = make_combs(rels, permut=False) entity_positions, type_positions = [elem.split('_') for elem in entities_and_types_select.split(' ')] log.debug(f"entity_positions {entity_positions}, type_positions {type_positions}") selected_entity_ids, selected_type_ids = [], [] if len(entity_ids) > 1 and len(entity_positions) == 1: selected_entity_ids = [] for j in range(max([len(elem) for elem in entity_ids])): for elem in entity_ids: if j < len(elem): selected_entity_ids.append(elem[j]) selected_entity_ids = [selected_entity_ids] elif entity_ids: selected_entity_ids = [entity_ids[int(pos) - 1] for pos in entity_positions if int(pos) > 0] if type_ids: selected_type_ids = [type_ids[int(pos) - 1][:self.types_to_leave] for pos in type_positions if int(pos) > 0] entity_combs = make_combs(selected_entity_ids, permut=True) type_combs = make_combs(selected_type_ids, permut=False) log.debug(f"(query_parser)entity_combs: {entity_combs[:3]}, type_combs: {type_combs[:3]}," f" rel_combs: {rel_combs[:3]}") all_combs_list = list(itertools.product(entity_combs, type_combs, rel_combs)) all_combs_list = sorted(all_combs_list, key=lambda x: (sum([elem[-1] for elem in x]), x[0][-1])) parsed_queries_info.append({"query_triplets": query_triplets, "query_sequence": query_sequence, "rels_from_query": rels_from_query, "answer_ent": answer_ent, "filter_info": filter_info, "order_info": order_info, "rel_types": rel_types, "unk_rels": unk_rels, "return_if_found": return_if_found, "selected_entity_ids": selected_entity_ids, "selected_type_ids": 
def check_valid_query(self, entities_rel_conn, query_hdt_seq):
    """Check that every (entity, relation) pair of a filled query is known to be connected.

    Only triplets of length 3 in which the subject or the object is a
    variable (starts with "?") are inspected. Triplets whose relation starts
    with the "statement" prefix of ``self.kb_prefixes`` are skipped, and so
    are relations listed under "type_rels". ``self.kb_prefixes`` may be
    ``None`` and either key may be absent; the corresponding rule is then
    simply not applied.

    Args:
        entities_rel_conn: collection of (entity id, relation id) pairs found
            in the knowledge base; when empty, no validation is performed
        query_hdt_seq: filled query triplets (sequences of strings)

    Returns:
        False if some inspected triplet joins an entity and a relation whose
        pair is not in ``entities_rel_conn``, True otherwise
    """
    if not entities_rel_conn:
        return True

    # kb_prefixes defaults to None in the base class; treat that as "no prefixes".
    kb_prefixes = self.kb_prefixes or {}
    type_rels = kb_prefixes.get("type_rels", ())

    for query_hdt_elem in query_hdt_seq:
        if len(query_hdt_elem) != 3:
            continue
        subj, pred, obj = query_hdt_elem
        if not (subj.startswith("?") or obj.startswith("?")):
            continue
        if "statement" in kb_prefixes and pred.startswith(kb_prefixes["statement"]):
            continue
        if pred.startswith("?"):
            # unknown relation: nothing to validate for this triplet
            continue

        # At least one of subject / object is a variable here, so the entity is
        # whichever side is a constant (or empty if both are variables).
        if not subj.startswith("?"):
            entity = subj.split("/")[-1]
        elif not obj.startswith("?"):
            entity = obj.split("/")[-1]
        else:
            entity = ""
        rel = pred.split("/")[-1]

        if entity and rel and rel not in type_rels and (entity, rel) not in entities_rel_conn:
            # one unconnected pair is enough to reject the query
            return False
    return True
answer_ent = parsed_query_info["answer_ent"] filter_info = parsed_query_info["filter_info"] order_info = parsed_query_info["order_info"] rel_types = parsed_query_info["rel_types"] unk_rels = parsed_query_info["unk_rels"] return_if_found = parsed_query_info["return_if_found"] entities_rel_conn = parsed_query_info["entities_rel_conn"] combs = parsed_query_info["all_combs_list"][comb_num] if combs[0][-1] == 0: entity_conf_list.append(1.0) else: entity_conf_list.append(0.9) query_hdt_seq = [fill_query(query_hdt_elem, combs[0], combs[1], combs[2], self.map_query_str_to_kb) for query_hdt_elem in query_sequence] if comb_num == 0: log.debug(f"\n______________________\nfilled query: {query_hdt_seq}\n______________________\n") entity_rel_valid = self.check_valid_query(entities_rel_conn, query_hdt_seq) if entity_rel_valid: new_combs_list.append(combs) queries_list.append((answer_ent, rels_from_query, query_hdt_seq, filter_info, order_info, answer_types, rel_types, return_if_found)) query_info_list.append((query_triplets, query_hdt_seq, answer_ent, filter_info, order_info)) parser_info_list.append("query_execute") if comb_num < 3 and unk_rels: unk_query_sequence = copy.deepcopy(query_sequence) unk_rels_from_query = copy.deepcopy(rels_from_query) for unk_rel, rel_var in zip(unk_rels, ["?p", "?p2"]): unk_query_sequence[int(unk_rel) - 1][1] = rel_var combs[-1][int(unk_rel) - 1] = (rel_var, 1.0) if rel_var not in rels_from_query: unk_rels_from_query.append(rel_var) query_hdt_seq = [ fill_query(query_hdt_elem, combs[0], combs[1], combs[2], self.map_query_str_to_kb) for query_hdt_elem in unk_query_sequence] new_combs_list.append(combs) queries_list.append((answer_ent, unk_rels_from_query, query_hdt_seq, filter_info, order_info, answer_types, rel_types, return_if_found)) query_info_list.append((query_triplets, query_hdt_seq, answer_ent, filter_info, order_info)) parser_info_list.append("query_execute") outputs_list = self.wiki_parser(parser_info_list, queries_list) outputs = 
self.parse_outputs(outputs_list, new_combs_list, query_info_list, entity_conf_list, rels_scores_dict) return outputs def parse_outputs(self, outputs_list, combs_list, query_info_list, entity_conf_list, rels_scores_dict): outputs = [] if isinstance(outputs_list, list) and outputs_list: outputs_len = len(outputs_list) combs_list = combs_list[:outputs_len] entity_conf_list = entity_conf_list[:outputs_len] for combs, query_info, entity_conf, (answers_list, found_rels_list, found_combs_list) in \ zip(combs_list, query_info_list, entity_conf_list, outputs_list): for answers, found_rels, found_comb in zip(answers_list, found_rels_list, found_combs_list): found_rels = [found_rel.split("/")[-1] for found_rel in found_rels] new_combs = list(copy.deepcopy(combs)) found_unk_rel = False for j, rel_var in enumerate(["?p", "?p2"]): if isinstance(new_combs[2][j], tuple) and new_combs[2][j][0] == rel_var: if found_rels: new_combs[2][j] = (found_rels[j], rels_scores_dict.get(found_rels[j], 1.0)) else: new_combs[2][j] = (new_combs[2][j][0], 0.0) found_unk_rel = True if found_rels and not found_unk_rel: new_combs[2] = new_combs[2][:-1] + [(found_rels[0], 1.0), new_combs[2][-1]] confidence = np.prod([score for rel, score in new_combs[2][:-1]]) if answers: outputs.append([new_combs[0], new_combs[1]] + [rel for rel, score in new_combs[2][:-1]] + answers + [(confidence, entity_conf), found_comb, query_info, new_combs[2]]) outputs_dict = defaultdict(list) types_dict = defaultdict(list) for output in outputs: key = (tuple(output[0]), tuple([rel.split("/")[-1] for rel in output[2:-5]])) if key not in outputs_dict or output[-5:] not in outputs_dict[key]: outputs_dict[key].append(output[-5:]) types_dict[key].append(tuple(output[1])) outputs = [] for (entity_comb, rel_comb), output in outputs_dict.items(): type_comb = types_dict[(entity_comb, rel_comb)] output_conf = [elem[1] for elem in output] output_conf = sorted(output_conf, key=lambda x: x[0] * x[1], reverse=True) found_combs = [elem[2] 
@register('query_formatter')
class QueryFormatter(Component):
    """Component which rebuilds sparql queries from their triplets and query features."""

    def __init__(self, query_info: Dict[str, str], replace_prefixes: Dict[str, str] = None, **kwargs):
        """

        Args:
            query_info: passed as the second argument to ``merge_sparql_query``
            replace_prefixes: substrings to replace in every triplet
                (old substring -> new substring); nothing is replaced if empty
            **kwargs:
        """
        self.replace_prefixes = replace_prefixes
        self.query_info = query_info

    def _clean_triplet(self, raw_triplet):
        """Strip angle brackets from the triplet elements and apply ``replace_prefixes``."""
        elements = [element.strip("<>") for element in raw_triplet.strip().split()]
        cleaned = " ".join(elements)
        if self.replace_prefixes:
            for old_prefix, new_prefix in self.replace_prefixes.items():
                cleaned = cleaned.replace(old_prefix, new_prefix)
        return cleaned

    def __call__(self, queries_batch):
        """Format every query of the batch.

        The text between the first pair of curly brackets of a query is split
        into triplets on '. '; a query without curly brackets gets no triplets.

        Args:
            queries_batch: sparql queries

        Returns:
            list with the output of ``merge_sparql_query`` for each query
        """
        formatted_queries = []
        for query in queries_batch:
            bracket_contents = re.findall("{[ ]?(.*?)[ ]?}", query)
            raw_triplets = bracket_contents[0].split('. ') if bracket_contents else []
            query_triplets = [self._clean_triplet(raw_triplet) for raw_triplet in raw_triplets]

            answer_ent, order_info, filter_from_query = find_query_features(query, order_from_query=True)
            features = (query_triplets, answer_ent, filter_from_query, order_info)
            formatted_queries.append(merge_sparql_query(features, self.query_info))
        return formatted_queries
def __init__(self, template_matcher: TemplateMatcher,
             rel_ranker: RelRankerInfer,
             load_path: str,
             sparql_queries_filename: str,
             entity_linker: EntityLinker,
             rels_in_ranking_queries_fname: str = None,
             wiki_parser=None,
             entities_to_leave: int = 5,
             rels_to_leave: int = 7,
             syntax_structure_known: bool = False,
             use_wp_api_requester: bool = False,
             use_el_api_requester: bool = False,
             use_alt_templates: bool = True,
             delete_rel_prefix: bool = True,
             kb_prefixes: Dict[str, str] = None, *args, **kwargs) -> None:
    """Store the components and settings, then load the query templates.

    Args:
        template_matcher: component deeppavlov.models.kbqa.template_matcher
        rel_ranker: component deeppavlov.models.kbqa.rel_ranking_infer
        load_path: path to folder with wikidata files
        sparql_queries_filename: file with sparql query templates
        entity_linker: component deeppavlov.models.entity_extraction.entity_linking
            used for linking of entities
        rels_in_ranking_queries_fname: file with the list of rels in queries for
            questions with ranking; when None, an empty dict is used instead
        wiki_parser: component deeppavlov.models.kbqa.wiki_parser
        entities_to_leave: how many entities to leave after entity linking
        rels_to_leave: how many relations to leave after relation ranking
        syntax_structure_known: if syntax tree parser was used to define query template type
        use_wp_api_requester: whether the api_requester component is used for Wiki Parser
        use_el_api_requester: whether the api_requester component is used for Entity Linking
        use_alt_templates: whether to use alternative templates if no answer was found
            for the default query template
        delete_rel_prefix: whether to delete prefix in relations
        kb_prefixes: prefixes for entities, relations and types in the knowledge base
    """
    super().__init__(save_path=None, load_path=load_path)

    # pipeline components
    self.template_matcher = template_matcher
    self.rel_ranker = rel_ranker
    self.entity_linker = entity_linker
    self.wiki_parser = wiki_parser

    # files read by load()
    self.sparql_queries_filename = sparql_queries_filename
    self.rels_in_ranking_queries_fname = rels_in_ranking_queries_fname
    self.rels_in_ranking_queries = {}

    # candidate limits
    self.entities_to_leave = entities_to_leave
    self.rels_to_leave = rels_to_leave

    # behaviour switches
    self.syntax_structure_known = syntax_structure_known
    self.use_wp_api_requester = use_wp_api_requester
    self.use_el_api_requester = use_el_api_requester
    self.use_alt_templates = use_alt_templates
    self.delete_rel_prefix = delete_rel_prefix
    self.kb_prefixes = kb_prefixes

    self.load()
list], str]: candidate_outputs = [] self.template_nums = [template_types] replace_tokens = [(' - ', '-'), (' .', ''), ('{', ''), ('}', ''), (' ', ' '), ('"', "'"), ('(', ''), (')', ''), ('–', '-')] for old, new in replace_tokens: question = question.replace(old, new) entities_from_template, types_from_template, rels_from_template, rel_dirs_from_template, query_type_template, \ entity_types, template_answer, template_answer_types, template_found = self.template_matcher( question_sanitized, entities_from_ner) if query_type_template: self.template_nums = [query_type_template] log.debug( f"question: {question} entities_from_template {entities_from_template} template_type {self.template_nums} " f"types from template {types_from_template} rels_from_template {rels_from_template} entities_from_ner " f"{entities_from_ner} types_from_ner {types_from_ner} answer_types {list(answer_types)[:3]}") if entities_from_template or types_from_template: if rels_from_template[0][0] == "PHOW": how_to_content = self.find_answer_wikihow(entities_from_template[0]) candidate_outputs = [["PHOW", how_to_content, 1.0]] else: entity_ids = self.get_entity_ids(entities_from_template, entity_tags, probas, question, entities_to_link) type_ids = self.get_entity_ids(types_from_template, ["t" for _ in types_from_template], [1.0 for _ in types_from_template], question) log.debug(f"entities_from_template: {entities_from_template} --- entity_types: {entity_types} --- " f"types_from_template: {types_from_template} --- rels_from_template: {rels_from_template} " f"--- answer_types: {template_answer_types} --- entity_ids: {entity_ids}") candidate_outputs = self.sparql_template_parser(question_sanitized, entity_ids, type_ids, template_answer_types, rels_from_template, rel_dirs_from_template) if not candidate_outputs and (entities_from_ner or types_from_ner): log.debug(f"(__call__)entities_from_ner: {entities_from_ner}") entity_ids = self.get_entity_ids(entities_from_ner, entity_tags, probas, question) type_ids 
def get_entity_ids(self, entities: List[str], tags: List[str], probas: List[float], question: str,
                   entities_to_link: List[int] = None) -> List[List[str]]:
    """Link entity substrings of one question with ``self.entity_linker``.

    The linker is called with batches of size one. If it raises
    ``json.decoder.JSONDecodeError``, a warning is logged and an empty list
    is returned.

    Args:
        entities: entity substrings
        tags: tags of the entity substrings
        probas: probabilities of the entity substrings
        question: text of the question
        entities_to_link: per-entity flags forwarded to the entity linker

    Returns:
        entity ids extracted from the linker output, or an empty list if the
        linker returned nothing
    """
    try:
        linker_output = self.entity_linker([entities], [tags], [probas], [[question]], [None], [None],
                                           [entities_to_link])
    except json.decoder.JSONDecodeError:
        log.warning("not received output from entity linking")
        return []

    if not linker_output:
        return []
    if self.use_el_api_requester:
        # the api requester wraps the actual output into one more list
        linker_output = linker_output[0]
        if not linker_output:
            return []

    entity_ids = []
    first_item = linker_output[0]
    if isinstance(first_item, dict):
        # output is a list of dicts, one per entity
        entity_ids = [entity_info.get("entity_ids", []) for entity_info in linker_output]
    if isinstance(first_item, list):
        # output is a sequence of lists, the first of which holds the entity ids
        entity_ids, *_ = linker_output
    if not self.use_el_api_requester and entity_ids:
        # drop the batch dimension added by the direct call
        entity_ids = entity_ids[0]
    return entity_ids
def find_top_rels(self, question: str, entity_ids: List[List[str]], triplet_info: Tuple) -> \
        Tuple[List[Tuple[str, float]], Dict[str, float], Set[Tuple[str, str]]]:
    """Collect candidate relations for one query triplet and rank them against the question.

    Args:
        question: text of the question
        entity_ids: candidate entity ids for each entity substring; only the
            first ``self.entities_to_leave`` ids of each list are queried
        triplet_info: tuple (direction, source, rel_type, n_hop). The source
            "wiki" takes the relations from ``self.wiki_parser``; the sources
            "rank_list_1"/"rel_list_1" and "rank_list_2"/"rel_list_2" take
            them from ``self.rels_in_ranking_queries``

    Returns:
        the ``self.rels_to_leave`` best (relation, score) tuples sorted by
        descending score, the dict relation -> score of all ranked relations,
        and the set of (entity id, relation id) pairs seen in the wiki parser
        output
    """
    direction, source, rel_type, n_hop = triplet_info
    ex_rels, entity_rel_conn = [], set()

    def request_rels(rel_direction):
        # Ask the wiki parser for the relations of every candidate entity in the
        # given direction and remember which entity each relation belongs to.
        queries = list({(entity, rel_direction, rel_type) for candidates in entity_ids
                        for entity in candidates[:self.entities_to_leave]})
        queried_entities = [query[0] for query in queries]
        found_rels = self.wiki_parser(["find_rels" for _ in queries], queries)
        for entity_rels, entity in zip(found_rels, queried_entities):
            for rel in entity_rels:
                entity_rel_conn.add((entity, rel.split("/")[-1]))
        return found_rels

    if source == "wiki":
        rels_per_entity = request_rels(direction)
        if self.use_wp_api_requester and rels_per_entity:
            rels_per_entity = [rel[0] for rel in rels_per_entity]
        ex_rels = list(set(itertools.chain.from_iterable(rels_per_entity)))
        if n_hop in {"1-of-2-hop", "2-hop"}:
            # relations of the opposite direction are candidates for multi-hop queries too
            ex_rels += list(set(itertools.chain.from_iterable(request_rels("backw"))))
        if self.delete_rel_prefix:
            ex_rels = [rel.split('/')[-1] for rel in ex_rels]
    elif source in {"rank_list_1", "rel_list_1"}:
        ex_rels = self.rels_in_ranking_queries.get("one_rel_in_query", [])
    elif source in {"rank_list_2", "rel_list_2"}:
        ex_rels = self.rels_in_ranking_queries.get("two_rels_in_query", [])

    # relations which connect an entity with its type are not ranked
    ex_rels = [rel for rel in ex_rels
               if not any(rel.endswith(t_rel) for t_rel in self.kb_prefixes["type_rels"])]
    rels_with_scores = self.rel_ranker.rank_rels(question, ex_rels)

    if n_hop == "2-hop" and rels_with_scores and entity_ids and entity_ids[0]:
        # re-rank using the relations found at the second hop from the first entity
        first_hop_rels = [rel for rel, _ in rels_with_scores][:5]
        ex_rels_2hop = self.wiki_parser(["find_rels_2hop"], [(entity_ids[0], first_hop_rels)])
        if self.delete_rel_prefix:
            ex_rels_2hop = [rel.split('/')[-1] for rel in ex_rels_2hop]
        rels_with_scores = self.rel_ranker.rank_rels(question, ex_rels_2hop)

    unique_rels = list(set(rels_with_scores))
    ranked_rels = sorted(unique_rels, key=lambda rel_score: rel_score[1], reverse=True)
    rels_scores_dict = {rel: score for rel, score in ranked_rels}
    return ranked_rels[:self.rels_to_leave], rels_scores_dict, entity_rel_conn
from logging import getLogger from typing import Tuple, List, Any, Optional from scipy.special import softmax from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.file import load_pickle, read_json from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.core.models.serializable import Serializable from deeppavlov.models.kbqa.sentence_answer import sentence_answer from deeppavlov.models.kbqa.wiki_parser import WikiParser log = getLogger(__name__) @register('rel_ranking_infer') class RelRankerInfer(Component, Serializable): """Class for ranking of paths in subgraph""" def __init__(self, load_path: str, rel_q2name_filename: str, return_elements: List[str] = None, ranker: Chainer = None, wiki_parser: Optional[WikiParser] = None, batch_size: int = 32, softmax: bool = False, use_api_requester: bool = False, rank: bool = True, nll_rel_ranking: bool = False, nll_path_ranking: bool = False, top_possible_answers: int = -1, top_n: int = 1, pos_class_num: int = 1, rel_thres: float = 0.0, type_rels: List[str] = None, **kwargs): """ Args: load_path: path to folder with wikidata files rel_q2name_filename: name of file which maps relation id to name return_elements: what elements return in output ranker: component deeppavlov.models.ranking.rel_ranker wiki_parser: component deeppavlov.models.wiki_parser batch_size: infering batch size softmax: whether to process relation scores with softmax function use_api_requester: whether wiki parser will be used as external api rank: whether to rank relations or simple copy input nll_rel_ranking: whether use components trained with nll loss for relation ranking nll_path_ranking: whether use components trained with nll loss for relation path ranking top_possible_answers: number of answers returned for a question in each list of candidate answers top_n: number of lists of candidate answers returned for a question pos_class_num: index of positive class 
in the output of relation ranking model rel_thres: threshold of relation confidence type_rels: list of relations in the knowledge base which connect an entity and its type **kwargs: """ super().__init__(save_path=None, load_path=load_path) self.rel_q2name_filename = rel_q2name_filename self.ranker = ranker self.wiki_parser = wiki_parser self.batch_size = batch_size self.softmax = softmax self.return_elements = return_elements or list() self.use_api_requester = use_api_requester self.rank = rank self.nll_rel_ranking = nll_rel_ranking self.nll_path_ranking = nll_path_ranking self.top_possible_answers = top_possible_answers self.top_n = top_n self.pos_class_num = pos_class_num self.rel_thres = rel_thres self.type_rels = type_rels or set() self.load() def load(self) -> None: if self.rel_q2name_filename.endswith("pickle"): self.rel_q2name = load_pickle(self.load_path / self.rel_q2name_filename) elif self.rel_q2name_filename.endswith("json"): self.rel_q2name = read_json(self.load_path / self.rel_q2name_filename) def save(self) -> None: pass def __call__(self, questions_batch: List[str], template_type_batch: List[str], raw_answers_batch: List[List[Tuple[str]]], entity_substr_batch: List[List[str]], template_answers_batch: List[str]) -> List[str]: answers_batch, outp_confidences_batch, answer_ids_batch = [], [], [] entities_and_rels_batch, queries_batch, triplets_batch = [], [], [] for question, template_type, raw_answers, entities, template_answer in \ zip(questions_batch, template_type_batch, raw_answers_batch, entity_substr_batch, template_answers_batch): answers_with_scores = [] l_questions, l_rels, l_rels_labels, l_cur_answers, l_entities, l_types, l_sparql_queries, l_triplets, \ l_confs = self.preprocess_ranking_input(question, raw_answers) n_batches = len(l_questions) // self.batch_size + int(len(l_questions) % self.batch_size > 0) for i in range(n_batches): if self.rank: if self.nll_path_ranking: probas = self.ranker([l_questions[0]], [l_rels_labels[self.batch_size 
* i:self.batch_size * (i + 1)]]) probas = probas[0] else: probas = self.ranker(l_questions[self.batch_size * i:self.batch_size * (i + 1)], l_rels_labels[self.batch_size * i:self.batch_size * (i + 1)]) probas = [proba[0] for proba in probas] else: probas = [rel_conf for rel_conf, entity_conf in l_confs[self.batch_size * i:self.batch_size * (i + 1)]] for j in range(self.batch_size * i, self.batch_size * (i + 1)): if j < len(l_cur_answers) and (probas[j - self.batch_size * i] > self.rel_thres or (len(l_rels[j]) > 1 and not set(l_rels[j]).intersection( self.type_rels))): answers_with_scores.append((l_cur_answers[j], l_sparql_queries[j], l_triplets[j], l_entities[j], l_types[j], l_rels_labels[j], l_rels[j], round(probas[j - self.batch_size * i], 3), round(l_confs[j][0], 3), l_confs[j][1])) answers_with_scores = sorted(answers_with_scores, key=lambda x: x[-1] * x[-3], reverse=True) if template_type == "simple_boolean" and not answers_with_scores: answers_with_scores = [(["No"], "", [], [], [], [], [], 1.0, 1.0, 1.0)] res_answers_list, res_answer_ids_list, res_confidences_list, res_entities_and_rels_list = [], [], [], [] res_queries_list, res_triplets_list = [], [] for n, ans_sc_elem in enumerate(answers_with_scores): init_answer_ids, query, triplets, q_entities, q_types, _, q_rels, p_conf, r_conf, e_conf = ans_sc_elem answer_ids = [] for answer_id in init_answer_ids: answer_id = str(answer_id).replace("@en", "").strip('"') if answer_id not in answer_ids: answer_ids.append(answer_id) if self.top_possible_answers > 0: answer_ids = answer_ids[:self.top_possible_answers] answer_ids_input = [(answer_id, question) for answer_id in answer_ids] answer_ids = [str(answer_id).split("/")[-1] for answer_id in answer_ids] parser_info_list = ["find_label" for _ in answer_ids_input] init_answer_labels = self.wiki_parser(parser_info_list, answer_ids_input) if n < 7: log.debug(f"answers: {init_answer_ids[:3]} --- query {query} --- entities {q_entities} --- " f"types {q_types[:3]} --- 
q_rels {q_rels} --- {ans_sc_elem[5:]} --- " f"answer_labels {init_answer_labels[:3]}") answer_labels = [] for label in init_answer_labels: if label not in answer_labels: answer_labels.append(label) answer_labels = [label for label in answer_labels if (label and label != "Not Found")][:5] answer_labels = [str(label) for label in answer_labels] if len(answer_labels) > 2: answer = f"{', '.join(answer_labels[:-1])} and {answer_labels[-1]}" else: answer = ', '.join(answer_labels) if "sentence_answer" in self.return_elements: try: answer = sentence_answer(question, answer, entities, template_answer) except ValueError as e: log.warning(f"Error in sentence answer, {e}") res_answers_list.append(answer) res_answer_ids_list.append(answer_ids) if "several_confidences" in self.return_elements: res_confidences_list.append((p_conf, r_conf, e_conf)) else: res_confidences_list.append(p_conf) res_entities_and_rels_list.append([q_entities[:-1], q_rels]) res_queries_list.append(query) res_triplets_list.append(triplets) if self.top_n == 1: if answers_with_scores: answers_batch.append(res_answers_list[0]) outp_confidences_batch.append(res_confidences_list[0]) answer_ids_batch.append(res_answer_ids_list[0]) entities_and_rels_batch.append(res_entities_and_rels_list[0]) queries_batch.append(res_queries_list[0]) triplets_batch.append(res_triplets_list[0]) else: answers_batch.append("Not Found") outp_confidences_batch.append(0.0) answer_ids_batch.append([]) entities_and_rels_batch.append([]) queries_batch.append([]) triplets_batch.append([]) else: answers_batch.append(res_answers_list[:self.top_n]) outp_confidences_batch.append(res_confidences_list[:self.top_n]) answer_ids_batch.append(res_answer_ids_list[:self.top_n]) entities_and_rels_batch.append(res_entities_and_rels_list[:self.top_n]) queries_batch.append(res_queries_list[:self.top_n]) triplets_batch.append(res_triplets_list[:self.top_n]) answer_tuple = (answers_batch,) if "confidences" in self.return_elements: answer_tuple += 
(outp_confidences_batch,) if "answer_ids" in self.return_elements: answer_tuple += (answer_ids_batch,) if "entities_and_rels" in self.return_elements: answer_tuple += (entities_and_rels_batch,) if "queries" in self.return_elements: answer_tuple += (queries_batch,) if "triplets" in self.return_elements: answer_tuple += (triplets_batch,) return answer_tuple def preprocess_ranking_input(self, question, answers): l_questions, l_rels, l_rels_labels, l_cur_answers = [], [], [], [] l_entities, l_types, l_sparql_queries, l_triplets, l_confs = [], [], [], [], [] for ans_and_rels in answers: answer, sparql_query, confidence = "", "", [] entities, types, rels, rels_labels, triplets = [], [], [], [], [] if ans_and_rels: rels = [rel.split('/')[-1] for rel in ans_and_rels["relations"]] answer = ans_and_rels["answers"] entities = ans_and_rels["entities"] types = ans_and_rels["types"] sparql_query = ans_and_rels["sparql_query"] triplets = ans_and_rels["triplets"] confidence = ans_and_rels["output_conf"] rels_labels = [] for rel in rels: if rel in self.rel_q2name: label = self.rel_q2name[rel] if isinstance(label, list): label = label[0] rels_labels.append(label.lower()) if rels_labels: l_questions.append(question) l_rels.append(rels) l_rels_labels.append(rels_labels) l_cur_answers.append(answer) l_entities.append(entities) l_types.append(types) l_sparql_queries.append(sparql_query) l_triplets.append(triplets) l_confs.append(confidence) return l_questions, l_rels, l_rels_labels, l_cur_answers, l_entities, l_types, l_sparql_queries, l_triplets, \ l_confs def rank_rels(self, question: str, candidate_rels: List[str]) -> List[Tuple[str, Any]]: rels_with_scores = [] if question is not None: questions, rels_labels, rels = [], [], [] for candidate_rel in candidate_rels: if candidate_rel in self.rel_q2name: cur_rels_labels = self.rel_q2name[candidate_rel] if isinstance(cur_rels_labels, str): cur_rels_labels = [cur_rels_labels] for cur_rel in cur_rels_labels: questions.append(question) 
rels.append(candidate_rel) rels_labels.append(cur_rel) if questions: n_batches = len(rels) // self.batch_size + int(len(rels) % self.batch_size > 0) for i in range(n_batches): if self.nll_rel_ranking: probas = self.ranker([questions[0]], [rels_labels[i * self.batch_size:(i + 1) * self.batch_size]]) probas = probas[0] else: probas = self.ranker(questions[i * self.batch_size:(i + 1) * self.batch_size], rels_labels[i * self.batch_size:(i + 1) * self.batch_size]) probas = [proba[self.pos_class_num] for proba in probas] for j, rel in enumerate(rels[i * self.batch_size:(i + 1) * self.batch_size]): rels_with_scores.append((rel, probas[j])) if self.softmax: scores = [score for rel, score in rels_with_scores] softmax_scores = softmax(scores) rels_with_scores = [(rel, softmax_score) for (rel, score), softmax_score in zip(rels_with_scores, softmax_scores)] rels_with_scores_dict = {} for rel, score in rels_with_scores: if rel not in rels_with_scores_dict: rels_with_scores_dict[rel] = [] rels_with_scores_dict[rel].append(score) rels_with_scores = [(rel, max(scores)) for rel, scores in rels_with_scores_dict.items()] rels_with_scores = sorted(rels_with_scores, key=lambda x: x[1], reverse=True) return rels_with_scores ================================================ FILE: deeppavlov/models/kbqa/ru_adj_to_noun.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import re from collections import defaultdict from logging import getLogger from typing import List import numpy as np import spacy from scipy.sparse import csr_matrix from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.registry import register log = getLogger(__name__) @register('ru_adj_to_noun') class RuAdjToNoun: """ Class for converting an adjective in Russian to the corresponding noun, for example: "московский" -> "Москва", "африканский" -> "Африка" """ def __init__(self, freq_dict_filename: str, candidate_nouns: int = 10, freq_thres: float = 4.5, score_thres: float = 2.8, **kwargs): """ Args: freq_dict_filename: file with the dictionary of Russian words with the corresponding frequencies candidate_nouns: how many candidate nouns to leave after search **kwargs: """ self.candidate_nouns = candidate_nouns self.freq_thres = freq_thres self.score_thres = score_thres alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя-" self.alphabet_length = len(alphabet) self.max_word_length = 24 self.letter_nums = {letter: num for num, letter in enumerate(alphabet)} with open(str(expand_path(freq_dict_filename)), 'r') as fl: lines = fl.readlines() pos_freq_dict = defaultdict(list) for line in lines: line_split = line.strip('\n').split('\t') if re.match("[\d]+\.[\d]+", line_split[2]): pos_freq_dict[line_split[1]].append((line_split[0], float(line_split[2]))) self.nouns_with_freq = pos_freq_dict["s.PROP"] self.adj_set = set([word for word, freq in pos_freq_dict["a"]]) self.nouns = [noun[0] for noun in self.nouns_with_freq] self.matrix = self.make_sparse_matrix(self.nouns).transpose() self.nlp = spacy.load("ru_core_news_sm") def search(self, word: str): word = self.nlp(word)[0].lemma_ if word in self.adj_set: q_matrix = self.make_sparse_matrix([word]) scores = q_matrix * self.matrix scores = np.squeeze(scores.toarray()) indices = np.argsort(-scores)[:self.candidate_nouns] scores = list(scores[indices]) candidates = [self.nouns_with_freq[indices[i]] + 
(scores[i],) for i in range(len(indices))] candidates = [cand for cand in candidates if cand[0][:3].lower() == word[:3].lower()] candidates = sorted(candidates, key=lambda x: (x[2], x[1]), reverse=True) log.debug(f"AdjToNoun, found nouns: {candidates}") if candidates and candidates[0][1] > self.freq_thres and candidates[0][2] > self.score_thres: return candidates[0][0] return "" def make_sparse_matrix(self, words: List[str]): indptr = [] indices = [] data = [] total_length = 0 for n, word in enumerate(words): indptr.append(total_length) for cnt, letter in enumerate(word.lower()): col = self.alphabet_length * cnt + self.letter_nums[letter] indices.append(col) init_value = 1.0 - cnt * 0.05 if init_value < 0: init_value = 0 data.append(init_value) total_length += len(word) indptr.append(total_length) data = np.array(data) indptr = np.array(indptr) indices = np.array(indices) matrix = csr_matrix((data, indices, indptr), shape=(len(words), self.max_word_length * self.alphabet_length)) return matrix ================================================ FILE: deeppavlov/models/kbqa/sentence_answer.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import importlib import re from logging import getLogger import pkg_resources import spacy log = getLogger(__name__) # en_core_web_sm is installed and used by test_inferring_pretrained_model in the same interpreter session during tests. 
# Spacy checks en_core_web_sm package presence with pkg_resources, but pkg_resources is initialized with interpreter, # sot it doesn't see en_core_web_sm installed after interpreter initialization, so we use importlib.reload below. if 'en-core-web-sm' not in pkg_resources.working_set.by_key.keys(): importlib.reload(pkg_resources) # TODO: move nlp to sentence_answer, sentence_answer to rel_ranking_infer and revise en_core_web_sm requirement, # TODO: make proper downloading with spacy.cli.download nlp = spacy.load('en_core_web_sm') pronouns = ["who", "what", "when", "where", "how"] def find_tokens(tokens, node, not_inc_node): if node != not_inc_node: tokens.append(node.text) for elem in node.children: tokens = find_tokens(tokens, elem, not_inc_node) return tokens def find_inflect_dict(sent_nodes): inflect_dict = {} for node in sent_nodes: if node.dep_ == "aux" and node.tag_ == "VBD" and (node.head.tag_ == "VBP" or node.head.tag_ == "VB"): new_verb = node.head._.inflect("VBD") inflect_dict[node.head.text] = new_verb inflect_dict[node.text] = "" if node.dep_ == "aux" and node.tag_ == "VBZ" and node.head.tag_ == "VB": new_verb = node.head._.inflect("VBZ") inflect_dict[node.head.text] = new_verb inflect_dict[node.text] = "" return inflect_dict def find_wh_node(sent_nodes): wh_node = "" main_head = "" wh_node_head = "" for node in sent_nodes: if node.text.lower() in pronouns: wh_node = node break if wh_node: wh_node_head = wh_node.head if wh_node_head.dep_ == "ccomp": main_head = wh_node_head.head return wh_node, wh_node_head, main_head def find_tokens_to_replace(wh_node_head, main_head, question_tokens, question): redundant_tokens_to_replace = [] question_tokens_to_replace = [] if main_head: redundant_tokens_to_replace = find_tokens([], main_head, wh_node_head) what_tokens_fnd = re.findall("what (.*) (is|was|does|did) (.*)", question, re.IGNORECASE) if what_tokens_fnd: what_tokens = what_tokens_fnd[0][0].split() if len(what_tokens) <= 2: redundant_tokens_to_replace += 
what_tokens wh_node_head_desc = [node for node in wh_node_head.children if node.text != "?"] wh_node_head_dep = [node.dep_ for node in wh_node_head.children if (node.text != "?" and node.dep_ not in ["aux", "prep"] and node.text.lower() not in pronouns)] for node in wh_node_head_desc: if node.dep_ == "nsubj" and len(wh_node_head_dep) > 1 or node.text.lower() in pronouns or node.dep_ == "aux": question_tokens_to_replace.append(node.text) for elem in node.subtree: question_tokens_to_replace.append(elem.text) question_tokens_to_replace = list(set(question_tokens_to_replace)) redundant_replace_substr = [] for token in question_tokens: if token in redundant_tokens_to_replace: redundant_replace_substr.append(token) else: if redundant_replace_substr: break redundant_replace_substr = ' '.join(redundant_replace_substr) question_replace_substr = [] for token in question_tokens: if token in question_tokens_to_replace: question_replace_substr.append(token) else: if question_replace_substr: break question_replace_substr = ' '.join(question_replace_substr) return redundant_replace_substr, question_replace_substr def sentence_answer(question, entity_title, entities=None, template_answer=None): log.debug(f"question {question} entity_title {entity_title} entities {entities} template_answer {template_answer}") sent_nodes = nlp(question) reverse = False if sent_nodes[-2].tag_ == "IN": reverse = True question_tokens = [elem.text for elem in sent_nodes] log.debug(f"spacy tags: {[(elem.text, elem.tag_, elem.dep_, elem.head.text) for elem in sent_nodes]}") inflect_dict = find_inflect_dict(sent_nodes) wh_node, wh_node_head, main_head = find_wh_node(sent_nodes) redundant_replace_substr, question_replace_substr = find_tokens_to_replace(wh_node_head, main_head, question_tokens, question) log.debug(f"redundant_replace_substr {redundant_replace_substr} question_replace_substr {question_replace_substr}") if redundant_replace_substr: answer = question.replace(redundant_replace_substr, '') else: 
answer = question if answer.endswith('?'): answer = answer.replace('?', '').strip() if question_replace_substr: if template_answer and entities: answer = template_answer.replace("[ent]", entities[0]).replace("[ans]", entity_title) elif wh_node.text.lower() in ["what", "who", "how"]: fnd_date = re.findall(f"what (day|year) (.*)\?", question, re.IGNORECASE) fnd_wh = re.findall("what (is|was) the name of (.*) (which|that) (.*)\?", question, re.IGNORECASE) fnd_name = re.findall("what (is|was) the name (.*)\?", question, re.IGNORECASE) if fnd_date: fnd_date_aux = re.findall(f"what (day|year) (is|was) ({entities[0]}) (.*)\?", question, re.IGNORECASE) if fnd_date_aux: answer = f"{entities[0]} {fnd_date_aux[0][1]} {fnd_date_aux[0][3]} on {entity_title}" else: answer = f"{fnd_date[0][1]} on {entity_title}" elif fnd_wh: answer = f"{entity_title} {fnd_wh[0][3]}" elif fnd_name: aux_verb, sent_cut = fnd_name[0] if sent_cut.startswith("of "): sent_cut = sent_cut[3:] answer = f"{entity_title} {aux_verb} {sent_cut}" else: if reverse: answer = answer.replace(question_replace_substr, '') answer = f"{answer} {entity_title}" else: answer = answer.replace(question_replace_substr, entity_title) elif wh_node.text.lower() in ["when", "where"] and entities: sent_cut = re.findall(f"(when|where) (was|is) {entities[0]} (.*)\?", question, re.IGNORECASE) if sent_cut: if sent_cut[0][0].lower() == "when": answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} on {entity_title}" else: answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} in {entity_title}" else: answer = answer.replace(question_replace_substr, '') answer = f"{answer} in {entity_title}" for old_tok, new_tok in inflect_dict.items(): answer = answer.replace(old_tok, new_tok) answer = re.sub("\s+", " ", answer).strip() answer = answer + '.' 
return answer ================================================ FILE: deeppavlov/models/kbqa/template_matcher.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import functools import json import multiprocessing as mp import re from logging import getLogger from typing import Any, Tuple, List, Union from deeppavlov.core.common.registry import register from deeppavlov.core.models.serializable import Serializable log = getLogger(__name__) class RegexpMatcher: def __init__(self, question): self.question = question def __call__(self, template): res = re.findall(template["template_regexp"], self.question) found_template = [] if res: found_template.append((res[0], template)) return found_template @register('template_matcher') class TemplateMatcher(Serializable): """ This class matches the question with one of the templates to extract entity substrings and define which relations corresponds to the question """ def __init__(self, load_path: str, templates_filename: str, num_processors: int = None, **kwargs) -> None: """ Args: load_path: path to folder with file with templates templates_filename: file with templates **kwargs: """ super().__init__(save_path=None, load_path=load_path) self.templates_filename = templates_filename self.num_processors = mp.cpu_count() if num_processors == None else num_processors self.pool = mp.Pool(self.num_processors) self.load() def load(self) -> 
None: log.debug(f"(load)self.load_path / self.templates_filename: {self.load_path / self.templates_filename}") with open(self.load_path / self.templates_filename) as fl: self.templates = json.load(fl) def save(self) -> None: raise NotImplementedError def __call__(self, question: str, entities_from_ner: List[str]) -> \ Tuple[Union[List[str], list], list, Union[list, Any], Union[list, Any], Union[str, Any], Union[list, Any], Union[str, Any], Union[list, Any], Union[str, Any]]: question = question.lower() question = self.sanitize(question) question_length = len(question) entities, types, relations, relation_dirs = [], [], [], [] query_type = "" template_found = "" entity_types = [] template_answer = "" answer_types = [] results = self.pool.map(RegexpMatcher(question), self.templates) results = functools.reduce(lambda x, y: x + y, results) replace_tokens = [("the uk", "united kingdom"), ("the us", "united states")] if results: min_length = 100 for result in results: found_ent, template = result positions_entity_tokens = template["positions_entity_tokens"] positions_type_tokens = template["positions_type_tokens"] positions_unuseful_tokens = template["positions_unuseful_tokens"] template_len = template["template_len"] template_found = template["template"] entities_cand = [found_ent[pos].replace('?', '') for pos in positions_entity_tokens] types_cand = [found_ent[pos].replace('?', '').split(',')[0] for pos in positions_type_tokens] unuseful_tokens = [found_ent[pos].replace('?', '') for pos in positions_unuseful_tokens] entity_lengths = [len(entity) for entity in entities_cand] entity_num_tokens = all([len(entity.split(' ')) < 6 for entity in entities_cand]) type_lengths = [len(entity_type) for entity_type in types_cand] unuseful_tokens_len = sum([len(unuseful_tok) for unuseful_tok in unuseful_tokens]) log.debug(f"found template: {template}, {found_ent}") match, entities_cand = self.match_template_and_ner(entities_cand, entities_from_ner, template_found) if match and (0 
not in entity_lengths or 0 not in type_lengths and entity_num_tokens): cur_len = sum(entity_lengths) + sum(type_lengths) log.debug(f"lengths: entity+type {cur_len}, question {question_length}, " f"template {template_len}, unuseful tokens {unuseful_tokens_len}") if cur_len < min_length and unuseful_tokens_len + template_len + cur_len == question_length: entities = entities_cand for old_token, new_token in replace_tokens: entities = [entity.replace(old_token, new_token) for entity in entities] types = types_cand relations = template["relations"] relation_dirs = template["rel_dirs"] query_type = template["template_type"] entity_types = template.get("entity_types", []) template_answer = template.get("template_answer", "") answer_types = template.get("answer_types", []) min_length = cur_len return entities, types, relations, relation_dirs, query_type, entity_types, template_answer, answer_types, \ template_found def sanitize(self, question: str) -> str: question = re.sub(r"^(a |the )", '', question) date_interval = re.findall("([\d]{4}-[\d]{4})", question) if date_interval: question = question.replace(date_interval[0], '') question = question.replace(' ', ' ') return question def match_template_and_ner(self, entities_cand: List[str], entities_from_ner: List[str], template: str): entities_from_ner = [entity.lower() for entity in entities_from_ner] entities_from_ner = [re.sub(r"^(a |the )", '', entity) for entity in entities_from_ner] entities_cand = [re.sub(r"^(a |the )", '', entity) for entity in entities_cand] entities_cand = [entity.strip() for entity in entities_cand] log.debug(f"entities_cand {entities_cand} entities_from_ner {entities_from_ner}") match = set(entities_cand) == set(entities_from_ner) or not entities_from_ner or template == "how to xxx?" 
return match, entities_cand ================================================ FILE: deeppavlov/models/kbqa/tree_to_sparql.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import re from collections import defaultdict from io import StringIO from logging import getLogger from typing import Any, List, Tuple, Dict, Union import spacy from navec import Navec from razdel import tokenize from slovnet import Syntax from udapi.block.read.conllu import Conllu from udapi.core.node import Node from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.file import read_json from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.core.models.serializable import Serializable from deeppavlov.models.kbqa.ru_adj_to_noun import RuAdjToNoun from deeppavlov.models.kbqa.utils import preprocess_template_queries log = getLogger(__name__) @register('slovnet_syntax_parser') class SlovnetSyntaxParser(Component, Serializable): """Class for syntax parsing using Slovnet library""" def __init__(self, load_path: str, navec_filename: str, syntax_parser_filename: str, tree_patterns_filename: str, **kwargs): super().__init__(save_path=None, load_path=load_path) self.navec_filename = expand_path(navec_filename) self.syntax_parser_filename = expand_path(syntax_parser_filename) self.tree_patterns = 
def load(self) -> None:
    """Load the Navec embeddings and the Slovnet syntax model into ``self.syntax``."""
    navec = Navec.load(self.navec_filename)
    self.syntax = Syntax.load(self.syntax_parser_filename)
    self.syntax.navec(navec)


def save(self) -> None:
    """Intentionally a no-op."""
    pass


def preprocess_sentences(self, sentences, entity_offsets_batch):
    """Simplify sentences before syntactic parsing and remember what was collapsed.

    For every sentence the method capitalises entities in all-lowercase input,
    collapses initials + surname, quoted multi-word titles and
    "DET DET NOUN" triples into a single token, and records each collapse in a
    dictionary ``{kept_token: (original_substring, kind)}`` where ``kind`` is
    ``"name"`` or ``"adj"``.

    Args:
        sentences: batch of sentence strings.
        entity_offsets_batch: for every sentence, ``(start, end)`` character offsets of entities.

    Returns:
        Tuple ``(sentences_tokens_batch, replace_dict_batch)``.
    """
    sentences_tokens_batch, replace_dict_batch = [], []
    for sentence, entity_offsets in zip(sentences, entity_offsets_batch):
        if sentence.islower():
            for start, end in entity_offsets:
                entity_old = sentence[start:end]
                if entity_old:
                    entity_new = f"{entity_old[0].upper()}{entity_old[1:]}"
                    sentence = sentence.replace(entity_old, entity_new)
            # NOTE(review): nesting inferred from a whitespace-collapsed source; kept under the
            # islower() guard, which also protects against indexing an empty sentence.
            sentence = f"{sentence[0].upper()}{sentence[1:]}"

        replace_dict = {}
        # Initials followed by a surname ("А. С. Пушкин", then "А. Пушкин") collapse to the surname.
        for pattern in (r"([\w]{1}\.)([ ]?)([\w]{1}\.)([ ])([\w]{3,})", r"([\w]{1}\.)([ ])([\w]{3,})"):
            for name in re.findall(pattern, sentence):
                names_str = "".join(name)
                replace_dict[name[-1]] = (names_str, "name")
                sentence = sentence.replace(names_str, name[-1])

        # Quoted multi-word titles collapse to their first noun (or first token).
        works_of_art = re.findall(r'(["«])(.*?)(["»])', sentence)
        for _symb_start, work_of_art, _symb_end in works_of_art:
            work_of_art_tokens = re.findall(self.re_tokenizer, work_of_art)
            if len(work_of_art.split()) > 1:
                short_substr = ""
                for tok in work_of_art_tokens:
                    if self.nlp(tok)[0].pos_ == "NOUN":
                        short_substr = tok
                        break
                if not short_substr:
                    short_substr = work_of_art_tokens[0]
                replace_dict[short_substr] = (work_of_art, "name")
                sentence = sentence.replace(work_of_art, short_substr)

        # Repeatedly collapse "DET DET NOUN" triples to the noun until nothing changes.
        while True:
            tokens = sentence.split()
            found_substr = False
            for i in range(len(tokens) - 2):
                window = tokens[i:i + 3]
                found = all(len(tok) >= 2 and tok[0] not in '("' and tok[-1] not in '"),.?' for tok in window)
                if found and i > 0:
                    token_tags = [self.nlp(tok)[0].pos_ for tok in window]
                    lemm_tokens = {self.nlp(tok)[0].lemma_ for tok in window}
                    if token_tags == ["DET", "DET", "NOUN"] and not lemm_tokens & self.first_tokens:
                        long_substr = " ".join(window)
                        new_sentence = sentence.replace(long_substr, tokens[i + 2])
                        # The joined window is absent from the sentence when the tokens are separated
                        # by irregular whitespace; without this check the loop would never terminate.
                        if new_sentence != sentence:
                            replace_dict[tokens[i + 2]] = (long_substr, "adj")
                            sentence = new_sentence
                            found_substr = True
                if found_substr:
                    break
            if not found_substr:
                break

        sentence_tokens = [tok.text for tok in tokenize(sentence)]
        sentences_tokens_batch.append(sentence_tokens)
        log.debug(f"replace_dict: {replace_dict} --- sentence: {sentence_tokens}")
        replace_dict_batch.append(replace_dict)
    return sentences_tokens_batch, replace_dict_batch


def get_markup(self, proc_syntax_batch, replace_dict_batch):
    """Turn parser output into token dictionaries and re-expand collapsed substrings.

    Every token becomes ``{"id", "text", "head_id", "rel"}``.  For each entry of the
    replace dictionary produced by ``preprocess_sentences`` the kept token is replaced
    by the tokens of the original substring; ids and head ids of the remaining tokens
    are shifted accordingly.

    Args:
        proc_syntax_batch: parser results, each exposing ``tokens`` with ``id``, ``text``, ``head_id``, ``rel``.
        replace_dict_batch: per-sentence ``{kept_token: (original_substring, kind)}``.

    Returns:
        List of token-dictionary lists, one per sentence.
    """
    markup_batch = []
    for proc_syntax, replace_dict in zip(proc_syntax_batch, replace_dict_batch):
        markup_list = [{"id": elem.id, "text": elem.text, "head_id": int(elem.head_id), "rel": elem.rel}
                       for elem in proc_syntax.tokens]
        ids, words, head_ids, rels = self.get_elements(markup_list)
        head_ids, markup_list = self.correct_cycle(ids, head_ids, rels, markup_list)
        for substr, (substr_full, substr_type) in replace_dict.items():
            found_n = -1
            for n, markup_elem in enumerate(markup_list):
                if markup_elem["text"] == substr:
                    found_n = n
            if found_n > -1:
                before_markup_list = copy.deepcopy(markup_list[:found_n])
                after_markup_list = copy.deepcopy(markup_list[found_n + 1:])
                substr_tokens = [tok.text for tok in tokenize(substr_full)]
                n_tokens = len(substr_tokens)
                shift = n_tokens - 1

                # Head ids are ints on the first pass but strings after an earlier substitution,
                # so always normalise with int() before comparing or adding.
                old_rel = markup_list[found_n]["rel"]
                old_head = int(markup_list[found_n]["head_id"])
                if old_head >= found_n + 1:
                    old_head += shift

                new_markup_list = []
                if substr_type == "name":
                    # First token inherits the relation; the rest attach to it as flat:name.
                    for j, text in enumerate(substr_tokens):
                        new_elem = {"id": str(found_n + j + 1), "text": text}
                        if j == 0:
                            new_elem["rel"] = old_rel
                            new_elem["head_id"] = old_head
                        else:
                            new_elem["rel"] = "flat:name"
                            new_elem["head_id"] = str(found_n + 1)
                        new_markup_list.append(new_elem)
                elif substr_type == "adj":
                    # Last token (the noun) inherits the relation; preceding tokens modify it.
                    for j, text in enumerate(substr_tokens):
                        new_elem = {"id": str(found_n + j + 1), "text": text}
                        if j == n_tokens - 1:
                            new_elem["rel"] = old_rel
                            new_elem["head_id"] = old_head
                        else:
                            new_elem["rel"] = "amod"
                            new_elem["head_id"] = str(found_n + n_tokens)
                        new_markup_list.append(new_elem)

                for elem in before_markup_list + after_markup_list:
                    head_id = int(elem["head_id"])
                    if head_id > found_n + 1:
                        head_id += shift
                    if head_id == found_n + 1 and substr_type == "adj":
                        # Dependents of the collapsed token now hang off the noun at the end.
                        head_id = found_n + n_tokens
                    elem["head_id"] = head_id
                for elem in after_markup_list:
                    elem["id"] = str(int(elem["id"]) + shift)

                markup_list = before_markup_list + new_markup_list + after_markup_list
                for elem in markup_list:
                    elem["head_id"] = str(elem["head_id"])
        markup_batch.append(markup_list)
    return markup_batch


def find_cycle(self, ids, head_ids):
    """Return the 1-based index of the first token in a two-token head cycle, or -1.

    Head ids are compared through ``str()`` so that integer and string heads are
    treated alike, as ``correct_cycle`` already does.
    """
    heads = [str(head_id) for head_id in head_ids]
    for i in range(len(ids)):
        for j in range(i + 1, len(ids)):
            if heads[j] == str(i + 1) and heads[i] == str(j + 1):
                return i + 1
    return -1


def correct_markup(self, words, head_ids, rels, root_n):
    """Apply the hand-written tree patterns to a parsed sentence of more than three words.

    The first pattern from ``self.tree_patterns`` whose first-word, relation and
    part-of-speech constraints all match overrides the listed relations, head ids and
    the root position.

    Returns:
        Tuple ``(head_ids, rels, root_n)``; the lists are modified in place.
    """
    if len(words) > 3:
        pos = [self.nlp(word)[0].pos_ for word in words]
        for tree_pattern in self.tree_patterns:
            first_word = tree_pattern.get("first_word", "")
            (r_start, r_end), rel_info = tree_pattern.get("rels", [[0, 0], ""])
            (p_start, p_end), pos_info = tree_pattern.get("pos", [[0, 0], ""])
            if (not first_word or words[0].lower() in self.pronouns[first_word]) \
                    and (not rel_info or rels[r_start:r_end] == rel_info) \
                    and (not pos_info or pos[p_start:p_end] == pos_info):
                for ind, deprel in tree_pattern.get("rel_ids", {}).items():
                    rels[int(ind)] = deprel
                for ind, head_id in tree_pattern.get("head_ids", {}).items():
                    head_ids[int(ind)] = head_id
                root_n = tree_pattern["root_n"]
                break
        # NOTE(review): kept under the length guard because it reads ``pos``, which only exists here.
        if words[0].lower() in {"какой", "какая", "какое"} and rels[:3] == ["det", "obj", "root"] \
                and pos[1:3] == ["NOUN", "VERB"] and "nsubj" not in rels:
            rels[1] = "nsubj"
    return head_ids, rels, root_n


def find_root(self, rels):
    """Return the 1-based position of the first ``"root"`` relation, or -1 if there is none."""
    for n, rel in enumerate(rels):
        if rel == "root":
            return n + 1
    return -1


def get_elements(self, markup_elem):
    """Split a list of token dictionaries into parallel ``ids, words, head_ids, rels`` lists."""
    ids = [elem["id"] for elem in markup_elem]
    words = [elem["text"] for elem in markup_elem]
    head_ids = [elem["head_id"] for elem in markup_elem]
    rels = [elem["rel"] for elem in markup_elem]
    return ids, words, head_ids, rels


def correct_cycle(self, ids, head_ids, rels, markup_elem):
    """Re-attach the last self-referencing token to the root.

    Returns:
        Tuple ``(head_ids, markup_elem)``; both are modified in place.
    """
    cycle_num = -1
    for n, (elem_id, head_id) in enumerate(zip(ids, head_ids)):
        if str(head_id) == str(elem_id):
            cycle_num = n
    root_n = self.find_root(rels)
    # NOTE(review): ``> 0`` leaves a self-loop on the very first token untouched — confirm intended.
    if cycle_num > 0 and root_n > -1:
        head_ids[cycle_num] = root_n
        markup_elem[cycle_num]["head_id"] = root_n
    return head_ids, markup_elem


def process_markup(self, markup_batch):
    """Repair the dependency markup and serialise every sentence as CoNLL-U text.

    A missing root is chosen from self-referencing tokens, then ``nsubj`` tokens, then
    verbs; tree patterns are applied; a final "?" is attached to the root; simple
    cycles are broken.

    Returns:
        List of CoNLL-U strings (ten tab-separated columns per token).
    """
    processed_markup_batch = []
    for markup_elem in markup_batch:
        ids, words, head_ids, rels = self.get_elements(markup_elem)
        if "root" not in {rel.lower() for rel in rels}:
            found_root = False
            for n, (elem_id, head_id) in enumerate(zip(ids, head_ids)):
                if elem_id == head_id:
                    rels[n] = "root"
                    head_ids[n] = 0
                    found_root = True
            if not found_root:
                for n in range(len(ids)):
                    if rels[n] == "nsubj":
                        rels[n] = "root"
                        head_ids[n] = 0
                        found_root = True
            if not found_root:
                for n in range(len(ids)):
                    if self.nlp(words[n])[0].pos_ == "VERB":
                        rels[n] = "root"
                        head_ids[n] = 0

        root_n = self.find_root(rels)
        head_ids, rels, root_n = self.correct_markup(words, head_ids, rels, root_n)
        # ``words`` is empty for an empty sentence; guard the [-1] lookups.
        if words and words[-1] == "?" and -1 < root_n != head_ids[-1]:
            head_ids[-1] = root_n
        head_ids, markup_elem = self.correct_cycle(ids, head_ids, rels, markup_elem)
        i = self.find_cycle(ids, head_ids)
        if i == 1 and root_n > -1:
            head_ids[i - 1] = root_n

        processed_markup = [f"{elem_id}\t{word}\t_\t_\t_\t_\t{head_id}\t{rel}\t_\t_"
                            for elem_id, word, head_id, rel in zip(ids, words, head_ids, rels)]
        processed_markup_batch.append("\n".join(processed_markup))
    return processed_markup_batch


def __call__(self, sentences, entity_offsets_batch):
    """Parse a batch of sentences and return their corrected CoNLL-U markup."""
    sentences_tokens_batch, substr_dict_batch = self.preprocess_sentences(sentences, entity_offsets_batch)
    proc_syntax_batch = list(self.syntax.map(sentences_tokens_batch))
    markup_batch = self.get_markup(proc_syntax_batch, substr_dict_batch)
    processed_markup_batch = self.process_markup(markup_batch)
    return processed_markup_batch
def __init__(self, sparql_queries_filename: str, syntax_parser: Component, kb_prefixes: Dict[str, str],
             adj_to_noun: RuAdjToNoun = None, **kwargs):
    """

    Args:
        sparql_queries_filename: file with sparql query templates
        syntax_parser: component for syntactic parsing of the input question
        kb_prefixes: prefixes for entities, relations and types in the knowledge base
        adj_to_noun: component deeppavlov.models.kbqa.tree_to_sparql:RuAdjToNoun
        **kwargs:
    """
    # Russian question words; a token from this set marks the unknown part of the question.
    self.q_pronouns = {"какой", "какая", "какое", "каком", "каким", "какую", "кто", "что", "как", "когда", "где",
                       "чем", "сколько"}
    self.how_many = "сколько"
    self.change_root_tokens = {"каким был", "какой была"}
    self.first_tokens = {"первый", "первая", "первое"}
    self.last_tokens = {"последний"}
    self.begin_tokens = {"начинать", "начать"}
    self.end_tokens = {"завершить", "завершать", "закончить"}
    self.ranking_tokens = {"самый"}
    self.date_tokens = {"год", "месяц"}
    self.nlp = spacy.load("ru_core_news_sm")
    self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
    self.sparql_queries_filename = expand_path(sparql_queries_filename)
    template_queries = read_json(self.sparql_queries_filename)
    self.template_queries = preprocess_template_queries(template_queries, kb_prefixes)
    self.syntax_parser = syntax_parser
    self.adj_to_noun = adj_to_noun


def __call__(self, questions_batch: List[str], substr_batch: List[List[str]], tags_batch: List[List[str]],
             offsets_batch: List[List[List[int]]], positions_batch: List[List[List[int]]],
             probas_batch: List[List[float]]) -> Tuple[
    List[Union[str, Any]], List[Union[List[str], List[Union[str, Any]]]], List[Union[List[str], Any]], List[
        Union[List[Union[str, Any]], Any]], List[Union[List[Union[float, Any]], Any]], List[List[int]], List[
        Union[List[str], List[Any]]]]:
    """Choose SPARQL template numbers and the entities to link for every question.

    Args:
        questions_batch: question texts.
        substr_batch: entity substrings found in every question.
        tags_batch: tags of those substrings.
        offsets_batch: character offsets of the substrings.
        positions_batch: token positions of the substrings.
        probas_batch: confidences of the substrings.

    Returns:
        Tuple ``(clean_questions, query_nums, substrings, tags, probas, entities_to_link, types)``,
        every element being a batch-sized list.
    """
    substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch = \
        self.sort_substr(substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch)
    log.debug(f"substr: {substr_batch} tags: {tags_batch} positions: {positions_batch}")
    query_nums_batch, s_substr_batch, s_tags_batch, s_probas_batch, types_batch = [], [], [], [], []
    entities_to_link_batch = []
    clean_questions_batch = []
    for question, substr_list, tags_list, offsets_list, probas_list, positions in \
            zip(questions_batch, substr_batch, tags_batch, offsets_batch, probas_batch, positions_batch):
        # Reset for every question: previously the flag was initialised once before the loop,
        # so a single "how many" question turned all later questions of the batch into counts.
        count = False
        entities_dict, probas_dict = {}, {}
        for substr, tag, proba in zip(substr_list, tags_list, probas_list):
            entities_dict[substr.lower()] = tag
            probas_dict[substr.lower()] = proba

        # Hyphenated entities occupy a different number of tokens; realign token positions.
        for i in range(len(substr_list)):
            substr = substr_list[i]
            if len(substr) > 2 and ("-" in substr or f"{substr}-" in question) and " - " not in substr:
                if "-" in substr:
                    length = len(re.findall(self.re_tokenizer, substr))
                else:
                    length = 3
                substr_tokens = list(tokenize(substr))
                positions[i] = [positions[i][j] for j in range(len(substr_tokens))]
                if i < len(substr_list) - 1:
                    for j in range(i + 1, len(substr_list)):
                        pos_inds = positions[j]
                        pos_inds = [ind - length + 1 for ind in pos_inds]
                        positions[j] = pos_inds

        root, tree, tree_desc, unknown_node, unknown_branch = self.syntax_parse(question, offsets_list)
        # Defaults used when the syntax tree gives no usable unknown node.
        query_nums = ["7"]
        s_substr_list = substr_list
        s_tags_list = tags_list
        s_probas_list = probas_list
        types_list = []
        if unknown_node:
            log.debug(f"syntax tree info 1, unknown node: {unknown_node.form}, unkn branch: {unknown_branch.form}")
            log.debug(f"wh_leaf: {self.wh_leaf}")
            clause_node, clause_branch = self.find_clause_node(root, unknown_branch)
            log.debug(f"clause node: {clause_node}")
            tok_and_ord = {node.ord: node for node in tree.descendants}
            appos_token_nums = sorted(self.find_appos_tokens(root, tok_and_ord, []))
            appos_tokens = [elem.form for elem in tree_desc if elem.ord in appos_token_nums]
            clause_token_nums = sorted(self.find_clause_tokens(root, tok_and_ord, clause_node))
            clause_tokens = [elem.form for elem in tree_desc if elem.ord in clause_token_nums]
            log.debug(f"appos tokens: {appos_tokens}")
            log.debug(f"clause_tokens: {clause_tokens}")
            question, ranking_tokens = self.sanitize_question(tree, root, appos_token_nums, clause_token_nums)
            if appos_token_nums or clause_token_nums:
                # Tokens were removed, so the question has to be parsed again.
                root, tree, tree_desc, unknown_node, unknown_branch = self.syntax_parse(question, offsets_list)
                log.debug(f"syntax tree info 2, unknown node: {unknown_node}, unkn branch: {unknown_branch}")

            # NOTE(review): nesting below reconstructed from a whitespace-collapsed source.
            if unknown_node:
                modifiers, clause_modifiers = self.find_modifiers_of_unknown(unknown_node)
                log.debug(f"modifiers: {modifiers} --- clause modifiers: {[nd.form for nd in clause_modifiers]}")
                if f"{tree_desc[0].form.lower()} {tree_desc[1].form.lower()}" in self.change_root_tokens:
                    new_root = root.children[0]
                else:
                    new_root = root
                root_desc = defaultdict(list)
                for node in new_root.children:
                    if node.deprel not in ["punct", "advmod", "cop", "mark"]:
                        if node == unknown_branch:
                            root_desc[node.deprel].append(node)
                        else:
                            if self.find_entities(node, positions) or \
                                    (self.find_year_or_number(node) and node.deprel in ["obl", "nummod"]):
                                root_desc[node.deprel].append(node)

                if root.form.lower() == self.how_many or ("nsubj" in root_desc.keys() and
                                                          self.how_many in [nd.form.lower() for nd in
                                                                            root_desc["nsubj"]]):
                    count = True
                log.debug(f"root_desc {root_desc.keys()}")
                self.root_entity = False
                if root.ord - 1 in positions:
                    self.root_entity = True

                temporal_order = self.find_first_last(new_root)
                new_root_nf = self.nlp(new_root.form)[0].lemma_
                if new_root_nf in self.begin_tokens or new_root_nf in self.end_tokens:
                    temporal_order = new_root_nf
                query_nums, s_substr_list, types_list = self.build_query(new_root, unknown_branch, root_desc,
                                                                         unknown_node, modifiers, clause_modifiers,
                                                                         clause_node, positions, entities_dict,
                                                                         count, temporal_order, ranking_tokens)
                s_tags_list, s_probas_list = [], []
                for substr in s_substr_list:
                    substr = substr.replace(" - ", "-")
                    s_tags_list.append(entities_dict.get(substr.lower(), "E"))
                    s_probas_list.append(probas_dict.get(substr.lower(), 1.0))

        clean_questions_batch.append(question)
        if query_nums and s_substr_list:
            # Entities chosen by the tree come first (link flag 1); the rest are appended with flag 0.
            entities_to_link = [1 for _ in s_substr_list]
            s_substr_list_lower = [s.lower() for s in s_substr_list]
            for substr, tag, proba in zip(substr_list, tags_list, probas_list):
                if substr.lower() not in s_substr_list_lower:
                    s_substr_list.append(substr)
                    s_tags_list.append(tag)
                    s_probas_list.append(proba)
                    entities_to_link.append(0)
            s_substr_batch.append(s_substr_list)
            s_tags_batch.append(s_tags_list)
            s_probas_batch.append(s_probas_list)
            entities_to_link_batch.append(entities_to_link)
        else:
            # Fall back to templates selected only by the number and tags of the substrings.
            mod_len = 0
            gr_len = 1
            if all([tags_list[i] == tags_list[0] for i in range(len(tags_list))]):
                gr_len = len(substr_list)
            elif len(substr_list) > 1:
                mod_len = 1
            for num, template in self.template_queries.items():
                syntax_info = [gr_len, 0, mod_len, 0, False, False, False]
                if syntax_info == list(template["syntax_structure"].values()):
                    query_nums.append(num)
            entities_to_link = [1 for _ in s_substr_list]
            s_substr_batch.append(substr_list)
            s_tags_batch.append(tags_list)
            s_probas_batch.append(probas_list)
            entities_to_link_batch.append(entities_to_link)
        query_nums_batch.append(query_nums)
        types_batch.append(types_list)

    log.debug(f"clean_questions: {clean_questions_batch} --- substr: {s_substr_batch} --- tags: {s_tags_batch} "
              f"--- entities_to_link {entities_to_link_batch} --- types: {types_batch}")
    return clean_questions_batch, query_nums_batch, s_substr_batch, s_tags_batch, s_probas_batch, \
        entities_to_link_batch, types_batch


def sort_substr(self, substr_batch: List[List[str]], tags_batch: List[List[str]],
                offsets_batch: List[List[List[int]]], positions_batch: List[List[List[int]]],
                probas_batch: List[List[float]]) -> Tuple[
    List[List[str]], List[List[str]], List[List[List[int]]], List[List[List[int]]], List[List[float]]]:
    """Sort the entity substrings of every sample by their first token position.

    Tags, offsets, positions and probabilities are reordered together with the
    substrings.  Every positions entry must be non-empty.

    Returns:
        The five batches in the same order as the arguments, each sorted per sample.
    """
    s_substr_batch, s_tags_batch, s_offsets_batch, s_positions_batch, s_probas_batch = [], [], [], [], []
    for sample in zip(substr_batch, tags_batch, offsets_batch, positions_batch, probas_batch):
        # Each record is (substr, tag, offsets, positions, proba); sort key is the first position.
        substr_info = sorted(zip(*sample), key=lambda x: x[3][0])
        s_substr_batch.append([elem[0] for elem in substr_info])
        s_tags_batch.append([elem[1] for elem in substr_info])
        s_offsets_batch.append([elem[2] for elem in substr_info])
        s_positions_batch.append([elem[3] for elem in substr_info])
        s_probas_batch.append([elem[4] for elem in substr_info])
    return s_substr_batch, s_tags_batch, s_offsets_batch, s_positions_batch, s_probas_batch
def syntax_parse(self, question: str, entity_offsets_list: List[List[int]]) -> Tuple[
    Union[str, Any], Union[str, Any], Union[str, Any], str, str]:
    """Parse a question and locate the root and the branch containing the unknown.

    Returns:
        Tuple ``(root, tree, tree_desc, unknown_node, unknown_branch)``.  Elements that
        could not be determined are empty strings (``root`` may also be ``None`` when
        the tree has no root with children).
    """
    syntax_tree = self.syntax_parser([question], [entity_offsets_list])[0]
    log.debug(f"syntax tree: \n{syntax_tree}")
    root, tree, tree_desc, unknown_node, unknown_branch = "", "", "", "", ""
    try:
        tree = Conllu(filehandle=StringIO(syntax_tree)).read_tree()
        root = self.find_root(tree)
        tree_desc = tree.descendants
    except ValueError as e:
        log.warning(f"error in parsing syntax tree, {e}")
    if root:
        unknown_node, unknown_branch = self.find_branch_with_unknown(root)
        log.debug(f"syntax tree info, root: {root.form} unk_node: {unknown_node} unk_branch: {unknown_branch}")
    return root, tree, tree_desc, unknown_node, unknown_branch


def sanitize_question(self, tree: Node, root: Node, appos_token_nums: List[int], clause_token_nums: List[int]) -> \
        Tuple[str, list]:
    """Rebuild the question without apposition and clause tokens.

    When ranking tokens are present, those tokens and question pronouns are replaced
    by their lemmas.

    Returns:
        Tuple ``(question, ranking_tokens)``.
    """
    ranking_tokens = self.find_ranking_tokens(root, appos_token_nums, clause_token_nums)
    removed_token_nums = appos_token_nums + clause_token_nums  # built once instead of per node
    question_tokens = []
    for node in tree.descendants:
        if node.ord in removed_token_nums:
            continue
        if ranking_tokens and (node.ord in ranking_tokens or node.form.lower() in self.q_pronouns):
            question_tokens.append(self.nlp(node.form)[0].lemma_)
        else:
            question_tokens.append(node.form)
    question = " ".join(question_tokens)
    log.debug(f"sanitized question: {question}")
    return question, ranking_tokens


def find_root(self, tree: Node) -> Node:
    """Return the first node with relation ``"root"`` that has children, or ``None``."""
    for node in tree.descendants:
        if node.deprel == "root" and node.children:
            return node
    return None


def find_branch_with_unknown(self, root: Node) -> Tuple[str, str]:
    """Find the node standing for the unknown of the question and its top-level branch.

    Sets ``self.one_chain`` (the root itself is a question pronoun) and
    ``self.wh_leaf`` (a question pronoun is a childless child of the root) as a side
    effect.

    Returns:
        Tuple ``(unknown_node, unknown_branch)``, or ``("", "")`` when nothing is found.
    """
    self.wh_leaf = False
    self.one_chain = False
    root_form = root.form.lower()
    if root_form in self.q_pronouns:
        # Equality instead of a substring test, as in ``__call__``; identical for the pronoun set.
        if "nsubj" in [node.deprel for node in root.children] or root_form == self.how_many:
            self.one_chain = True
        else:
            # NOTE(review): this branch is only reached when no child is "nsubj", so the loop
            # cannot return; nesting reconstructed from a collapsed source — confirm upstream.
            for node in root.children:
                if node.deprel == "nsubj":
                    return node, node

    if not self.one_chain:
        for node in root.children:
            if node.form.lower() in self.q_pronouns:
                if node.children:
                    for child in node.children:
                        if child.deprel in ["nmod", "obl"]:
                            return child, node
                else:
                    self.wh_leaf = True
            else:
                for child in node.descendants:
                    if child.form.lower() in self.q_pronouns:
                        return child.parent, node

    if self.wh_leaf or self.one_chain:
        for node in root.children:
            if node.deprel in ["nsubj", "obl", "obj", "nmod", "xcomp"] \
                    and node.form.lower() not in self.q_pronouns:
                return node, node

    return "", ""


def find_modifiers_of_unknown(self, node: Node) -> Tuple[List[Union[str, Any]], list]:
    """Collect modifiers and clause modifiers among the children of ``node``.

    ``amod``/``nmod`` children and ``appos`` children that have children of their own
    become modifiers; when ``self.adj_to_noun`` finds a noun for the form, that string
    is stored instead of the node.  ``acl`` children become clause modifiers.

    Returns:
        Tuple ``(modifiers, clause_modifiers)``.
    """
    modifiers = []
    clause_modifiers = []
    for mod in node.children:
        if mod.deprel in ["amod", "nmod"] or (mod.deprel == "appos" and mod.children):
            noun_mod = self.adj_to_noun.search(mod.form) if self.adj_to_noun else ""
            modifiers.append(noun_mod if noun_mod else mod)
        if mod.deprel == "acl":
            clause_modifiers.append(mod)
    return modifiers, clause_modifiers


def find_clause_node(self, root: Node, unknown_branch: Node) -> Tuple[str, str]:
    """Return ``(acl_node, obl_branch)`` for the first ``obl`` child (other than the
    unknown branch) that has an ``acl`` child, or ``("", "")``."""
    for node in root.children:
        if node.deprel == "obl" and node != unknown_branch:
            for elem in node.children:
                if elem.deprel == "acl":
                    return elem, node
    return "", ""


def find_entities(self, node: Node, positions: List[List[int]]) -> List[str]:
    """Return the entities whose tokens all lie in the subtree of ``node``.

    Args:
        node: subtree root.
        positions: for every entity, the 0-based token indices it occupies.

    Returns:
        Entity strings (tokens joined with spaces, ``" ."`` glued back to ``"."``).
    """
    node_desc = [(node.form, node.ord, node.parent)] + \
                [(elem.form, elem.ord, elem.parent) for elem in node.descendants]
    node_desc = sorted(node_desc, key=lambda x: x[1])
    form_by_ord = {}
    for form, ord_, _parent in node_desc:
        form_by_ord.setdefault(ord_, form)

    entities_list = []
    for pos_elem in positions:
        # Token indices are 0-based while node ordinals are 1-based.
        entity = [form_by_ord[ind + 1] for ind in pos_elem if ind + 1 in form_by_ord]
        # An empty position list used to crash on ``parents[0]``; it is skipped now.
        if pos_elem and len(entity) == len(pos_elem):
            entities_list.append(" ".join(entity).replace(" .", "."))
    log.debug(f"node: {node.form} --- found_entities: {entities_list} --- node_desc: {node_desc} --- "
              f"positions: {positions}")
    return entities_list


def find_year_or_number(self, node: Node) -> bool:
    """Tell whether any descendant of ``node`` is a ``nummod`` or contains four digits in a row."""
    return any(elem.deprel == "nummod" or bool(re.findall(r"[\d]{4}", elem.form)) for elem in node.descendants)
def find_year_constraint(self, node: Node) -> list:
    """Look for a year or a year range in the text of the subtree of ``node``.

    Returns:
        The ``re.findall`` result of the first matching pattern: a list of year strings
        for "в <year> году" (one group) or a list of ``(start, end)`` tuples for
        "с <year> по <year>" (two groups); an empty list when nothing matches.
    """
    node_desc = [(node.form, node.ord)] + [(elem.form, elem.ord) for elem in node.descendants]
    node_desc = sorted(node_desc, key=lambda x: x[1])
    desc_text = " ".join([elem[0] for elem in node_desc])
    # Glue punctuation back to the preceding token.
    for symb in ".,:;)":
        desc_text = desc_text.replace(f" {symb}", symb)
    for pattern in [r"в ([\d]{3,4}) году", r"с ([\d]{3,4}) по ([\d]{3,4})"]:
        fnd = re.findall(pattern, desc_text)
        if fnd:
            return fnd
    return []


def find_appos_tokens(self, node: Node, tok_and_ord: Dict[int, Node],
                      appos_token_nums: List[int]) -> List[int]:
    """Recursively collect ordinals of apposition subtrees that follow a punctuation token.

    An ``appos`` child qualifies when the token right before it is punctuation, its
    descendants are not all ``appos``/``flat:name`` and none of them is an opening
    quote or bracket.  Other children are searched recursively.

    Args:
        node: subtree root.
        tok_and_ord: mapping from token ordinal to node.
        appos_token_nums: accumulator, extended in place.

    Returns:
        The accumulator with the found ordinals appended.
    """
    for elem in node.children:
        e_desc = elem.descendants
        if elem.deprel == "appos" and elem.ord > 1 and tok_and_ord[elem.ord - 1].deprel == "punct" \
                and not all([nd.deprel in {"appos", "flat:name"} for nd in e_desc]) \
                and not ({"«", '"', '``', '('} & {nd.form for nd in e_desc}):
            appos_token_nums.append(elem.ord)
            appos_token_nums.extend(desc.ord for desc in e_desc)
        else:
            appos_token_nums = self.find_appos_tokens(elem, tok_and_ord, appos_token_nums)
    return appos_token_nums


def find_clause_tokens(self, node: Node, tok_and_ord: Dict[int, Node], clause_node: Node) -> List[int]:
    """Collect ordinals of ``acl`` subtrees directly under ``node``, except ``clause_node``.

    Returns:
        List of token ordinals.
    """
    clause_token_nums = []
    for elem in node.children:
        if elem != clause_node and elem.deprel == "acl":
            clause_token_nums.append(elem.ord)
            clause_token_nums.extend(desc.ord for desc in elem.descendants)
        else:
            # NOTE(review): recurses into find_appos_tokens, not find_clause_tokens, so deeper
            # levels contribute apposition tokens only — confirm this is intended.
            clause_token_nums = self.find_appos_tokens(elem, tok_and_ord, clause_token_nums)
    return clause_token_nums


def find_first_last(self, node: Node) -> str:
    """Search the subtree level by level for a "first"/"last" adjective.

    A node matches when it has both ``amod`` and ``nmod`` children and the lemma of an
    ``amod`` child is in ``self.first_tokens`` or ``self.last_tokens``.

    Returns:
        The matching lemma(s) joined with a space, or an empty string.
    """
    order_tokens = self.first_tokens | self.last_tokens
    level = [node]
    while level:
        for current in level:
            node_desc = defaultdict(set)
            for elem in current.children:
                normal_form = self.nlp(elem.form.lower())[0].lemma_
                node_desc[elem.deprel].add(normal_form)
            log.debug(f"find_first_last {node_desc}")
            if "amod" in node_desc.keys() and "nmod" in node_desc.keys() and \
                    node_desc["amod"].intersection(order_tokens):
                return ' '.join(node_desc["amod"].intersection(order_tokens))
        level = [elem for current in level for elem in current.children]
    return ""


def find_ranking_tokens(self, node: Node, appos_token_nums: List[int], clause_token_nums: List[int]) -> list:
    """Return ``[ord, parent_ord]`` of the first superlative marker outside removed tokens.

    Returns:
        Two ordinals for the first descendant whose lemma is in ``self.ranking_tokens``,
        or an empty list.
    """
    removed_token_nums = appos_token_nums + clause_token_nums
    for elem in node.descendants:
        if self.nlp(elem.form)[0].lemma_ in self.ranking_tokens and elem.ord not in removed_token_nums:
            return [elem.ord, elem.parent.ord]
    return []


@staticmethod
def choose_grounded_entity(grounded_entities: List[str], entities_dict: Dict[str, str]):
    """Pick the single most relevant entity out of several candidates.

    With differing tags the first entity tagged WORK_OF_ART, FAC, PERSON or GPE (in
    that priority) wins; with equal tags the first entity starting with an uppercase
    letter wins.  Otherwise the list is returned unchanged.

    Args:
        grounded_entities: candidate entity strings.
        entities_dict: lowercase entity string -> tag.

    Returns:
        A one-element list with the chosen entity, or ``grounded_entities`` itself.
    """
    tags = [entities_dict.get(entity.lower(), "") for entity in grounded_entities]
    if len(grounded_entities) > 1:
        if not all(tag == tags[0] for tag in tags):
            for f_tag in ["WORK_OF_ART", "FAC", "PERSON", "GPE"]:
                for entity, tag in zip(grounded_entities, tags):
                    if tag == f_tag:
                        return [entity]
        # ``entity[:1]`` instead of ``entity[0]``: an empty entity string must not raise.
        elif not all(entity[:1].islower() for entity in grounded_entities):
            for entity in grounded_entities:
                if entity[:1].isupper():
                    return [entity]
    return grounded_entities
def build_query(self, root: Node, unknown_branch: Node, root_desc: Dict[str, List[Node]],
                unknown_node: Node, unknown_modifiers: List[Node], clause_modifiers: List[Node],
                clause_node: Node, positions: List[List[int]], entities_dict: Dict[str, str],
                count: bool = False, temporal_order: str = "",
                ranking_tokens: List[str] = None) -> Tuple[
    List[str], List[str], List[str]]:
    """Select SPARQL template numbers and the entities/types that fill them.

    The sorted list of dependency relations of the root's children decides which
    children provide grounded entities, qualifier entities, modifiers and types.
    Template numbers are either appended directly ("0", "22", "23", "24") or found by
    comparing the resulting counts with ``syntax_structure`` of every template.

    NOTE(review): the source of this method was whitespace-collapsed; the nesting
    below is reconstructed from statement order and should be confirmed upstream.

    Args:
        root: root node used for the query.
        unknown_branch: child of the root that contains the unknown.
        root_desc: dependency relation -> relevant children of the root.
        unknown_node: node standing for the unknown.
        unknown_modifiers: modifiers of the unknown (nodes or noun strings).
        clause_modifiers: ``acl`` modifiers of the unknown.
        clause_node: clause node found by ``find_clause_node`` (or an empty string).
        positions: token positions of the entity substrings.
        entities_dict: lowercase entity substring -> tag.
        count: whether the question asks "how many".
        temporal_order: "first"/"last"/"begin"/"end" lemma, or an empty string.
        ranking_tokens: ordinals of superlative tokens; only logged here.

    Returns:
        Tuple ``(query_nums, entities_list, types_list)``.
    """
    query_nums = []
    grounded_entities_list, types_list, modifiers_list, qualifier_entities_list = [], [], [], []
    found_year_or_number = False
    order = False
    # One entry per relevant child of the root, sorted so it can be compared with fixed patterns.
    root_desc_deprels = []
    for key in root_desc.keys():
        for i in range(len(root_desc[key])):
            if key in {"nsubj", "obj", "obl", "iobj", "acl", "nmod", "xcomp", "cop"}:
                root_desc_deprels.append(key)
    root_desc_deprels = sorted(root_desc_deprels)
    log.debug(f"build_query: root_desc.keys, {root_desc_deprels}, positions {positions}, wh_leaf {self.wh_leaf}, "
              f"one_chain {self.one_chain}, temporal order {temporal_order}, ranking tokens {ranking_tokens}")
    if root_desc_deprels in [["nsubj", "obl"],
                             ["nsubj", "obj"],
                             ["nsubj", "xcomp"],
                             ["obj", "xcomp"],
                             ["nmod", "nsubj"],
                             ["obj", "obl"],
                             ["iobj", "nsubj"],
                             ["acl", "nsubj"],
                             ["cop", "nsubj", "obl"],
                             ["obj"],
                             ["obl"],
                             ["nmod"],
                             ["xcomp"],
                             ["nsubj"]]:
        if self.wh_leaf or self.one_chain:
            if root_desc_deprels == ["nsubj", "obl"]:
                grounded_entities_list = self.find_entities(root_desc["nsubj"][0], positions)
                if not grounded_entities_list:
                    grounded_entities_list = self.find_entities(root_desc["obl"][0], positions)
            else:
                for nodes in root_desc.values():
                    if nodes[0].form not in self.q_pronouns:
                        grounded_entities_list = self.find_entities(nodes[0], positions)
                        if grounded_entities_list:
                            break
        else:
            if self.root_entity:
                grounded_entities_list = [root.form]
            # The first branch other than the unknown one that holds an entity grounds the query;
            # the unknown node then supplies the answer type.
            for nodes in root_desc.values():
                if nodes[0] != unknown_branch:
                    grounded_entities_list = self.find_entities(nodes[0], positions)
                    if grounded_entities_list:
                        type_entity = unknown_node.form
                        types_list.append(type_entity)
                        break

            # NOTE(review): placement of the two blocks below inside this ``else`` is inferred.
            if unknown_modifiers:
                for n, modifier in enumerate(unknown_modifiers):
                    if isinstance(modifier, str):
                        modifiers_list.append(modifier)
                    else:
                        modifier_entities = self.find_entities(modifier, positions)
                        if modifier_entities:
                            modifiers_list += modifier_entities
            if clause_modifiers:
                found_year_or_number = self.find_year_or_number(clause_modifiers[0])
                if found_year_or_number:
                    query_nums.append("0")
                qualifier_entities_list = self.find_entities(clause_modifiers[0], positions)

    if root_desc_deprels == ["nsubj", "obl", "obl"]:
        grounded_entities_list = self.find_entities(root_desc["nsubj"][0], positions)
        for node in root_desc["obl"]:
            if node == unknown_branch:
                types_list.append(node.form)
            else:
                grounded_entities_list += self.find_entities(node, positions)

    if root_desc_deprels == ["nsubj", "obj", "obj"]:
        obj_desc = root_desc["obj"]
        qualifier_entities_list = self.find_entities(obj_desc[0], positions)
        grounded_entities_list = self.find_entities(obj_desc[1], positions)

    year_constraint = self.find_year_constraint(root)
    if root_desc_deprels == ["nmod", "nsubj"] and year_constraint:
        # The first element is a 2-tuple for a year range; for a single-year match it is the
        # year string itself, so ``len`` is then the number of digits.
        if len(year_constraint[0]) == 2:
            query_nums.append("24")
        elif len(year_constraint[0]) == 1:
            query_nums.append("0")

    if root_desc_deprels == ["obj", "xcomp"]:
        grounded_entities_list = self.find_entities(root_desc["xcomp"][0], positions)

    if (self.wh_leaf and root_desc_deprels in [["nsubj", "obj", "obl"], ["obj", "obl"]]) \
            or (root_desc_deprels in [["nsubj", "obj", "obl"], ["obl", "xcomp"]]
                and self.find_year_or_number(root_desc["obl"][0])):
        found_year_or_number = self.find_year_or_number(root_desc["obl"][0])
        nsubj_ent_list, obj_ent_list = [], []
        if "nsubj" in root_desc_deprels:
            nsubj_ent_list = self.find_entities(root_desc["nsubj"][0], positions)
        if "obj" in root_desc:
            obj_ent_list = self.find_entities(root_desc["obj"][0], positions)
        obl_ent_list = self.find_entities(root_desc["obl"][0], positions)
        log.debug(f"nsubj_ent: {nsubj_ent_list} --- obj_ent: {obj_ent_list} obl_ent: {obl_ent_list}")
        if self.wh_leaf:
            grounded_entities_list = obl_ent_list
            qualifier_entities_list = obj_ent_list
        elif not found_year_or_number and nsubj_ent_list and obl_ent_list:
            grounded_entities_list = nsubj_ent_list
            modifiers_list = obl_ent_list
        else:
            grounded_entities_list = obj_ent_list
        # NOTE(review): nesting level of this check is inferred.
        if found_year_or_number:
            query_nums.append("0")

    # Last resort: take the entities of the whole tree and keep the most relevant one.
    if not grounded_entities_list:
        grounded_entities_list = self.find_entities(root, positions)
        grounded_entities_list = self.choose_grounded_entity(grounded_entities_list, entities_dict)

    if clause_node:
        for node in clause_node.children:
            if node.deprel == "obj":
                grounded_entities_list = self.find_entities(node, positions)
            if self.find_year_or_number(node):
                query_nums.append("0")

        # NOTE(review): placement inside ``if clause_node`` is inferred.
        if not self.wh_leaf:
            type_entity = unknown_node.form
            types_list.append(type_entity)

    if root_desc_deprels == ["nmod", "nmod"]:
        grounded_entities_list = self.find_entities(root_desc["nmod"][0], positions)
        modifiers_list = self.find_entities(root_desc["nmod"][1], positions)

    if root_desc_deprels == ["nmod", "nsubj", "nummod"]:
        if not self.wh_leaf:
            grounded_entities_list = self.find_entities(root_desc["nmod"][0], positions)
            found_year_or_number = self.find_year_or_number(root_desc["nummod"][0])

    if temporal_order and not query_nums:
        # Ground the query with the first branch that contains an entity.
        for deprel in root_desc:
            for node in root_desc[deprel]:
                entities = self.find_entities(node, positions)
                if entities:
                    grounded_entities_list = entities
                    break
            if grounded_entities_list:
                break
        if temporal_order in self.first_tokens | self.begin_tokens:
            query_nums += ["22"]
        if temporal_order in self.last_tokens | self.end_tokens:
            query_nums += ["23"]
    log.debug(f"query_nums: {query_nums} --- year_constraint: {year_constraint}")

    if count:
        grounded_entities_list = self.find_entities(root, positions)
        grounded_entities_list = self.choose_grounded_entity(grounded_entities_list, entities_dict)

    entities_list = grounded_entities_list + qualifier_entities_list + modifiers_list
    # Single-word types whose lemma is a date word are dropped.
    types_list = [tp for tp in types_list
                  if not (len(tp.split()) == 1 and self.nlp(tp)[0].lemma_ in self.date_tokens)]
    gr_len = len(grounded_entities_list)
    types_len = len(types_list)
    mod_len = len(modifiers_list)
    qua_len = len(qualifier_entities_list)
    if qua_len or count:
        types_len = 0

    if not temporal_order and not query_nums:
        for num, template in self.template_queries.items():
            syntax_info = [gr_len, types_len, mod_len, qua_len, found_year_or_number, count, order]
            if syntax_info == list(template["syntax_structure"].values()):
                query_nums.append(num)
            if mod_len:
                # With modifiers present, also accept templates that expect no type.
                syntax_info[1] = 0
                if syntax_info == list(template["syntax_structure"].values()):
                    query_nums.append(num)

    log.debug(f"tree_to_sparql, grounded entities: {grounded_entities_list} --- types: {types_list} --- "
              f"modifier entities: {modifiers_list} --- qualifier entities: {qualifier_entities_list} --- "
              f"year_or_number {found_year_or_number} --- count: {count} --- order: {order} --- "
              f"query nums: {query_nums}")
    return query_nums, entities_list, types_list
@register('answer_types_extractor')
class AnswerTypesExtractor:
    """Class which defines answer types for the question"""

    def __init__(self, lang: str, types_filename: str, types_sets_filename: str,
                 num_types_to_return: int = 15, **kwargs):
        """

        Args:
            lang: language code, ``"@ru"`` for Russian or ``"@en"`` for English
            types_filename: filename with dictionary where keys are type ids and values are type labels
            types_sets_filename: filename with dictionary where keys are NER tags and values are Wikidata types
                corresponding to tags
            num_types_to_return: how many answer types to return for each question
            **kwargs:

        Raises:
            ValueError: if ``lang`` is neither ``"@en"`` nor ``"@ru"``.
        """
        self.lang = lang
        self.types_filename = str(expand_path(types_filename))
        self.types_sets_filename = str(expand_path(types_sets_filename))
        self.num_types_to_return = num_types_to_return
        if self.lang == "@en":
            self.stopwords = set(stopwords.words("english"))
            self.nlp = spacy.load("en_core_web_sm")
            self.pronouns = ["what"]
        elif self.lang == "@ru":
            self.stopwords = set(stopwords.words("russian"))
            self.nlp = spacy.load("ru_core_news_sm")
            self.pronouns = ["какой", "каком"]
        else:
            # Without this check ``nlp`` and ``pronouns`` stay undefined and the first call
            # fails with an unrelated AttributeError.
            raise ValueError(f"unsupported lang {lang!r}, expected '@en' or '@ru'")
        # NOTE: pickle executes arbitrary code on load; both files must come from a trusted source.
        with open(self.types_filename, 'rb') as fl:
            self.types_dict = pickle.load(fl)
        with open(self.types_sets_filename, 'rb') as fl:
            self.types_sets = pickle.load(fl)

    def __call__(self, questions_batch: List[str], entity_substr_batch: List[List[str]],
                 tags_batch: List[List[str]], types_substr_batch: List[List[str]] = None):
        """Define coarse answer types and fallback entities for every question.

        Args:
            questions_batch: question texts.
            entity_substr_batch: entity substrings found in every question.
            tags_batch: tags of the entity substrings.
            types_substr_batch: type substrings; computed from the questions when ``None``.

        Returns:
            Tuple ``(types_sets_batch, new_entity_substr_batch, new_tags_batch)``.  A
            question without entity substrings gets its first ``nsubj`` token as an
            entity tagged ``"MISC"``.
        """
        # NOTE(review): ``types_substr_batch`` is built here but never read or returned afterwards;
        # nesting of this part is reconstructed from a whitespace-collapsed source.
        if types_substr_batch is None:
            types_substr_batch = []
            for question, entity_substr_list in zip(questions_batch, entity_substr_batch):
                types_substr = []
                type_noun = ""
                doc = self.nlp(question)
                token_pos_dict = {}
                for n, token in enumerate(doc):
                    token_pos_dict[token.text] = n
                for token in doc:
                    if token.text.lower() in self.pronouns and token.head.dep_ in ["attr", "nsubj"]:
                        type_noun = token.head.text
                        if not any([type_noun in entity_substr.lower() for entity_substr in entity_substr_list]):
                            types_substr.append(type_noun)
                        break
                if type_noun:
                    for token in doc:
                        if token.head.text == type_noun and token.dep_ in ["amod", "compound"]:
                            type_adj = token.text
                            if not any([type_adj.lower() in entity_substr.lower()
                                        for entity_substr in entity_substr_list]):
                                types_substr.append(type_adj)
                            break
                        elif token.head.text == type_noun and token.dep_ == "prep":
                            if len(list(token.children)) == 1 \
                                    and not any([list(token.children)[0].text in entity_substr.lower()
                                                 for entity_substr in entity_substr_list]):
                                types_substr += [token.text, list(token.children)[0].text]
                elif any([word in question for word in self.pronouns]):
                    for token in doc:
                        if token.dep_ == "nsubj" and not any([token.text in entity_substr.lower()
                                                              for entity_substr in entity_substr_list]):
                            types_substr.append(token.text)

                # Restore the order in which the collected tokens occur in the question.
                types_substr = [(token, token_pos_dict[token]) for token in types_substr]
                types_substr = sorted(types_substr, key=lambda x: x[1])
                types_substr = " ".join([elem[0] for elem in types_substr])
                types_substr_batch.append(types_substr)

        types_sets_batch = [set() for _ in questions_batch]
        for n, (question, types_sets) in enumerate(zip(questions_batch, types_sets_batch)):
            question = question.lower()
            if not types_sets:
                if self.lang == "@ru":
                    if question.startswith("кто"):
                        types_sets_batch[n] = self.types_sets["PER"]
                    elif question.startswith("где"):
                        types_sets_batch[n] = self.types_sets["LOC"]
                    elif question.startswith(("когда", "в каком году", "в каком месяце")):
                        types_sets_batch[n] = {"date"}
                    elif len(question.split()) > 1 and (question.startswith(("кем ", "как"))
                                                        or question.split()[1].startswith("как")):
                        types_sets_batch[n] = {"not_date"}
                elif self.lang == "@en":
                    if question.startswith("who"):
                        types_sets_batch[n] = self.types_sets["PER"]
                    elif question.startswith("where"):
                        types_sets_batch[n] = self.types_sets["LOC"]
                    elif question.startswith(("when", "what year", "what month")):
                        types_sets_batch[n] = {"date"}

        new_entity_substr_batch, new_tags_batch = [], []
        for question, entity_substr_list, tags_list in zip(questions_batch, entity_substr_batch, tags_batch):
            if entity_substr_list:
                new_entity_substr_batch.append(entity_substr_list)
                new_tags_batch.append(tags_list)
                continue
            # No entities were detected: fall back to the first subject token of the question.
            new_entity_substr, new_tags = [], []
            doc = self.nlp(question)
            for token in doc:
                if token.dep_ == "nsubj":
                    new_entity_substr.append(token.text)
                    new_tags.append("MISC")
                    break
            new_entity_substr_batch.append(new_entity_substr)
            new_tags_batch.append(new_tags)

        return types_sets_batch, new_entity_substr_batch, new_tags_batch
def extract_number(question_tokens: List[str], question: str) -> str:
    """Extract a number mentioned in the question.

    A number in scientific notation (e.g. ``1.5e+10``) found anywhere in
    ``question`` takes precedence; otherwise the first token starting with a
    digit is used.

    Args:
        question_tokens: tokens of the question
        question: the question text

    Returns:
        the number as a string with the ordinal suffixes ``1st``/``2nd``/``3rd``
        reduced to digits, trailing periods removed and an integral fractional
        part (``"5.0"`` -> ``"5"``) dropped; an empty string if nothing is found.
    """
    number = ""
    fnd = re.search(r'.*(\d\.\d+e\+\d+)\D*', question)
    if fnd is not None:
        number = fnd.group(1)
    else:
        for tok in question_tokens:
            # tok[:1] instead of tok[0]: an empty token must not raise IndexError
            if tok[:1].isdigit():
                number = tok
                break

    number = number.replace('1st', '1').replace('2nd', '2').replace('3rd', '3')
    # str.strip(".0") removes *every* leading/trailing "0" and "." character
    # ("100" -> "1", "1.5e+10" -> "1.5e+1"), so trim the sentence-final period
    # and an all-zero fractional part explicitly instead.
    number = number.rstrip(".")
    if re.fullmatch(r"\d+\.0+", number):
        number = number.split(".")[0]
    return number
def query_from_triplets(query_triplets: List[str], answer_ent: List[str], query_info: Dict[str, str]) -> str:
    """Assemble a SPARQL SELECT query string from already filled triplets.

    Args:
        query_triplets: triplet strings which are joined with " . " to form the WHERE clause
        answer_ent: answer expressions of the template; if the first one starts with
            "count" (case-insensitive), a ``SELECT COUNT(...)`` query is produced
        query_info: dictionary which must contain the key "unk_var" (the selected variable)

    Returns:
        the query string
    """
    where_body = " . ".join(query_triplets)
    wants_count = bool(answer_ent) and answer_ent[0].lower().startswith("count")
    if wants_count:
        select_part = f"SELECT COUNT({query_info['unk_var']}) "
        sparql = select_part + f"WHERE {{ {where_body}. }}"
    else:
        sparql = f"SELECT {query_info['unk_var']} WHERE {{ {where_body}. }}"
    # a triplet which already ends with " ." would otherwise yield " .."
    return sparql.replace(" ..", ".")
preprocess_template_queries(template_queries: Dict[str, Any], kb_prefixes: Dict[str, str]) -> Dict[str, Any]: for template_num in template_queries: template = template_queries[template_num] query = template["query_template"] q_triplets = re.findall("{[ ]?(.*?)[ ]?}", query)[0].split(' . ') q_triplets = [triplet.split(' ')[:3] for triplet in q_triplets] if not "rel_types" in template: template["rel_types"] = ["direct" for _ in q_triplets] rel_types = template["rel_types"] rel_dirs, n_hops, entities, types, gr_ent, mod_ent, q_ent = [], [], set(), set(), set(), set(), set() for n, (triplet, rel_type) in enumerate(zip(q_triplets, rel_types)): if not triplet[1].startswith(kb_prefixes["type_rel"]): if triplet[2].startswith("?"): rel_dirs.append("forw") else: rel_dirs.append("backw") for ind in [0, 2]: if triplet[ind].startswith(kb_prefixes["entity"]): entities.add(triplet[ind]) elif triplet[ind].startswith(kb_prefixes["type"]): types.add(triplet[ind]) if rel_type in {"qualifier", "statement"}: if triplet[2].startswith(kb_prefixes["entity"]): q_ent.add(triplet[2]) else: if triplet[0].startswith(kb_prefixes["entity"]): gr_ent.add(triplet[0]) elif triplet[2].startswith(kb_prefixes["entity"]): mod_ent.add(triplet[2]) if triplet[1].startswith(kb_prefixes["rel"]) and triplet[0].startswith("?") and triplet[2].startswith("?"): n_hops.append("2-hop") elif n == 0 and len(q_triplets) == 2 and q_triplets[1][1].startswith(kb_prefixes["rel"]) \ and q_triplets[1][0].startswith("?") and q_triplets[1][2].startswith("?"): n_hops.append("1-of-2-hop") else: n_hops.append("1-hop") syntax_structure = {"gr_ent": len(gr_ent), "types": len(types), "mod_ent": len(mod_ent), "q_ent": len(q_ent), "year_or_number": False, "count": False, "order": False} if "filter" in query.lower(): syntax_structure["year_or_number"] = True if "order" in query.lower(): syntax_structure["order"] = True if "count" in query.lower(): syntax_structure["count"] = True if not "query_sequence" in template: 
template["query_sequence"] = list(range(1, len(q_triplets) + 1)) template["rel_dirs"] = rel_dirs template["n_hops"] = n_hops template["entities_and_types_num"] = [len(entities), len(types)] if entities: entities_str = '_'.join([str(num) for num in list(range(1, len(entities) + 1))]) else: entities_str = "0" if types: types_str = '_'.join([str(num) for num in list(range(1, len(types) + 1))]) else: types_str = "0" template["entities_and_types_select"] = f"{entities_str} {types_str}" template["syntax_structure"] = syntax_structure if "return_if_found" not in template: template["return_if_found"] = False if "priority" not in template: template["priority"] = 1 template_queries[template_num] = template return template_queries ================================================ FILE: deeppavlov/models/kbqa/wiki_parser.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import datetime import re from collections import namedtuple from logging import getLogger from typing import List, Tuple, Dict, Any, Union from hdt import HDTDocument from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.file import load_pickle, read_json from deeppavlov.core.common.registry import register log = getLogger(__name__) @register('wiki_parser') class WikiParser: """This class extract relations, objects or triplets from Wikidata HDT file.""" def __init__(self, wiki_filename: str, file_format: str = "hdt", prefixes: Dict[str, Union[str, Dict[str, str]]] = None, rel_q2name_filename: str = None, max_comb_num: int = 1e6, lang: str = "@en", **kwargs) -> None: """ Args: wiki_filename: file with Wikidata file_format: format of Wikidata file lang: Russian or English language **kwargs: """ if prefixes is None: prefixes = { "entity": "http://we", "label": "http://wl", "alias": "http://wal", "description": "http://wd", "rels": { "direct": "http://wpd", "no_type": "http://wp", "statement": "http://wps", "qualifier": "http://wpq", "type": "http://wpd/P31" }, "statement": "http://ws" } self.prefixes = prefixes self.file_format = file_format self.wiki_filename = str(expand_path(wiki_filename)) if self.file_format == "hdt": self.document = HDTDocument(self.wiki_filename) elif self.file_format == "pickle": self.document = load_pickle(self.wiki_filename) self.parsed_document = {} else: raise ValueError("Unsupported file format") self.used_rels = set() self.rel_q2name = dict() if rel_q2name_filename: if rel_q2name_filename.endswith("json"): self.rel_q2name = read_json(str(expand_path(rel_q2name_filename))) elif rel_q2name_filename.endswith("pickle"): self.rel_q2name = load_pickle(str(expand_path(rel_q2name_filename))) else: raise ValueError(f"Unsupported file format: {rel_q2name_filename}") self.max_comb_num = max_comb_num self.lang = lang self.replace_tokens = [('"', ''), (self.lang, " "), ('$', ' '), (' ', ' ')] def __call__(self, 
parser_info_list: List[str], queries_list: List[Any]) -> List[Any]: wiki_parser_output = self.execute_queries_list(parser_info_list, queries_list) return wiki_parser_output def execute_queries_list(self, parser_info_list: List[str], queries_list: List[Any]): wiki_parser_output = [] query_answer_types = [] for parser_info, query in zip(parser_info_list, queries_list): if parser_info == "query_execute": answers, found_rels, found_combs = [], [], [] try: what_return, rels_from_query, query_seq, filter_info, order_info, answer_types, rel_types, \ return_if_found = query if answer_types: query_answer_types = answer_types answers, found_rels, found_combs = \ self.execute(what_return, rels_from_query, query_seq, filter_info, order_info, query_answer_types, rel_types) except ValueError: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append([answers, found_rels, found_combs]) elif parser_info == "find_rels": rels = [] try: rels = self.find_rels(*query) except: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(rels) elif parser_info == "find_rels_2hop": rels = [] try: rels = self.find_rels_2hop(*query) except ValueError: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output += rels elif parser_info == "find_object": objects = [] try: objects = self.find_object(*query) except: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(objects) elif parser_info == "check_triplet": check_res = False try: check_res = self.check_triplet(*query) except: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(check_res) elif parser_info == "find_label": label = "" try: label = self.find_label(*query) except: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(label) elif parser_info == "find_types": types = [] try: types = self.find_types(query) except: log.warning("Wrong arguments are passed to wiki_parser") 
wiki_parser_output.append(types) elif parser_info == "fill_triplets": filled_triplets = [] try: filled_triplets = self.fill_triplets(*query) except ValueError: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(filled_triplets) elif parser_info == "find_triplets": if self.file_format == "hdt": triplets = [] try: triplets_forw, c = self.document.search_triples(f"{self.prefixes['entity']}/{query}", "", "") triplets.extend([triplet for triplet in triplets_forw if not triplet[2].startswith(self.prefixes["statement"])]) triplets_backw, c = self.document.search_triples("", "", f"{self.prefixes['entity']}/{query}") triplets.extend([triplet for triplet in triplets_backw if not triplet[0].startswith(self.prefixes["statement"])]) except: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(list(triplets)) else: triplets = {} try: triplets = self.document.get(query, {}) except: log.warning("Wrong arguments are passed to wiki_parser") uncompressed_triplets = {} if triplets: if "forw" in triplets: uncompressed_triplets["forw"] = self.uncompress(triplets["forw"]) if "backw" in triplets: uncompressed_triplets["backw"] = self.uncompress(triplets["backw"]) wiki_parser_output.append(uncompressed_triplets) elif parser_info == "find_triplets_for_rel": found_triplets = [] try: found_triplets, c = \ self.document.search_triples("", f"{self.prefixes['rels']['direct']}/{query}", "") except: log.warning("Wrong arguments are passed to wiki_parser") wiki_parser_output.append(list(found_triplets)) elif parser_info == "parse_triplets" and self.file_format == "pickle": for entity in query: self.parse_triplets(entity) wiki_parser_output.append("ok") else: raise ValueError("Unsupported query type") return wiki_parser_output def execute(self, what_return: List[str], rels_from_query: List[str], query_seq: List[List[str]], filter_info: List[Tuple[str]] = None, order_info: namedtuple = None, answer_types: List[str] = None, rel_types: 
List[str] = None): """ Let us consider an example of the question "What is the deepest lake in Russia?" with the corresponding SPARQL query "SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5" arguments: what_return: ["?obj"] query_seq: [["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"] ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"], ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]] filter_info: [] order_info: order_info(variable='?obj', sorting_order='asc') """ extended_combs = [] answers, found_rels, found_combs = [], [], [] for n, (query, rel_type) in enumerate(zip(query_seq, rel_types)): unknown_elem_positions = [(pos, elem) for pos, elem in enumerate(query) if elem.startswith('?')] """ n = 0, query = ["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"] unknown_elem_positions = ["?ent"] n = 1, query = ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"] unknown_elem_positions = [(0, "?ent")] n = 2, query = ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"] unknown_elem_positions = [(0, "?ent"), (2, "?obj")] """ if n == 0: combs, triplets = self.search(query, unknown_elem_positions, rel_type) # combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...] 
else: if combs: known_elements = [] extended_combs = [] if query[0].startswith("?"): for elem in query: if elem in combs[0].keys(): known_elements.append(elem) for comb in combs: """ n = 1 query = ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"] comb = {"?ent": "http://www.wikidata.org/entity/Q5513"} known_elements = ["?ent"], known_values = ["http://www.wikidata.org/entity/Q5513"] filled_query = ["http://www.wikidata.org/entity/Q5513", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"] new_combs = [["http://www.wikidata.org/entity/Q5513", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"], ...] extended_combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...] """ if comb: known_values = [comb[known_elem] for known_elem in known_elements] for known_elem, known_value in zip(known_elements, known_values): filled_query = [elem.replace(known_elem, known_value) for elem in query] new_combs, triplets = self.search(filled_query, unknown_elem_positions, rel_type) for new_comb in new_combs: extended_combs.append(self.merge_combs(comb, new_comb)) else: new_combs, triplets = self.search(query, unknown_elem_positions, rel_type) for comb in combs: for new_comb in new_combs: extended_combs.append(self.merge_combs(comb, new_comb)) combs = extended_combs is_boolean = self.define_is_boolean(query_seq) if combs or is_boolean: if filter_info: for filter_elem, filter_value in filter_info: if filter_value == "qualifier": filter_value = "wpq/" combs = [comb for comb in combs if filter_value in comb[filter_elem]] if order_info and not isinstance(order_info, list) and order_info.variable is not None: reverse = True if order_info.sorting_order == "desc" else False sort_elem = order_info.variable if combs and "?p" in combs[0]: rel_combs = {} for comb in combs: if comb["?p"] not in rel_combs: rel_combs[comb["?p"]] = [] rel_combs[comb["?p"]].append(comb) rel_combs_list = 
rel_combs.values() else: rel_combs_list = [combs] new_rel_combs_list = [] for rel_combs in rel_combs_list: new_rel_combs = [] for rel_comb in rel_combs: value_str = rel_comb[sort_elem].split('^^')[0].strip('"+') fnd_date = re.findall(r"[\d]{3,4}-[\d]{1,2}-[\d]{1,2}", value_str) fnd_num = re.findall(r"([\d]+)\.([\d]+)", value_str) if fnd_date: rel_comb[sort_elem] = fnd_date[0] elif fnd_num or value_str.isdigit(): rel_comb[sort_elem] = float(value_str) new_rel_combs.append(rel_comb) new_rel_combs = [(elem, n) for n, elem in enumerate(new_rel_combs)] new_rel_combs = sorted(new_rel_combs, key=lambda x: (x[0][sort_elem], x[1]), reverse=reverse) new_rel_combs = [elem[0] for elem in new_rel_combs] new_rel_combs_list.append(new_rel_combs) combs = [new_rel_combs[0] for new_rel_combs in new_rel_combs_list] if what_return and what_return[-1].startswith("count"): answers = [[len(combs)]] else: answers = [[elem[key] for key in what_return if key in elem] for elem in combs] if answer_types: if list(answer_types) == ["date"]: answers = [[entity for entity in answer if re.findall(r"[\d]{3,4}-[\d]{1,2}-[\d]{1,2}", entity)] for answer in answers] elif list(answer_types) == ["not_date"]: answers = [[entity for entity in answer if not re.findall(r"[\d]{3,4}-[\d]{1,2}-[\d]{1,2}", entity)] for answer in answers] else: answer_types = set(answer_types) answers = [[entity for entity in answer if answer_types.intersection(self.find_types(entity))] for answer in answers] if is_boolean: answers = [["Yes" if len(triplets) > 0 else "No"]] found_rels = [[elem[key] for key in rels_from_query if key in elem] for elem in combs] ans_rels_combs = [(answer, rel, comb) for answer, rel, comb in zip(answers, found_rels, combs) if any([entity for entity in answer])] answers = [elem[0] for elem in ans_rels_combs] found_rels = [elem[1] for elem in ans_rels_combs] found_combs = [elem[2] for elem in ans_rels_combs] return answers, found_rels, found_combs @staticmethod def define_is_boolean(query_hdt_seq): 
return len(query_hdt_seq) == 1 and all([not query_hdt_seq[0][i].startswith("?") for i in [0, 2]]) @staticmethod def merge_combs(comb1, comb2): new_comb = {} for key in comb1: if (key in comb2 and comb1[key] == comb2[key]) or key not in comb2: new_comb[key] = comb1[key] for key in comb2: if (key in comb1 and comb2[key] == comb1[key]) or key not in comb1: new_comb[key] = comb2[key] return new_comb def search(self, query: List[str], unknown_elem_positions: List[Tuple[int, str]], rel_type): query = list(map(lambda elem: "" if elem.startswith('?') else elem, query)) subj, rel, obj = query if self.file_format == "hdt": combs = [] triplets, cnt = self.document.search_triples(subj, rel, obj) if cnt < self.max_comb_num: triplets = list(triplets) if rel == self.prefixes["description"] or rel == self.prefixes["label"]: triplets = [triplet for triplet in triplets if triplet[2].endswith(self.lang)] combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets] else: if isinstance(self.prefixes["rels"][rel_type], str): combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets if (triplet[1].startswith(self.prefixes["rels"][rel_type]) or triplet[1].startswith(self.prefixes["rels"]["type"]))] else: combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets if (any(triplet[1].startswith(tp) for tp in self.prefixes["rels"][rel_type]) or triplet[1].startswith(self.prefixes["rels"]["type"]))] else: log.debug("max comb num exceeds") else: triplets = [] if subj: subj, triplets = self.find_triplets(subj, "forw") triplets = [[subj, triplet[0], obj] for triplet in triplets for obj in triplet[1:]] if obj: obj, triplets = self.find_triplets(obj, "backw") triplets = [[subj, triplet[0], obj] for triplet in triplets for subj in triplet[1:]] if rel: if rel == self.prefixes["description"]: triplets = [triplet for triplet in triplets if triplet[1] == "descr_en"] else: rel = rel.split('/')[-1] 
triplets = [triplet for triplet in triplets if triplet[1] == rel] combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets] return combs, triplets def find_label(self, entity: str, question: str = "") -> str: entity = str(entity).replace('"', '') if self.file_format == "hdt": if entity.startswith("Q") or entity.startswith("P"): # example: "Q5513" entity = f"{self.prefixes['entity']}/{entity}" # "http://www.wikidata.org/entity/Q5513" if entity.startswith(self.prefixes["entity"]): labels, c = self.document.search_triples(entity, self.prefixes["label"], "") # labels = [["http://www.wikidata.org/entity/Q5513", "http://www.w3.org/2000/01/rdf-schema#label", # '"Lake Baikal"@en'], ...] for label in labels: if label[2].endswith(self.lang): found_label = label[2].strip(self.lang) for old_tok, new_tok in self.replace_tokens: found_label = found_label.replace(old_tok, new_tok) found_label = found_label.strip() return found_label elif entity.endswith(self.lang): # entity: '"Lake Baikal"@en' entity = entity[:-3].replace('$', ' ').replace(' ', ' ') return entity elif "^^" in entity: """ examples: '"1799-06-06T00:00:00Z"^^' (date) '"+1642"^^' (number) """ entity = entity.split("^^")[0] for token in ["T00:00:00Z", "+"]: entity = entity.replace(token, '') entity = self.format_date(entity, question).replace('$', '') return entity elif re.findall(r"[\d]{3,4}-[\d]{2}-[\d]{2}", entity): entity = self.format_date(entity, question).replace('$', '') return entity elif entity in ["Yes", "No"]: return entity elif entity.isdigit(): entity = entity.replace('.', ',') return entity if self.file_format == "pickle": if entity: if entity.startswith("Q") or entity.startswith("P"): triplets = self.document.get(entity, {}).get("forw", []) triplets = self.uncompress(triplets) for triplet in triplets: if triplet[0] == "name_en": return triplet[1] else: entity = self.format_date(entity, question) return entity return "Not Found" def format_date(self, entity, 
question): dates_dict = {"January": "января", "February": "февраля", "March": "марта", "April": "апреля", "May": "мая", "June": "июня", "July": "июля", "August": "августа", "September": "сентября", "October": "октября", "November": "ноября", "December": "декабря"} date_info = re.findall("([\d]{3,4})-([\d]{1,2})-([\d]{1,2})", entity) if date_info: year, month, day = date_info[0] if "how old" in question.lower() or "сколько лет" in question.lower(): entity = datetime.datetime.now().year - int(year) elif "в каком году" in question.lower(): entity = year elif "в каком месяце" in question.lower(): entity = month elif day not in {"00", "0"}: date = datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d") entity = date.strftime("%d %B %Y") else: entity = year if self.lang == "@ru": for mnth, mnth_replace in dates_dict.items(): entity = entity.replace(mnth, mnth_replace) return str(entity) entity = entity.lstrip('+-') return entity def find_alias(self, entity: str) -> List[str]: aliases = [] if entity.startswith(self.prefixes["entity"]): labels, cardinality = self.document.search_triples(entity, self.prefixes["alias"], "") aliases = [label[2].strip(self.lang).strip('"') for label in labels if label[2].endswith(self.lang)] return aliases def find_rels(self, entity: str, direction: str, rel_type: str = "no_type") -> List[str]: rels = [] if self.file_format == "hdt": if not rel_type: rel_type = "direct" if direction == "forw": query = [f"{self.prefixes['entity']}/{entity}", "", ""] else: query = ["", "", f"{self.prefixes['entity']}/{entity}"] triplets, c = self.document.search_triples(*query) triplets = list(triplets) if isinstance(self.prefixes['rels'][rel_type], str): start_str = f"{self.prefixes['rels'][rel_type]}/P" rels = {triplet[1] for triplet in triplets if triplet[1].startswith(start_str)} else: rels = {triplet[1] for triplet in triplets if any([triplet[1].startswith(tp) for tp in self.prefixes['rels'][rel_type]])} rels = list(rels) if self.used_rels: rels = 
[rel for rel in rels if rel.split("/")[-1] in self.used_rels] return rels def find_rels_2hop(self, entity_ids, rels_1hop): rels = [] for entity_id in entity_ids: for rel_1hop in rels_1hop: triplets, cnt = self.document.search_triples(f"{self.prefixes['entity']}/{entity_id}", rel_1hop, "") triplets = [triplet for triplet in triplets if triplet[2].startswith(self.prefixes['entity'])] objects_1hop = [triplet[2].split("/")[-1] for triplet in triplets] triplets, cnt = self.document.search_triples("", rel_1hop, f"{self.prefixes['entity']}/{entity_id}") triplets = [triplet for triplet in triplets if triplet[0].startswith(self.prefixes['entity'])] objects_1hop += [triplet[0].split("/")[-1] for triplet in triplets] for object_1hop in objects_1hop[:5]: tr_2hop, cnt = self.document.search_triples(f"{self.prefixes['entity']}/{object_1hop}", "", "") rels_2hop = [elem[1] for elem in tr_2hop if elem[1] != rel_1hop] if self.used_rels: rels_2hop = [elem for elem in rels_2hop if elem.split("/")[-1] in self.used_rels] rels += rels_2hop tr_2hop, cnt = self.document.search_triples("", "", f"{self.prefixes['entity']}/{object_1hop}") rels_2hop = [elem[1] for elem in tr_2hop if elem[1] != rel_1hop] if self.used_rels: rels_2hop = [elem for elem in rels_2hop if elem.split("/")[-1] in self.used_rels] rels += rels_2hop rels = list(set(rels)) return rels def find_object(self, entity: str, rel: str, direction: str) -> List[str]: objects = [] if not direction: direction = "forw" if self.file_format == "hdt": entity = f"{self.prefixes['entity']}/{entity.split('/')[-1]}" rel = f"{self.prefixes['rels']['direct']}/{rel}" if direction == "forw": triplets, cnt = self.document.search_triples(entity, rel, "") if cnt < self.max_comb_num: objects.extend([triplet[2].split('/')[-1] for triplet in triplets]) else: triplets, cnt = self.document.search_triples("", rel, entity) objects.extend([triplet[0].split('/')[-1] for triplet in triplets]) else: entity = entity.split('/')[-1] rel = rel.split('/')[-1] 
triplets = self.document.get(entity, {}).get(direction, []) triplets = self.uncompress(triplets) for found_rel, *objects in triplets: if rel == found_rel: objects.extend(objects) return objects def check_triplet(self, subj: str, rel: str, obj: str) -> bool: if self.file_format == "hdt": subj = f"{self.prefixes['entity']}/{subj}" rel = f"{self.prefixes['rels']['direct']}/{rel}" obj = f"{self.prefixes['entity']}/{obj}" triplets, cnt = self.document.search_triples(subj, rel, obj) if cnt > 0: return True else: return False else: subj = subj.split('/')[-1] rel = rel.split('/')[-1] obj = obj.split('/')[-1] triplets = self.document.get(subj, {}).get("forw", []) triplets = self.uncompress(triplets) for found_rel, *objects in triplets: if found_rel == rel: for found_obj in objects: if found_obj == obj: return True return False def find_types(self, entity: str): types = [] if self.file_format == "hdt": if not entity.startswith("http"): entity = f"{self.prefixes['entity']}/{entity}" tr, c = self.document.search_triples(entity, f"{self.prefixes['rels']['direct']}/P31", "") types = [triplet[2].split('/')[-1] for triplet in tr] for rel in ["P106", "P21"]: tr, c = self.document.search_triples(entity, f"{self.prefixes['rels']['direct']}/{rel}", "") types += [triplet[2].split('/')[-1] for triplet in tr] if self.file_format == "pickle": entity = entity.split('/')[-1] triplets = self.document.get(entity, {}).get("forw", []) triplets = self.uncompress(triplets) for triplet in triplets: if triplet[0] == "P31": types = triplet[1:] types = set(types) return types def find_subclasses(self, entity: str): types = [] if self.file_format == "hdt": if not entity.startswith("http"): entity = f"{self.prefixes['entity']}/{entity}" tr, c = self.document.search_triples(entity, f"{self.prefixes['rels']['direct']}/P279", "") types = [triplet[2].split('/')[-1] for triplet in tr] if self.file_format == "pickle": entity = entity.split('/')[-1] triplets = self.document.get(entity, {}).get("forw", []) 
triplets = self.uncompress(triplets) for triplet in triplets: if triplet[0] == "P279": types = triplet[1:] types = set(types) return types def uncompress(self, triplets: Union[str, List[List[str]]]) -> List[List[str]]: if isinstance(triplets, str): triplets = triplets.split('\t') triplets = [triplet.strip().split(" ") for triplet in triplets] return triplets def parse_triplets(self, entity): triplets = self.document.get(entity, {}) for direction in ["forw", "backw"]: if direction in triplets: dir_triplets = triplets[direction] dir_triplets = self.uncompress(dir_triplets) if entity in self.parsed_document: self.parsed_document[entity][direction] = dir_triplets else: self.parsed_document[entity] = {direction: dir_triplets} def find_triplets(self, subj: str, direction: str) -> Tuple[str, List[List[str]]]: subj = subj.split('/')[-1] if subj in self.parsed_document: triplets = self.parsed_document.get(subj, {}).get(direction, []) else: triplets = self.document.get(subj, {}).get(direction, []) triplets = self.uncompress(triplets) return subj, triplets def fill_triplets(self, init_triplets, what_to_return, comb): filled_triplets = [] for n, (subj, rel, obj) in enumerate(init_triplets): if "statement" in self.prefixes and subj.startswith("?") \ and comb.get(subj, "").startswith(self.prefixes["statement"]) and not rel.startswith("?") \ and (obj == what_to_return[0] or re.findall(r"[\d]{3,4}", comb.get(what_to_return[0], ""))): continue else: if "statement" in self.prefixes and subj.startswith("?") \ and str(comb.get(subj, "")).startswith(self.prefixes["statement"]): if not comb.get(what_to_return[0], "").startswith("http") \ and re.findall(r"[\d]{3,4}", comb.get(what_to_return[0], "")): subj = init_triplets[1][2] else: subj = what_to_return[0] if "statement" in self.prefixes and obj.startswith("?") \ and str(comb.get(obj, "")).startswith(self.prefixes["statement"]): if not str(comb.get(what_to_return[0], "")).startswith("http") \ and re.findall(r"[\d]{3,4}", 
str(comb.get(what_to_return[0], ""))): obj = init_triplets[1][2] else: obj = what_to_return[0] subj, obj = str(subj), str(obj) if subj.startswith("?"): subj = comb.get(subj, "") if obj.startswith("?"): obj = comb.get(obj, "") if rel.startswith("?"): rel = comb.get(rel, "") subj_label = self.find_label(subj) obj_label = self.find_label(obj) if rel in self.rel_q2name: rel_label = self.rel_q2name[rel] elif rel.split("/")[-1] in self.rel_q2name: rel_label = self.rel_q2name[rel.split("/")[-1]] else: rel_label = self.find_label(rel) if isinstance(rel_label, list) and rel_label: rel_label = rel_label[0] filled_triplets.append([subj_label, rel_label, obj_label]) return filled_triplets ================================================ FILE: deeppavlov/models/morpho_syntax_parser/__init__.py ================================================ ================================================ FILE: deeppavlov/models/morpho_syntax_parser/dependency_decoding.py ================================================ # Copyright 2019 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
@register('chu_liu_edmonds_transformer')
class ChuLiuEdmonds(Component):
    """Component that decodes head probabilities with the Chu-Liu-Edmonds
    maximum spanning tree algorithm.

    Args:
        min_edge_prob: the floor applied to every probability before taking
            the logarithm
    """

    def __init__(self, min_edge_prob=1e-6, **kwargs):
        self.min_edge_prob = min_edge_prob

    def __call__(self, probs: List[np.ndarray]) -> List[List[int]]:
        """Runs Chu-Liu-Edmonds decoding on every matrix of head probabilities.

        Args:
            probs: a 3D-array of probabilities of shape B*L*(L+1)

        Returns:
            for every matrix, the heads returned by the algorithm without
            the entry of the prepended first row

        Raises:
            ValueError: if a matrix of shape (m, n) does not satisfy n == m + 1
        """
        decoded = []
        for matrix in probs:
            n_rows, n_cols = matrix.shape
            if n_cols != n_rows + 1:
                raise ValueError("First and second axis lengths m, n of probs should satisfy the condition n == m + 1")
            # Clip from below and shift so that the floor value becomes 0.
            scores = np.log10(np.maximum(self.min_edge_prob, matrix)) - np.log10(self.min_edge_prob)
            # Put an all-zero row on top, which makes the matrix square.
            zero_row = np.zeros_like(scores[:1, :])
            scores = np.concatenate([zero_row, scores], axis=0)
            # it makes impossible to create multiple edges 0->i
            scores[1:, 0] += np.log10(self.min_edge_prob) * len(scores)
            heads, _ = chu_liu_edmonds(scores.astype("float64"))
            decoded.append(heads[1:])
        return decoded
@register("joint_tagger_parser")
class JointTaggerParser(Component):
    """
    A class to perform joint morphological and syntactic parsing.
    It is just a wrapper that calls the models for tagging and parsing
    and combines their results in a single output.

    Args:
        tagger: the morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        parser: the syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        output_format: the output format, it may be either `ud` (alias: `conllu`) or `json` (alias: `dict`).
            Any other value triggers a warning and falls back to `ud`.

    Attributes:
        tagger: a morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        parser: a syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        output_format: the validated output format
    """

    def __init__(self, tagger: Chainer, parser: Chainer, output_format: str = "ud", *args, **kwargs):
        if output_format not in ["ud", "conllu", "json", "dict"]:
            # Imported here because the module does not import ``warnings`` at the top.
            import warnings
            # The exception object used to be built and thrown away; actually emit it.
            warnings.warn("JointTaggerParser output_format can be only `ud`, `conllu` or `json`. "
                          "Unknown format: {}, setting the output_format to `ud`.".format(output_format),
                          UserWarning)
            output_format = "ud"
        self.output_format = output_format
        self.tagger = tagger
        self.parser = parser

    def __call__(self, data: Union[List[str], List[List[str]]]) \
            -> Union[List[List[dict]], List[str], List[List[str]]]:
        """Tags and parses ``data`` and merges both analyses word by word.

        Every word analysis of both models is split on tabs; columns 6 and 7
        of the tagger row are replaced by the same columns of the parser row.

        Args:
            data: the batch passed unchanged to both the tagger and the parser

        Returns:
            one string per sentence with word analyses joined by newlines; a word
            analysis is the tab-joined row for `ud`/`conllu` and the string form of
            a dictionary keyed by ``UD_COLUMN_FEAT_MAPPING`` for `json`/`dict`
        """
        tagger_output = self.tagger(data)
        parser_output = self.parser(data)
        as_dict = self.output_format in ["json", "dict"]
        answer = []
        for tagger_sent, parser_sent in zip(tagger_output, parser_output):
            curr_sent_answer = []
            for j, tagger_row in enumerate(tagger_sent):
                columns = tagger_row.split("\t")
                parser_columns = parser_sent[j].split("\t")
                # setting parser output
                columns[6:8] = parser_columns[6:8]
                if as_dict:
                    word_answer = str({key: columns[index] for key, index in UD_COLUMN_FEAT_MAPPING.items()})
                else:
                    # Only a list of columns is tab-joined; joining the dict string
                    # would put a tab between every single character.
                    word_answer = "\t".join(columns)
                curr_sent_answer.append(word_answer)
            answer.append("\n".join(curr_sent_answer))
        return answer
@register('spacy_lemmatizer')
class SpacyLemmatizer(Component):
    """Lemmatizes words one by one with a spaCy pipeline.

    Args:
        model: the name passed to ``spacy.load``
    """

    def __init__(self, model: str, **kwargs):
        self.nlp = spacy.load(model)

    def __call__(self, words_batch: List[List[str]]):
        """Returns, for every word, the lemma of the first token spaCy produces for it."""
        lemmas_batch = []
        for words_list in words_batch:
            # Each word is processed on its own, without sentence context.
            lemmas = [self.nlp(word)[0].lemma_ for word in words_list]
            lemmas_batch.append(lemmas)
        return lemmas_batch
def make_pos_and_tag(tag: str, sep: str = ",",
                     return_mode: Optional[str] = None) -> Tuple[str, Union[str, list, dict, tuple]]:
    """Splits a full tag into the part-of-speech label and grammatical features.

    Args:
        tag: the full tag; a trailing " _" is dropped before splitting
        sep: the separator between part-of-speech tag and grammatical features
        return_mode: the type of the features value, can be None, list, dict or sorted_items

    Returns:
        the part-of-speech label and the features: the raw string (``"_"`` when
        there are none) for None, a list of ``key=value`` strings for ``list``,
        a dictionary for ``dict`` and a tuple of sorted items for ``sorted_items``
    """
    if tag.endswith(" _"):
        tag = tag[:-2]
    pos, found_sep, feats = tag.partition(sep)
    if not found_sep:
        # No separator means no features at all.
        feats = "_" if return_mode is None else ""
    if return_mode in ("dict", "list", "sorted_items"):
        feats = feats.split("|") if feats != "" else []
    if return_mode in ("dict", "sorted_items"):
        feats = dict(tuple(item.split("=")) for item in feats)
    if return_mode == "sorted_items":
        feats = tuple(sorted(feats.items()))
    return pos, feats
@register('lemmatized_output_prettifier')
class LemmatizedOutputPrettifier(OutputPrettifier):
    """Class which prettifies morphological tagger output to the 10-column
    (Universal Dependencies) format: id, word, lemma, pos, xpos, feats, head,
    deprel, deps, misc (see http://universaldependencies.org/format.html for details).
    Only the id, word, lemma, pos and feats columns are filled,
    the other columns contain the `_` value.

    Args:
        return_string: whether to join the word analyses into a single string
        begin: a string to append in the beginning
        end: a string to append in the end
        sep: separator between word analyses
    """

    def __init__(self, return_string: bool = True, begin: str = "", end: str = "\n",
                 sep: str = "\n", **kwargs) -> None:
        super().__init__(return_string, begin, end, sep, **kwargs)
        self.format_string = "{}\t{}\t{}\t{}\t_\t{}\t_\t_\t_\t_"

    def prettify(self, tokens: List[str], tags: List[str], lemmas: List[str]) -> Union[List[str], str]:
        """Prettifies output of morphological tagger.

        Args:
            tokens: tokenized source sentence
            tags: list of tags, the output of a tagger
            lemmas: list of lemmas, the output of a lemmatizer

        Returns:
            the list of tab-separated word analyses, or a single string built
            from them with ``begin``, ``sep`` and ``end`` if ``return_string`` is set.

        Examples:
            >>> sent = "John really likes pizza .".split()
            >>> tags = ["PROPN,Number=Sing", "ADV",
            ...         "VERB,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
            ...         "NOUN,Number=Sing", "PUNCT"]
            >>> lemmas = "John really like pizza .".split()
            >>> prettifier = LemmatizedOutputPrettifier()
            >>> print(prettifier.prettify(sent, tags, lemmas))  # columns are tab-separated
            1 John John PROPN _ Number=Sing _ _ _ _
            2 really really ADV _ _ _ _ _ _
            3 likes like VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin _ _ _ _
            4 pizza pizza NOUN _ Number=Sing _ _ _ _
            5 . . PUNCT _ _ _ _ _ _
        """
        rows = []
        # Word ids are 1-based.
        for position, (word, tag, lemma) in enumerate(zip(tokens, tags, lemmas), start=1):
            pos, feats = make_pos_and_tag(tag, sep=",")
            rows.append(self.format_string.format(position, word, lemma, pos, feats))
        if not self.return_string:
            return rows
        return self.begin + self.sep.join(rows) + self.end
@register('dirty_comments_preprocessor')
class DirtyCommentsPreprocessor(Component):
    """
    Class implements preprocessing of english texts with low level of literacy such as comments

    Args:
        remove_punctuation: whether to replace punctuation characters with spaces
    """

    # Contractions expanded after whitespace normalization; the order matters
    # ("won't" and "can't" must be handled before the generic "n't").
    _CONTRACTIONS = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("i'm", "i am"),
        (" im ", " i am "),
        ("'re", " are"),
        ("ain't", "is not"),
        ("'ll", " will"),
        ("n't", " not"),
        ("'ve", " have"),
        ("'s", " is"),
        ("'d", " would"),
    )

    # Slang expansion and removal of escape sequences that are spelled out
    # literally in the text (a backslash followed by letters), applied in this order.
    _SLANG = (
        (" u ", " you "),
        (" em ", " them "),
        (" da ", " the "),
        (" yo ", " you "),
        (" ur ", " your "),
        (" u r ", " you are "),
        (" urs ", " yours "),
        ("y'all", "you all"),
        (" r u ", " are you "),
        (" r you", " are you"),
        (" are u ", " are you "),
        ("\\n", " "),
        ("\\t", " "),
        ("\\xa0", " "),
        ("\\xc2", " "),
    )

    def __init__(self, remove_punctuation: bool = True, *args, **kwargs):
        self.remove_punctuation = remove_punctuation

    def __call__(self, batch: List[str], **kwargs) -> List[str]:
        """
        Preprocess given batch

        Args:
            batch: list of text samples
            **kwargs: additional arguments

        Returns:
            list of preprocessed text samples
        """
        return [self._clean(text) for text in batch]

    def _clean(self, x: str) -> str:
        """Applies the whole chain of substitutions to a single text sample."""
        x = x.lower()
        # Raw strings: "\S" and "\s" are invalid escape sequences in plain literals.
        x = re.sub(r"<\S*>", " ", x)
        x = re.sub(r"\s+", " ", x)
        for old, new in self._CONTRACTIONS:
            x = x.replace(old, new)
        # Crude suffix stripping at word ends.
        x = re.sub(r"ies( |$)", "y ", x)
        x = re.sub(r"s( |$)", " ", x)
        x = re.sub(r"ing( |$)", " ", x)
        for old, new in self._SLANG:
            x = x.replace(old, new)
        # Every number becomes a single placeholder token.
        x = re.sub(r"[0-9]+", " 0 ", x)
        # A character repeated 4 or more times is reduced to two occurrences.
        x = re.sub(r'([' + string.printable + r'])\1{3,}', r'\1\1', x).strip()
        if self.remove_punctuation:
            x = re.sub(r'([' + string.punctuation + '])', ' ', x)
        return re.sub(' +', ' ', x)
@register('dnnc_pair_generator')
class PairGenerator(Component):
    """
    Pairs every text of the input batch with every sample of the support dataset

    Args:
        bidirectional: adds pairs in reverse order
    """

    def __init__(self, bidirectional: bool = False, **kwargs) -> None:
        self.bidirectional = bidirectional

    def __call__(self,
                 texts: List[str],
                 dataset: List[List[str]],
                 ) -> Tuple[List[str], List[str], List[str], List[str]]:
        """Builds the pair batches.

        Args:
            texts: the input texts, used as premises
            dataset: support samples, each unpacked as (text, label)

        Returns:
            the unchanged ``texts``, the hypotheses, the premises and
            the labels of the support samples used in each pair
        """
        # Each support sample is repeated len(texts) times in a row while the
        # texts are cycled, so every (text, sample) combination occurs once.
        cycled_texts = texts * len(dataset)
        repeated_samples = np.repeat(dataset, len(texts), axis=0)

        hypotesis_batch, premise_batch, hypotesis_labels_batch = [], [], []
        for premise, sample in zip(cycled_texts, repeated_samples):
            hypotesis, hypotesis_labels = sample
            pairs = [(premise, hypotesis)]
            if self.bidirectional:
                pairs.append((hypotesis, premise))
            for first, second in pairs:
                premise_batch.append(first)
                hypotesis_batch.append(second)
                hypotesis_labels_batch.append(hypotesis_labels)

        return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch
@register('mask')
class Mask(Component):
    """Takes a batch of tokens and returns the masks of corresponding length"""

    def __init__(self, *args, **kwargs):
        pass

    @staticmethod
    def __call__(tokens_batch, **kwargs):
        """Builds a float32 mask with ones on token positions and zeros on padding.

        Args:
            tokens_batch: a batch of token sequences
            **kwargs: additional arguments, ignored

        Returns:
            an array of shape (batch size, length of the longest sequence);
            an empty batch gives an array of shape (0, 0)
        """
        batch_size = len(tokens_batch)
        # ``default`` keeps an empty batch from raising ValueError in max().
        max_len = max((len(utt) for utt in tokens_batch), default=0)
        mask = np.zeros([batch_size, max_len], dtype=np.float32)
        for n, utterance in enumerate(tokens_batch):
            mask[n, :len(utterance)] = 1
        return mask
Default: False print_first_example(bool): if True, we print the first input example after initialization. Default: False """ def __init__(self, vocab_file, do_lower_case: bool = True, preprocessor: str = 'TorchTransformersPreprocessor', preprocessors: List[str] = None, max_seq_length: int = 512, strict=False, print_first_example=False, *args, **kwargs): self.strict = strict self.printed = False self.print_first_example = print_first_example self.prefix = '' if preprocessors is None: log.info( f'Assuming the same preprocessor name for all : {preprocessor}') self.preprocessor = eval(preprocessor)(vocab_file, do_lower_case, max_seq_length, *args, **kwargs) self.preprocessors = None else: for i in range(len(preprocessors)): preprocessors[i] = eval(preprocessors[i]) self.n_task = len(preprocessors) self.preprocessors = [preprocessors[i](vocab_file=vocab_file, do_lower_case=do_lower_case, max_seq_length=max_seq_length, *args, **kwargs) for i in range(len(preprocessors))] def split(self, features): if all([isinstance(k, str) for k in features]) or all([k is None for k in features]): # single sentence classification log.debug('Assuming single sentence classification') texts_a, texts_b = features, None elif all([isinstance(k, tuple) and len(k) == 2 for k in features]): log.debug( 'Assuming sentence pair classification or classification for multichoice') texts_a, texts_b = [], [] for feature in features: text_a, text_b = feature texts_a.append(text_a) texts_b.append(text_b) elif all([isinstance(k, list) for k in features]): log.debug('Assuming ner classification') texts_a, texts_b = list(features), None else: if self.strict: raise Exception(f'Unsupported task data {features}') else: log.warning('Data not split.Going without splitting') texts_a, texts_b = features, None return texts_a, texts_b def __call__(self, *args): """ Returns batches of values from ``inp``. Every batch contains values that have same key from ``keys_to_extract`` attribute. 
@register("ner_vocab")
class NerVocab(Estimator):
    """ Implementation of the NER vocabulary

    Params:
        word_file_path: the path to the pre-trained word embedding model
        save_path: the folder path to save dictionary files
        load_path: the folder path from which the dictionary files are loaded
        char_level: the flag arg indicating the character vocabulary
    """

    def __init__(self,
                 word_file_path=None,
                 save_path=None,
                 load_path=None,
                 char_level=False,
                 **kwargs):
        super().__init__(save_path=save_path, load_path=load_path, **kwargs)
        self.word_file_path = word_file_path
        self.char_level = char_level
        if word_file_path is not None:
            # A ready-made vocabulary file takes precedence and is copied to save_path.
            self.load_from_file(word_file_path)
            if self.save_path is not None:
                self.save_to_file(self.save_path)
        elif self.load_path is not None:
            self.load_from_file(self.load_path)

    def load_from_file(self, filename):
        """Reads one item per line; the line number becomes the item index.

        Does nothing if ``filename`` is None or does not exist.
        """
        if filename is None or not os.path.exists(filename):
            return
        self._t2i, self._i2t = {}, {}
        # The context manager closes the handle, which was previously left open.
        with open(file=filename, mode="r", encoding="utf-8") as fi:
            for i, line in enumerate(fi):
                word = line.strip()
                self._t2i[word] = i
                self._i2t[i] = word

    def save_to_file(self, filename):
        """Writes the vocabulary items one per line; does nothing if ``filename`` is None."""
        if filename is None:
            return
        dir_name = os.path.dirname(filename)
        # A bare file name has an empty directory part, which makedirs rejects.
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        with open(file=filename, mode="w", encoding="utf-8") as fo:
            for word in self._t2i.keys():
                fo.write("{}\n".format(word))

    def fit(self, sents: List[List[str]], *args):
        """Builds the vocabulary from sentences unless a vocabulary file was given.

        Args:
            sents: tokenized sentences; characters are collected instead of
                words when ``char_level`` is set
        """
        if self.word_file_path is not None:
            return
        if self.char_level:
            items = {char for sent in sents for word in sent for char in word}
        else:
            items = {word for sent in sents for word in sent}
        # Two reserved entries come first.
        # NOTE(review): both are the empty string as written here, so they share
        # one key in ``_t2i`` - confirm the intended reserved tokens.
        items = ["", ""] + list(items)
        self._t2i = {k: v for v, k in enumerate(items)}
        self._i2t = {k: v for k, v in enumerate(items)}
        self.save_to_file(self.save_path)

    def pad_batch(self, tokens: List[List[int]]):
        """Create padded batch of words, tags, chunk pos, even batch of characters

        Params:
            tokens: list of index sequences (sequences of index sequences
                when ``char_level`` is set)

        Returns:
            the padded batch
        """
        batch_size = len(tokens)
        pad_index = self._t2i[""]
        max_len_seq = max([len(seq) for seq in tokens])
        if not self.char_level:
            padded_batch = np.full((batch_size, max_len_seq), pad_index)
            for i, seq in enumerate(tokens):
                padded_batch[i, :len(seq)] = seq
        else:
            if max_len_seq == 0:
                max_len_sub_seq = 0
            else:
                max_len_sub_seq = max([len(sub_seq) for seq in tokens for sub_seq in seq])
            padded_batch = np.full((batch_size, max_len_seq, max_len_sub_seq), pad_index)
            for i, seq in enumerate(tokens):
                for j, sub_seq in enumerate(seq):
                    padded_batch[i, j, :len(sub_seq)] = sub_seq
        return padded_batch

    def __call__(self, sents, **kwargs):
        """Maps items to indices (0 for unknown items) and pads the batch."""
        if not self.char_level:
            sents_ind = [[self._t2i.get(word, 0) for word in sent] for sent in sents]
        else:
            sents_ind = [[[self._t2i.get(char, 0) for char in word] for word in sent] for sent in sents]
        return self.pad_batch(sents_ind)

    def load(self, *args, **kwargs):
        log.debug("[loading vocabulary from {}]".format(self.load_path))
        if self.load_path is not None:
            self.load_from_file(self.load_path)

    def save(self, *args, **kwargs):
        log.info("[saving vocabulary to {}]".format(self.save_path))
        # save_to_file creates the target directory itself.
        self.save_to_file(self.save_path)

    @property
    def len(self):
        return len(self._t2i)

    @property
    def t2i(self):
        return self._t2i

    @property
    def i2t(self):
        return self._i2t
@register('document_chunker')
class DocumentChunker(Component):
    """Make chunks from a document or a list of documents. Don't tear up sentences if needed.

    Args:
        sentencize_fn: a function for sentence segmentation
        keep_sentences: whether to tear up sentences between chunks or not
        tokens_limit: a number of tokens in a single chunk (usually this number corresponds to the squad model limit)
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split document by paragraphs; if set to True, tokens_limit is ignored
        number_of_paragraphs: how many leading paragraphs to keep, -1 keeps all of them

    Attributes:
        keep_sentences: whether to tear up sentences between chunks or not
        tokens_limit: a number of tokens in a single chunk
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split document by paragraphs; if set to True, tokens_limit is ignored
        number_of_paragraphs: how many leading paragraphs to keep, -1 keeps all of them
    """

    def __init__(self, sentencize_fn: Callable = sent_tokenize, keep_sentences: bool = True,
                 tokens_limit: int = 400, flatten_result: bool = False,
                 paragraphs: bool = False, number_of_paragraphs: int = -1, *args, **kwargs) -> None:
        self._sentencize_fn = sentencize_fn
        self.keep_sentences = keep_sentences
        self.tokens_limit = tokens_limit
        self.flatten_result = flatten_result
        self.paragraphs = paragraphs
        self.number_of_paragraphs = number_of_paragraphs

    def __call__(self, batch_docs: List[Union[str, List[str]]],
                 batch_docs_ids: Optional[List[Union[str, List[str]]]] = None) -> \
            Union[Tuple[Union[List[str], List[List[str]]], Union[List[str], List[List[str]]]],
                  Union[List[str], List[List[str]]]]:
        """Make chunks from a batch of documents. There can be several documents in each batch.

        Args:
            batch_docs: a batch of documents / a batch of lists of documents
            batch_docs_ids (optional) : a batch of documents ids / a batch of lists of documents ids

        Returns:
            chunks of docs, flattened or not and
            chunks of docs ids, flattened or not if batch_docs_ids were passed
        """
        empty_docs_ids_flag = not batch_docs_ids
        if empty_docs_ids_flag:
            # Placeholder ids keep the zip below aligned with the documents.
            batch_docs_ids = [[[] for j in i] for i in batch_docs]

        result = []
        result_ids = []
        for ids, docs in zip(batch_docs_ids, batch_docs):
            batch_chunks = []
            batch_chunks_ids = []
            if isinstance(docs, str):
                docs = [docs]
                ids = [ids]

            for doc_id, doc in zip(ids, docs):
                if self.paragraphs:
                    doc_chunks = [part.strip() for part in doc.split('\n\n')]
                    # Paragraphs of 40 characters or fewer are dropped.
                    doc_chunks = [part for part in doc_chunks if len(part) > 40]
                    if self.number_of_paragraphs != -1:
                        doc_chunks = doc_chunks[:self.number_of_paragraphs]
                elif self.keep_sentences:
                    doc_chunks = []
                    n_tokens = 0
                    keep = []
                    # Use the configured splitter; it was stored but never called before.
                    for s in self._sentencize_fn(doc):
                        n_tokens += len(s.split())
                        if n_tokens > self.tokens_limit and keep:
                            # The limit is exceeded: close the current chunk.
                            doc_chunks.append(' '.join(keep))
                            n_tokens = 0
                            keep.clear()
                        keep.append(s)
                    if keep:
                        doc_chunks.append(' '.join(keep))
                else:
                    split_doc = doc.split()
                    doc_chunks = [split_doc[i:i + self.tokens_limit]
                                  for i in range(0, len(split_doc), self.tokens_limit)]
                batch_chunks.append(doc_chunks)
                batch_chunks_ids.append([doc_id] * len(doc_chunks))

            result.append(batch_chunks)
            result_ids.append(batch_chunks_ids)

        if self.flatten_result:
            # Every entry of a per-document list is itself a list by construction,
            # so flattening is always applicable and an empty batch is handled too.
            for i in range(len(result)):
                result[i] = list(chain.from_iterable(result[i]))
                result_ids[i] = list(chain.from_iterable(result_ids[i]))

        if empty_docs_ids_flag:
            return result

        return result, result_ids
If ``single_vector``, return the only vector per sample which can have several elements equal to ``1``. Parameters: depth: the depth for one-hotting pad_zeros: whether to pad elements of batch with zeros single_vector: whether to return one vector for the sample (sum of each one-hotted vectors) """ def __init__(self, depth: int, pad_zeros: bool = False, single_vector=False, *args, **kwargs): self._depth = depth self._pad_zeros = pad_zeros self.single_vector = single_vector if self._pad_zeros and self.single_vector: raise ConfigError("Cannot perform ``single_vector`` with zero padding for OneHotter") def __call__(self, batch: List[List[int]], **kwargs) -> Union[List[List[np.ndarray]], List[np.ndarray]]: """ Convert given batch of list of labels to one-hot representation of the batch. Args: batch: list of samples, where each sample is a list of integer labels. **kwargs: additional arguments Returns: if ``single_vector``, list of one-hot representations of each sample, otherwise, list of lists of one-hot representations of each label in a sample """ one_hotted_batch = [] for utt in batch: if isinstance(utt, Iterable): one_hotted_utt = self._to_one_hot(utt, self._depth) elif isinstance(utt, int): if self._pad_zeros or self.single_vector: one_hotted_utt = self._to_one_hot([utt], self._depth) else: one_hotted_utt = self._to_one_hot([utt], self._depth).reshape(-1) if self.single_vector: one_hotted_utt = np.sum(one_hotted_utt, axis=0) one_hotted_batch.append(one_hotted_utt) if self._pad_zeros: one_hotted_batch = zero_pad(one_hotted_batch) return one_hotted_batch @staticmethod def _to_one_hot(x, n): b = np.zeros([len(x), n], dtype=np.float32) for q, tok in enumerate(x): b[q, int(tok)] = 1 return b ================================================ FILE: deeppavlov/models/preprocessors/re_preprocessor.py ================================================ # Copyright 2021 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the 
"License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from logging import getLogger from pathlib import Path from typing import Tuple, List, Union import numpy as np from transformers import BertTokenizer from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.file import read_json from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component log = getLogger(__name__) @register('re_preprocessor') class REPreprocessor(Component): def __init__( self, vocab_file: str, special_token: str = '', ner_tags=None, max_seq_length: int = 512, do_lower_case: bool = False, default_tag: str = None, **kwargs ): """ Args: vocab_file: path to vocabulary / name of vocabulary for tokenizer initialization special_token: an additional token that will be used for marking the entities in the document do_lower_case: set True if lowercasing is needed default_tag: used for test purposes to create a valid input Return: list of feature batches with input_ids, attention_mask, entity_pos, ner_tags """ self.special_token = special_token self.special_tokens_dict = {'additional_special_tokens': [self.special_token]} self.default_tag = default_tag if ner_tags is None: ner_tags = ['ORG', 'TIME', 'MISC', 'LOC', 'PER', 'NUM'] self.ner2id = {tag: tag_id for tag_id, tag in enumerate(ner_tags)} self.max_seq_length = max_seq_length if Path(vocab_file).is_file(): vocab_file = str(expand_path(vocab_file)) self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) else: self.tokenizer = 
BertTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case) def __call__( self, tokens: Union[Tuple, List[List[str]]], entity_pos: Union[Tuple, List[List[Tuple]]], entity_tags: Union[Tuple, List[List[str]]], ) -> Tuple[List, List, List, List, List]: """ Tokenize and create masks; recalculate the entity positions regarding the document boarders. Args: tokens: List of tokens of each document: List[List[tokens in doc]] entity_pos: start and end positions of the entities' mentions entity_tags: NER tag of the entities Return: input_ids: List[List[int]], attention_mask: List[List[int]], entity_poss: List[ List[ List[(entity1_mention1_start_id, entity1_mention1_end_id), ...], List[(entity2_mention1_start_id, entity2_mention1_end_id), ...] ] ] entity_tags: List[List[int]] nf_samples: List[int] - contains the information about whether the corresponding sample is real sample or fake (for testing): 0 means the sample is real, 1 - it is fake. """ _ = self.tokenizer.add_special_tokens(self.special_tokens_dict) input_ids, attention_mask, upd_entity_pos, upd_entity_tags, nf_samples = [], [], [], [], [] # this workaround is for proper testing: for an unknown reason during test in test_quick_start.py # each input list is transformed into a tuple, e.g., tokens -> tuple(tokens, ). 
# todo: refactoring if type(tokens) == tuple and type(entity_pos) == tuple and type(entity_tags) == tuple: tokens = tokens[0] entity_pos = entity_pos[0] entity_tags = entity_tags[0] for n_sample, (doc, ent_pos, ent_tags) in enumerate(zip(tokens, entity_pos, entity_tags)): # valid scenario if isinstance(ent_pos, list) and len(ent_pos) == 2: count = 0 doc_wordpiece_tokens = [] entity1_pos_start = list(zip(*ent_pos[0]))[0] # first entity mentions' start positions entity1_pos_end = list(zip(*ent_pos[0]))[1] # first entity mentions' end positions entity2_pos_start = list(zip(*ent_pos[1]))[0] # second entity mentions' start positions entity2_pos_end = list(zip(*ent_pos[1]))[1] # second entity mentions' end positions upd_entity1_pos_start, upd_entity2_pos_start, upd_entity1_pos_end, upd_entity2_pos_end = [], [], [], [] for n, token in enumerate(doc): if n in entity1_pos_start: doc_wordpiece_tokens.append(self.special_token) upd_entity1_pos_start.append(count) count += 1 if n in entity1_pos_end: doc_wordpiece_tokens.append(self.special_token) count += 1 upd_entity1_pos_end.append(count) if n in entity2_pos_start: doc_wordpiece_tokens.append(self.special_token) upd_entity2_pos_start.append(count) count += 1 if n in entity2_pos_end: doc_wordpiece_tokens.append(self.special_token) count += 1 upd_entity2_pos_end.append(count) word_tokens = self.tokenizer.tokenize(token) doc_wordpiece_tokens += word_tokens count += len(word_tokens) # special case when the entity is the last in the doc if len(doc) in entity1_pos_end: doc_wordpiece_tokens.append(self.special_token) count += 1 upd_entity1_pos_end.append(count) if len(doc) in entity2_pos_end: doc_wordpiece_tokens.append(self.special_token) count += 1 upd_entity2_pos_end.append(count) word_tokens = self.tokenizer.tokenize(token) doc_wordpiece_tokens += word_tokens count += len(word_tokens) upd_entity_1_pos = list(zip(upd_entity1_pos_start, upd_entity1_pos_end)) upd_entity_2_pos = list(zip(upd_entity2_pos_start, upd_entity2_pos_end)) 
# text entities for self check upd_entity1_text = [doc_wordpiece_tokens[ent_m[0]:ent_m[1]] for ent_m in upd_entity_1_pos] upd_entity2_text = [doc_wordpiece_tokens[ent_m[0]:ent_m[1]] for ent_m in upd_entity_2_pos] enc_entity_tags = self.encode_ner_tag(ent_tags) encoding = self.tokenizer.encode_plus( doc_wordpiece_tokens[:self.max_seq_length], # truncate tokens add_special_tokens=True, truncation=True, max_length=self.max_seq_length, pad_to_max_length=True, return_attention_mask=True ) upd_entity_pos.append([upd_entity_1_pos, upd_entity_2_pos]) nf_samples.append(0) # api test scenario else: # for api test: dump values of entity tags and entity pos encoding = self.tokenizer.encode_plus( doc, add_special_tokens=True, truncation=True, max_length=self.max_seq_length, pad_to_max_length=True, return_attention_mask=True ) upd_entity_pos.append([[(0, 1)], [(0, 1)]]) enc_entity_tags = self.encode_ner_tag([self.default_tag] * 2) nf_samples.append(1) input_ids.append(encoding['input_ids']) attention_mask.append(encoding['attention_mask']) upd_entity_tags.append(enc_entity_tags) return input_ids, attention_mask, upd_entity_pos, upd_entity_tags, nf_samples def encode_ner_tag(self, ner_tags: List) -> List: """ Encode NER tags with one hot encodings """ enc_ner_tags = [] for ner_tag in ner_tags: ner_tag_one_hot = [0] * len(self.ner2id) ner_tag_one_hot[self.ner2id[ner_tag]] = 1 enc_ner_tags.append(ner_tag_one_hot) return enc_ner_tags @register('re_postprocessor') class REPostprocessor: def __init__(self, rel2id_path: str, rel2label_path: str, **kwargs): self.rel2id_path = rel2id_path self.rel2label_path = rel2label_path self.rel2id = read_json(str(expand_path(self.rel2id_path))) self.id2rel = {rel_id: rel for rel, rel_id in self.rel2id.items()} self.rel2label = read_json(str(expand_path(self.rel2label_path))) def __call__(self, model_output: List, nf_samples: List) -> Tuple[List[str], List[str]]: """ The model output is transformed to the relation id and relation name Args: 
model_output: List of probability vectors nf_samples: contains the information about true and fake samples (0 - true sample and should be included to the output, 1 - fake sample) Return: wikidata_relation_id: List of wiki ids of found relations relation_name: List of names of found relations """ wikidata_relation_id, relation_name = [], [] for predictions, nf_sample in zip(model_output, nf_samples): if nf_sample: wikidata_relation_id.append("-") relation_name.append("-") else: rel_indices = np.nonzero(predictions)[0] for index in rel_indices: if index == 0: wikidata_relation_id.append("-") relation_name.append("no relation") continue rel_p = self.id2rel[index] wikidata_relation_id.append(rel_p) if rel_p in self.rel2label: relation_name.append(self.rel2label[rel_p]) else: relation_name.append("-") return wikidata_relation_id, relation_name ================================================ FILE: deeppavlov/models/preprocessors/response_base_loader.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import sys from logging import getLogger import numpy as np from deeppavlov.core.common.registry import register from deeppavlov.core.models.serializable import Serializable logger = getLogger(__name__) @register('response_base_loader') class ResponseBaseLoader(Serializable): """Class for loading a base with text responses (and contexts) and their vector representations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.resps = None self.resp_vecs = None self.conts = None self.cont_vecs = None self.load() def load(self): if self.load_path is not None: resp_file = self.load_path / "responses.csv" if resp_file.exists(): with open(resp_file) as f: responses = f.readlines() self.resps = [el.strip('#\n') for el in responses] else: logger.error("Please provide responses.csv file to the {} directory".format(self.load_path)) sys.exit(1) resp_vec_file = self.load_path / "resp_vecs.npy" if resp_vec_file.exists(): self.resp_vecs = np.load(resp_vec_file) cont_file = self.load_path / "contexts.csv" if cont_file.exists(): with open(cont_file) as f: contexts = f.readlines() self.conts = [el.strip('#\n') for el in contexts] else: logger.error("Please add contexts.csv file to the {} directory".format(self.load_path)) sys.exit(1) cont_vec_file = self.load_path / "cont_vecs.npy" if cont_vec_file.exists(): self.cont_vecs = np.load(cont_vec_file) def save(self): logger.error("The method save of the {} class is not used.".format(self.__class__)) ================================================ FILE: deeppavlov/models/preprocessors/sanitizer.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import re import sys import unicodedata from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component @register('sanitizer') class Sanitizer(Component): """Remove all combining characters like diacritical marks from tokens Args: diacritical: whether to remove diacritical signs or not diacritical signs are something like hats and stress marks nums: whether to replace all digits with 1 or not """ def __init__(self, diacritical: bool = True, nums: bool = False, *args, **kwargs) -> None: self.diacritical = diacritical self.nums = nums self.combining_characters = dict.fromkeys([c for c in range(sys.maxunicode) if unicodedata.combining(chr(c))]) def filter_diacritical(self, tokens_batch): """Takes batch of tokens and returns the batch with sanitized tokens""" sanitized_batch = [] for utterance in tokens_batch: sanitized_utterance = [] for token in utterance: token = unicodedata.normalize('NFD', token) sanitized_utterance.append(token.translate(self.combining_characters)) sanitized_batch.append(sanitized_utterance) return sanitized_batch def replace_nums(self, tokens_batch): sanitized_batch = [] for utterance in tokens_batch: sanitized_batch.append([re.sub('[0-9]', '1', token) for token in utterance]) return sanitized_batch def __call__(self, tokens_batch, **kwargs): if self.filter_diacritical: tokens_batch = self.filter_diacritical(tokens_batch) if self.nums: tokens_batch = self.replace_nums(tokens_batch) return tokens_batch ================================================ FILE: 
deeppavlov/models/preprocessors/sentseg_preprocessor.py ================================================ from typing import List from deeppavlov.core.common.registry import register @register("sentseg_restore_sent") def SentSegRestoreSent(batch_words: List[List[str]], batch_tags: List[List[str]]) -> List[str]: ret = [] for words, tags in zip(batch_words, batch_tags): if len(tags) == 0: ret.append("") continue sent = words[0] punct = "" if tags[0] == "O" else tags[0][-1] for word, tag in zip(words[1:], tags[1:]): if tag != "O": sent += punct punct = tag[-1] sent += " " + word sent += punct ret.append(sent) return ret ================================================ FILE: deeppavlov/models/preprocessors/squad_preprocessor.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import bisect from logging import getLogger from typing import List, Dict from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component logger = getLogger(__name__) @register('squad_bert_mapping') class SquadBertMappingPreprocessor(Component): """Create mapping from BERT subtokens to their characters positions and vice versa. 
Args: do_lower_case: set True if lowercasing is needed """ def __init__(self, do_lower_case: bool = True, *args, **kwargs): self.do_lower_case = do_lower_case def __call__(self, contexts_batch, bert_features_batch, subtokens_batch, **kwargs): subtok2chars_batch: List[List[Dict[int, int]]] = [] char2subtoks_batch: List[List[Dict[int, int]]] = [] for batch_counter, (context_list, features_list, subtokens_list) in \ enumerate(zip(contexts_batch, bert_features_batch, subtokens_batch)): subtok2chars_list, char2subtoks_list = [], [] for context, features, subtokens in zip(context_list, features_list, subtokens_list): if self.do_lower_case: context = context.lower() context_start = subtokens.index('[SEP]') + 1 idx = 0 subtok2char: Dict[int, int] = {} char2subtok: Dict[int, int] = {} for i, subtok in list(enumerate(subtokens))[context_start:-1]: subtok = subtok[2:] if subtok.startswith('##') else subtok subtok_pos = context[idx:].find(subtok) if subtok_pos == -1: # it could be UNK idx += 1 # len was at least one else: # print(k, '\t', t, p + idx) idx += subtok_pos subtok2char[i] = idx for j in range(len(subtok)): char2subtok[idx + j] = i idx += len(subtok) subtok2chars_list.append(subtok2char) char2subtoks_list.append(char2subtok) subtok2chars_batch.append(subtok2chars_list) char2subtoks_batch.append(char2subtoks_list) return subtok2chars_batch, char2subtoks_batch @register('squad_bert_ans_preprocessor') class SquadBertAnsPreprocessor(Component): """Create answer start and end positions in subtokens. 
Args: do_lower_case: set True if lowercasing is needed """ def __init__(self, do_lower_case: bool = True, *args, **kwargs): self.do_lower_case = do_lower_case def __call__(self, answers_raw, answers_start, char2subtoks, **kwargs): answers, starts, ends = [], [], [] for answers_raw, answers_start, c2sub in zip(answers_raw, answers_start, char2subtoks): answers.append([]) starts.append([]) ends.append([]) for ans, ans_st in zip(answers_raw, answers_start): if self.do_lower_case: ans = ans.lower() try: indices = {c2sub[0][i] for i in range(ans_st, ans_st + len(ans)) if i in c2sub[0]} st = min(indices) end = max(indices) except ValueError: # 0 - CLS token st, end = 0, 0 ans = '' starts[-1] += [st] ends[-1] += [end] answers[-1] += [ans] return answers, starts, ends @register('squad_bert_ans_postprocessor') class SquadBertAnsPostprocessor(Component): """Extract answer and create answer start and end positions in characters from subtoken positions.""" def __init__(self, *args, **kwargs): pass def __call__(self, answers_start_batch, answers_end_batch, contexts_batch, subtok2chars_batch, subtokens_batch, ind_batch, *args, **kwargs): answers = [] starts = [] ends = [] for answer_st, answer_end, context_list, sub2c_list, subtokens_list, ind in \ zip(answers_start_batch, answers_end_batch, contexts_batch, subtok2chars_batch, subtokens_batch, ind_batch): sub2c = sub2c_list[ind] subtok = subtokens_list[ind][answer_end] context = context_list[ind] # CLS token is no_answer token if answer_st == 0 or answer_end == 0: answers += [''] starts += [-1] ends += [-1] else: st = self.get_char_position(sub2c, answer_st) end = self.get_char_position(sub2c, answer_end) subtok = subtok[2:] if subtok.startswith('##') else subtok answer = context[st:end + len(subtok)] answers += [answer] starts += [st] ends += [ends] return answers, starts, ends @staticmethod def get_char_position(sub2c, sub_pos): keys = list(sub2c.keys()) found_idx = bisect.bisect(keys, sub_pos) if found_idx == 0: return 
sub2c[keys[0]] return sub2c[keys[found_idx - 1]] ================================================ FILE: deeppavlov/models/preprocessors/str_lower.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Union from deeppavlov.core.common.registry import register @register('str_lower') def str_lower(batch: Union[str, list, tuple]): """Recursively search for strings in a list and convert them to lowercase Args: batch: a string or a list containing strings at some level of nesting Returns: the same structure where all strings are converted to lowercase """ if isinstance(batch, str): return batch.lower() else: return list(map(str_lower, batch)) ================================================ FILE: deeppavlov/models/preprocessors/str_token_reverser.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from typing import List, Union from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component StrTokenReverserInfo = Union[List[str], List['StrTokenReverserInfo']] @register('str_token_reverser') class StrTokenReverser(Component): """Component for converting strings to strings with reversed token positions Args: tokenized: The parameter is only needed to reverse tokenized strings. """ def __init__(self, tokenized: bool = False, *args, **kwargs) -> None: self.tokenized = tokenized @staticmethod def _reverse_str(raw_string): splitted = raw_string.split() splitted.reverse() string = ' '.join(splitted) return string @staticmethod def _reverse_tokens(raw_tokens): raw_tokens.reverse() return raw_tokens def __call__(self, batch: Union[str, list, tuple]) -> StrTokenReverserInfo: """Recursively search for strings in a list and convert them to strings with reversed token positions Args: batch: a string or a list containing strings Returns: the same structure where all strings tokens are reversed """ if isinstance(batch, (list, tuple)): batch = batch.copy() if self.tokenized: if isinstance(batch, (list, tuple)): if isinstance(batch[-1], str): return self._reverse_tokens(batch) else: return [self(line) for line in batch] raise RuntimeError(f'The objects passed to the reverser are not list or tuple! ' f' But they are {type(batch)}.' 
f' If you want to passed str type directly use option tokenized = False') else: if isinstance(batch, (list, tuple)): return [self(line) for line in batch] else: return self._reverse_str(batch) ================================================ FILE: deeppavlov/models/preprocessors/str_utf8_encoder.py ================================================ # originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/data.py # Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections import Counter, OrderedDict from itertools import chain from logging import getLogger from typing import Union, List, Tuple import numpy as np from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.models.estimator import Estimator log = getLogger(__name__) StrUTF8EncoderInfo = Union[List[str], List['StrUTF8EncoderInfo']] @register('str_utf8_encoder') class StrUTF8Encoder(Estimator): """Component for encoding all strings to utf8 codes Args: max_word_length: Max length of words of input and output batches. pad_special_char_use: Whether to use special char for padding or not. word_boundary_special_char_use: Whether to add word boundaries by special chars or not. sentence_boundary_special_char_use: Whether to add word boundaries by special chars or not. reversed_sentense_tokens: Whether to use reversed sequences of tokens or not. 
bos: Name of a special token of the begin of a sentence. eos: Name of a special token of the end of a sentence. """ def __init__(self, max_word_length: int = 50, pad_special_char_use: bool = False, word_boundary_special_char_use: bool = False, sentence_boundary_special_char_use: bool = False, reversed_sentense_tokens: bool = False, bos: str = '', eos: str = '', **kwargs) -> None: super().__init__(**kwargs) if word_boundary_special_char_use and max_word_length < 3: raise ConfigError(f"`max_word_length` should be more than 3!") if max_word_length < 1: raise ConfigError(f"`max_word_length` should be more than 1!") self._max_word_length = max_word_length self._reverse = reversed_sentense_tokens self._pad_special_char_use = pad_special_char_use self._word_boundary_special_char_use = word_boundary_special_char_use self._sentence_boundary_special_char_use = sentence_boundary_special_char_use # char ids 0-255 come from utf-8 encoding bytes # assign 256-300 to special chars self.bos_char = 256 # self.eos_char = 257 # self.bow_char = 258 # self.eow_char = 259 # self.pad_char = 260 # self._len = 261 # an upper bound of all indexes # the charcter representation of the begin/end of sentence characters def _make_bos_eos(indx): indx = np.array([indx], dtype=np.int32) if self._word_boundary_special_char_use: code = np.pad(indx, (1, 1), 'constant', constant_values=(self.bow_char, self.eow_char)) else: code = indx if self._pad_special_char_use: code = np.pad(code, (0, self._max_word_length - code.shape[0]), 'constant', constant_values=(self.pad_char)) else: pass return code self.bos_chars = _make_bos_eos(self.bos_char) self.eos_chars = _make_bos_eos(self.eos_char) if self._sentence_boundary_special_char_use: self._eos_chars = [self.eos_chars] self._bos_chars = [self.bos_chars] else: self._eos_chars = [] self._bos_chars = [] if self.load_path: self.load() else: self.tokens = [] self._word_char_ids = OrderedDict() for token in self.tokens: self._word_char_ids[token] = 
self._convert_word_to_char_ids(token) self._word_char_ids[bos] = self.bos_chars self._word_char_ids[eos] = self.eos_chars def __call__(self, batch: Union[List[str], Tuple[str]]) -> StrUTF8EncoderInfo: """Recursively search for strings in a list and utf8 encode Args: batch: a string or a list containing strings Returns: the same structure where all strings are utf8 encoded """ if isinstance(batch, (list, tuple)): if isinstance(batch[-1], str): return self._encode_chars(batch) else: return [self(line) for line in batch] raise RuntimeError(f'The objects passed to the reverser are not list or tuple of str! ' f' But they are {type(batch)}.') def load(self) -> None: if self.load_path: if self.load_path.is_file(): log.debug(f"[loading vocabulary from {self.load_path}]") self.tokens = [] for ln in self.load_path.open('r', encoding='utf8'): token = ln.strip().split()[0] self.tokens.append(token) else: raise ConfigError(f"Provided `load_path` for {self.__class__.__name__} doesn't exist!") else: raise ConfigError(f"`load_path` for {self} is not provided!") def save(self) -> None: log.info(f"[saving vocabulary to {self.save_path}]") with self.save_path.open('wt', encoding='utf8') as f: for token in self._word_char_ids.keys(): f.write('{}\n'.format(token)) def fit(self, *args) -> None: words = chain(*args) # filter(None, <>) -- to filter empty words freqs = Counter(filter(None, chain(*words))) for token, _ in freqs.most_common(): if not (token in self._word_char_ids): self._word_char_ids[token] = self._convert_word_to_char_ids(token) def _convert_word_to_char_ids(self, word): code = np.zeros([self._max_word_length], dtype=np.int32) if self._pad_special_char_use: code[:] = self.pad_char if self._word_boundary_special_char_use: word_encoded = word.encode('utf-8', 'ignore')[:self._max_word_length - 2] code[0] = self.bow_char for k, chr_id in enumerate(word_encoded, start=1): code[k] = chr_id code[len(word_encoded) + 1] = self.eow_char else: word_encoded = word.encode('utf-8', 
'ignore')[:self._max_word_length] for k, chr_id in enumerate(word_encoded): code[k] = chr_id if not self._pad_special_char_use: if self._word_boundary_special_char_use: code = code[:len(word_encoded) + 2] else: code = code[:len(word_encoded)] return code def _word_to_char_ids(self, word): if word in self._word_char_ids: return self._word_char_ids[word] else: return self._convert_word_to_char_ids(word) def _encode_chars(self, sentence): """ Encode the sentence as a white space delimited string of tokens. """ chars_ids = [self._word_to_char_ids(cur_word) for cur_word in sentence] return self._wrap_in_s_char(chars_ids) def _wrap_in_s_char(self, chars_ids): chars_ids = chars_ids if self._pad_special_char_use else list(chars_ids) if self._reverse: ret = self._eos_chars + chars_ids + self._bos_chars else: ret = self._bos_chars + chars_ids + self._eos_chars return np.vstack(ret) if self._pad_special_char_use else ret def __len__(self): return self._len @property def len(self): """ An upper bound of all indexes. """ return len(self) ================================================ FILE: deeppavlov/models/preprocessors/torch_transformers_preprocessor.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
@register('torch_transformers_multiplechoice_preprocessor')
class TorchTransformersMultiplechoicePreprocessor(Component):
    """Tokenize (context, choice) pairs for multiple-choice models.

    Every example consists of several (context, choice) pairs. All pairs of a
    batch are encoded, padded to a common length and reshaped so that the
    choices of one example sit along the second tensor dimension.

    Args:
        vocab_file: path to a vocabulary file, or an identifier accepted by
            ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        tokenizer: the tokenizer instance used for encoding and padding
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # NOTE(review): transformers documents AutoTokenizer as being created
            # through `from_pretrained` only -- confirm that this single-file
            # branch is actually reachable with the pinned transformers version.
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           **kwargs)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file,
                                                           do_lower_case=do_lower_case,
                                                           **kwargs)

    def tokenize_mc_examples(self,
                             contexts: List[List[str]],
                             choices: List[List[str]]) -> Dict[str, torch.tensor]:
        """Encode all (context, choice) pairs and group them per example.

        Args:
            contexts: for every example, the context repeated once per choice
            choices: for every example, the candidate choices

        Returns:
            the padded tokenizer output where every tensor is viewed as
            ``(batch_size, num_choices, -1)``
        """
        # The number of choices is taken from the first example; `.view` below
        # requires every example to have the same number of pairs.
        num_choices = len(contexts[0])
        batch_size = len(contexts)

        # Tokenize examples in groups of `num_choices`.
        examples = []
        for context_list, choice_list in zip(contexts, choices):
            for context, choice in zip(context_list, choice_list):
                # `max_length` makes truncation honour the configured limit; without
                # it only the tokenizer's own model limit would apply.
                tokenized_input = self.tokenizer.encode_plus(text=context,
                                                             text_pair=choice,
                                                             return_attention_mask=True,
                                                             add_special_tokens=True,
                                                             truncation=True,
                                                             max_length=self.max_seq_length)
                examples.append(tokenized_input)

        # `padding=True` pads to the longest sequence of the batch.
        padded_examples = self.tokenizer.pad(
            examples,
            padding=True,
            max_length=self.max_seq_length,
            return_tensors='pt',
        )
        return {k: v.view(batch_size, num_choices, -1) for k, v in padded_examples.items()}

    def __call__(self,
                 texts_a: List[List[str]],
                 texts_b: List[List[str]] = None) -> Dict[str, torch.tensor]:
        """Tokenize and create masks.

        Args:
            texts_a: for every example, the list of contexts (one per choice)
            texts_b: for every example, the list of choices

        Returns:
            dict of tensors produced by :meth:`tokenize_mc_examples`, or an empty
            list when either input (or its first element) is empty or missing
        """
        input_features = []
        if texts_a and texts_b and texts_a[0] and texts_b[0]:
            input_features = self.tokenize_mc_examples(texts_a, texts_b)
        return input_features
do_lower_case: set True if lowercasing is needed max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens Attributes: max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens tokenizer: instance of Bert FullTokenizer """ def __init__(self, vocab_file: str, do_lower_case: bool = True, max_seq_length: int = 512, **kwargs) -> None: self.max_seq_length = max_seq_length self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case, **kwargs) def __call__(self, texts_a: List, texts_b: Optional[List[str]] = None) -> Union[List[InputFeatures], Tuple[List[InputFeatures], List[List[str]]]]: """Tokenize and create masks. texts_a and texts_b are separated by [SEP] token Args: texts_a: list of texts, texts_b: list of texts, it could be None, e.g. single sentence classification task Returns: batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \ subtoken mask, segment mask, or tuple of batch of InputFeatures and Batch of subtokens """ # in case of iterator's strange behaviour if isinstance(texts_a, tuple): texts_a = list(texts_a) elif isinstance(texts_a, str): raise TypeError(f'Received string {texts_a} as an input! 
@register('torch_transformers_entity_ranker_preprocessor')
class TorchTransformersEntityRankerPreprocessor(Component):
    """Tokenizes texts into subtokens, encodes them with indices and locates
    the first occurrence of a special token id in every encoded text.

    Args:
        vocab_file: path to vocabulary
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        special_tokens: list of special tokens
        special_token_id: id of special token
        return_special_tokens_pos: whether to return positions of found special tokens
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 special_tokens: List[str] = None,
                 special_token_id: int = None,
                 return_special_tokens_pos: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.do_lower_case = do_lower_case
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
        if special_tokens is not None:
            self.tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
        self.special_token_id = special_token_id
        self.return_special_tokens_pos = return_special_tokens_pos

    def __call__(self, texts_a: List[str]) -> Tuple[Any, List[int]]:
        """Tokenize and find special tokens positions.

        Args:
            texts_a: list of texts

        Returns:
            the batch encoding produced by the tokenizer; when
            ``return_special_tokens_pos`` is set, a tuple of that encoding and,
            for every text, the index of the first ``special_token_id`` in its
            individually encoded ids (0 if the id does not occur)
        """
        # in case of iterator's strange behaviour
        if isinstance(texts_a, tuple):
            texts_a = list(texts_a)
        if self.do_lower_case:
            texts_a = [text.lower() for text in texts_a]

        # First pass: encode every text on its own to learn the batch's longest length.
        input_ids_batch = [
            self.tokenizer.encode_plus(text_a,
                                       add_special_tokens=True,
                                       pad_to_max_length=True,
                                       return_attention_mask=True)["input_ids"]
            for text_a in texts_a
        ]
        longest = max(len(input_ids) for input_ids in input_ids_batch)
        max_length = min(longest, self.max_seq_length)

        # Second pass: encode the whole batch padded/truncated to the common length.
        input_features = self.tokenizer(text=texts_a,
                                        add_special_tokens=True,
                                        max_length=max_length,
                                        padding='max_length',
                                        return_attention_mask=True,
                                        truncation=True,
                                        return_tensors='pt')

        # Positions are searched in the first-pass ids; a missing id maps to 0.
        special_tokens_pos = [
            next((pos for pos, input_id in enumerate(input_ids_list)
                  if input_id == self.special_token_id), 0)
            for input_ids_list in input_ids_batch
        ]

        if self.return_special_tokens_pos:
            return input_features, special_tokens_pos
        return input_features
Args: vocab_file: path to vocabulary do_lower_case: set True if lowercasing is needed max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens Attributes: max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens tokenizer: instance of Bert FullTokenizer """ def __init__(self, vocab_file: str, do_lower_case: bool = True, max_seq_length: int = 512, add_token_type_ids: bool = False, **kwargs) -> None: self.max_seq_length = max_seq_length self.add_token_type_ids = add_token_type_ids if Path(vocab_file).is_file(): vocab_file = str(expand_path(vocab_file)) self.tokenizer = AutoTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) else: self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case) def __call__(self, question_batch: List[str], context_batch: Optional[List[str]] = None) -> Union[ List[InputFeatures], Tuple[List[InputFeatures], List[List[str]]]]: """Tokenize and create masks. texts_a_batch and texts_b_batch are separated by [SEP] token Args: texts_a_batch: list of texts, texts_b_batch: list of texts, it could be None, e.g. 
single sentence classification task Returns: batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \ subtoken mask, segment mask, or tuple of batch of InputFeatures, batch of subtokens and batch of split paragraphs """ if context_batch is None: context_batch = [None] * len(question_batch) input_features_batch, tokens_batch, split_context_batch = [], [], [] for question, context in zip(question_batch, context_batch): question_list, context_list = [], [] context_subtokens = self.tokenizer.tokenize(context) question_subtokens = self.tokenizer.tokenize(question) max_chunk_len = self.max_seq_length - len(question_subtokens) - 3 if 0 < max_chunk_len < len(context_subtokens): number_of_chunks = math.ceil(len(context_subtokens) / max_chunk_len) sentences = nltk.sent_tokenize(context) for chunk in np.array_split(sentences, number_of_chunks): context_list += [' '.join(chunk)] question_list += [question] else: context_list += [context] question_list += [question] input_features_list, tokens_list = [], [] for question_elem, context_elem in zip(question_list, context_list): encoded_dict = self.tokenizer.encode_plus( text=question_elem, text_pair=context_elem, add_special_tokens=True, max_length=self.max_seq_length, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt') if 'token_type_ids' not in encoded_dict: if self.add_token_type_ids: input_ids = encoded_dict['input_ids'] seq_len = input_ids.size(1) sep = torch.where(input_ids == self.tokenizer.sep_token_id)[1][0].item() len_a = min(sep + 1, seq_len) len_b = seq_len - len_a encoded_dict['token_type_ids'] = torch.cat((torch.zeros(1, len_a, dtype=int), torch.ones(1, len_b, dtype=int)), dim=1) else: encoded_dict['token_type_ids'] = torch.tensor([0]) curr_features = InputFeatures(input_ids=encoded_dict['input_ids'], attention_mask=encoded_dict['attention_mask'], token_type_ids=encoded_dict['token_type_ids'], label=None) 
@register('rel_ranking_preprocessor')
class RelRankingPreprocessor(Component):
    """Class for tokenization of text and relation labels

    Args:
        vocab_file: identifier or path passed to ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)

    def __call__(self, questions_batch: List[List[str]], rels_batch: List[List[str]] = None) -> Dict[str, torch.tensor]:
        """Tokenize questions and relations.

        Every question is encoded as the first segment and its relations as the
        second segment; all pairs are padded to one common length.

        Args:
            questions_batch: list of texts
            rels_batch: for every question, a list of relation labels (joined
                with spaces) or an already joined string; if omitted, questions
                are encoded without a second segment

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            tensors; when the tokenizer yields no token type ids each row of
            ``token_type_ids`` is the single value 0
        """
        if rels_batch is None:
            # The declared default used to crash in `zip`; encode single segments instead.
            rels_batch = [None] * len(questions_batch)

        # First pass: measure the encoded length of every pair.
        lengths, proc_rels_batch = [], []
        for question, rels_list in zip(questions_batch, rels_batch):
            rels_str = " ".join(rels_list) if isinstance(rels_list, list) else rels_list
            encoding = self.tokenizer.encode_plus(text=question,
                                                  text_pair=rels_str,
                                                  return_attention_mask=True,
                                                  add_special_tokens=True,
                                                  truncation=True)
            lengths.append(len(encoding["input_ids"]))
            proc_rels_batch.append(rels_str)

        # Pad to the longest pair of the batch, but never beyond the configured
        # limit; `default=0` keeps an empty batch from raising.
        max_len = min(max(lengths, default=0), self.max_seq_length)

        # Second pass: encode again with the common length.
        input_ids_batch, attention_mask_batch, token_type_ids_batch = [], [], []
        for question, rels_str in zip(questions_batch, proc_rels_batch):
            encoding = self.tokenizer.encode_plus(text=question,
                                                  text_pair=rels_str,
                                                  truncation=True,
                                                  max_length=max_len,
                                                  padding='max_length',
                                                  return_attention_mask=True)
            input_ids_batch.append(encoding["input_ids"])
            attention_mask_batch.append(encoding["attention_mask"])
            if "token_type_ids" in encoding:
                token_type_ids_batch.append(encoding["token_type_ids"])
            else:
                token_type_ids_batch.append([0])

        input_features = {"input_ids": torch.LongTensor(input_ids_batch),
                          "attention_mask": torch.LongTensor(attention_mask_batch),
                          "token_type_ids": torch.LongTensor(token_type_ids_batch)}
        return input_features
@register('torch_transformers_ner_preprocessor')
class TorchTransformersNerPreprocessor(Component):
    """
    Takes tokens and splits them into bert subtokens, encodes subtokens with their indices.
    Creates a mask of subtokens (one for the first subtoken, zero for the others).

    If tags are provided, calculates tags for subtokens.

    Args:
        vocab_file: path to vocabulary
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: replace a token with [UNK] if it is split into more subtokens
            than this (defaults to None, which is equal to +infinity)
        token_masking_prob: probability of masking token while training
        provide_subword_tags: output tags for subwords or for words
        subword_mask_mode: subword to select inside word tokens, can be "first" or "last"
            (default="first")
        return_features: if True, returns answer in features format

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: max number of subtokens a single token may be split into
        tokenizer: the tokenizer instance used for subword splitting
        mode: value of the ``mode`` keyword argument (masking is applied only for 'train')
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 max_subword_length: int = None,
                 token_masking_prob: float = 0.0,
                 provide_subword_tags: bool = False,
                 subword_mask_mode: str = "first",
                 return_features: bool = False,
                 **kwargs):
        # Used to split raw strings: numbers with separators, word-like chunks, single symbols.
        self._re_tokenizer = re.compile(r"[\d]+[\d\.,]+[\d]+|[\w'\.:@]+|[^\w ]")
        self.provide_subword_tags = provide_subword_tags
        self.mode = kwargs.get('mode')
        self.max_seq_length = max_seq_length
        self.max_subword_length = max_subword_length
        self.subword_mask_mode = subword_mask_mode
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
        self.token_masking_prob = token_masking_prob
        self.return_features = return_features

    def __call__(self,
                 tokens: Union[List[List[str]], List[str]],
                 tags: List[List[str]] = None,
                 **kwargs):
        """Split tokens into subtokens and build ids, masks and (optionally) tags.

        Args:
            tokens: batch of token lists, or batch of raw strings which are first
                split into tokens with the internal regular expression
            tags: optional batch of per-token tags

        Returns:
            Depending on the configuration:

            * ``tags`` given and ``provide_subword_tags``: tokens, subword tokens,
              subword ids, attention mask, start-of-word markers, subword tags
            * ``tags`` given otherwise: the same tuple with word-level tags (``'X'``
              removed) in the last position, or a feature dict when ``return_features``
            * no ``tags``: tokens, subword tokens, subword ids, start-of-word markers,
              attention mask, token offsets -- or a feature dict when ``return_features``

        Raises:
            RuntimeError: if a subtokenized sequence is longer than ``max_seq_length``
        """
        tokens_offsets_batch = [[] for _ in tokens]
        if isinstance(tokens[0], str):
            # Raw strings: tokenize and remember character offsets of every token.
            tokens_batch = []
            tokens_offsets_batch = []
            for s in tokens:
                tokens_list = []
                tokens_offsets_list = []
                matches = tuple(self._re_tokenizer.finditer(s))
                for i, elem in enumerate(matches):
                    if (i == len(matches) - 1) and (elem[0][-1] == '.'):
                        # A trailing dot of the last match becomes a separate token.
                        # NOTE(review): when the last match is a lone '.', this also
                        # emits an empty token before it -- confirm this is intended.
                        tokens_list.append(elem[0][:-1])
                        tokens_list.append('.')
                        tokens_offsets_list.append((elem.start(), elem.end() - 1))
                        tokens_offsets_list.append((elem.end() - 1, elem.end()))
                    else:
                        tokens_list.append(elem[0])
                        tokens_offsets_list.append((elem.start(), elem.end()))
                tokens_batch.append(tokens_list)
                tokens_offsets_batch.append(tokens_offsets_list)
            tokens = tokens_batch

        subword_tokens, subword_tok_ids, startofword_markers, subword_tags = [], [], [], []
        for i, toks in enumerate(tokens):
            # Without tags every token gets the dummy tag 'O'.
            ys = ['O'] * len(toks) if tags is None else tags[i]
            assert len(toks) == len(ys), \
                f"toks({len(toks)}) should have the same length as ys({len(ys)})"
            sw_toks, sw_marker, sw_ys = \
                self._ner_bert_tokenize(toks,
                                        ys,
                                        self.tokenizer,
                                        self.max_subword_length,
                                        mode=self.mode,
                                        subword_mask_mode=self.subword_mask_mode,
                                        token_masking_prob=self.token_masking_prob)
            if self.max_seq_length is not None:
                if len(sw_toks) > self.max_seq_length:
                    raise RuntimeError(f"input sequence after bert tokenization"
                                       f" shouldn't exceed {self.max_seq_length} tokens.")
            subword_tokens.append(sw_toks)
            subword_tok_ids.append(self.tokenizer.convert_tokens_to_ids(sw_toks))
            startofword_markers.append(sw_marker)
            subword_tags.append(sw_ys)
            # The message reports the length of the subword tags, which is what is compared.
            assert len(sw_marker) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \
                f"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)})," \
                f" token ids({len(subword_tok_ids[-1])}) and ys({len(sw_ys)})" \
                f" for tokens = `{toks}` should match"

        subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0)
        startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0)
        attention_mask = Mask()(subword_tokens)

        if tags is not None:
            if self.provide_subword_tags:
                return tokens, subword_tokens, subword_tok_ids, \
                    attention_mask, startofword_markers, subword_tags
            else:
                nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags]
                # Sanity check only: mismatches are logged, not raised.
                for swts, swids, swms, ts in zip(subword_tokens,
                                                 subword_tok_ids,
                                                 startofword_markers,
                                                 nonmasked_tags):
                    if (len(swids) != len(swms)) or (len(ts) != sum(swms)):
                        log.warning('Not matching lengths of the tokenization!')
                        log.warning(f'Tokens len: {len(swts)}\n Tokens: {swts}')
                        log.warning(f'Markers len: {len(swms)}, sum: {sum(swms)}')
                        log.warning(f'Masks: {swms}')
                        log.warning(f'Tags len: {len(ts)}\n Tags: {ts}')
                if self.return_features:
                    # NOTE(review): `torch.Tensor(nonmasked_tags)` presumably expects
                    # numeric tags here -- confirm against the callers of this mode.
                    feature_list = ({'input_ids': torch.Tensor(subword_tok_ids),
                                     'attention_mask': torch.Tensor(attention_mask),
                                     'token_type_ids': torch.Tensor(startofword_markers),
                                     'labels': torch.Tensor(nonmasked_tags)})
                    return feature_list
                else:
                    return tokens, subword_tokens, subword_tok_ids, \
                        attention_mask, startofword_markers, nonmasked_tags
        if self.return_features:
            feature_list = ({'input_ids': torch.Tensor(subword_tok_ids),
                             'attention_mask': torch.Tensor(attention_mask),
                             'token_type_ids': torch.Tensor(startofword_markers)
                             })
            return feature_list
        else:
            return tokens, subword_tokens, subword_tok_ids, \
                startofword_markers, attention_mask, tokens_offsets_batch

    @staticmethod
    def _ner_bert_tokenize(tokens: List[str],
                           tags: List[str],
                           tokenizer: AutoTokenizer,
                           max_subword_len: int = None,
                           mode: str = None,
                           subword_mask_mode: str = "first",
                           token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
        """Split one sentence into subtokens with aligned markers and tags.

        Args:
            tokens: tokens of one sentence
            tags: tags aligned with ``tokens``; the tag ``'X'`` gives a zero marker
            tokenizer: tokenizer providing ``tokenize``
            max_subword_len: tokens split into more subtokens than this become ``[UNK]``
            mode: masking is applied only when this equals ``'train'``
            subword_mask_mode: ``"last"`` marks the last subtoken of a token,
                any other value marks the first one
            token_masking_prob: probability of replacing a token's subtokens with ``[MASK]``

        Returns:
            subtokens wrapped in ``[CLS]``/``[SEP]``, start-of-word markers and subword
            tags (``'X'`` for special and non-leading subtokens), all of equal length
        """
        do_masking = (mode == 'train') and (token_masking_prob is not None)
        do_cutting = (max_subword_len is not None)
        tokens_subword = ['[CLS]']
        startofword_markers = [0]
        tags_subword = ['X']
        for token, tag in zip(tokens, tags):
            token_marker = int(tag != 'X')
            subwords = tokenizer.tokenize(token)
            if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
                # Untokenizable or overly long tokens collapse to a single [UNK].
                tokens_subword.append('[UNK]')
                startofword_markers.append(token_marker)
                tags_subword.append(tag)
            else:
                if do_masking and (random.random() < token_masking_prob):
                    tokens_subword.extend(['[MASK]'] * len(subwords))
                else:
                    tokens_subword.extend(subwords)
                if subword_mask_mode == "last":
                    startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
                else:
                    startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
                # The tag is always attached to the first subtoken of the token.
                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        startofword_markers.append(0)
        tags_subword.append('X')
        return tokens_subword, startofword_markers, tags_subword
@dataclass
class RecordNestedExample:
    """A nested ReCoRD example.

    Holds the single predicted entity for one example index together with
    the list of correct answers for that index.
    """
    # index identifying the example
    index: str
    # the predicted entity
    prediction: str
    # entities that are correct answers
    answers: List[str]
class RecordExampleAccumulator:
    """ReCoRD example accumulator

    Collects flat (per-candidate) examples and combines them into one nested
    example per index once all candidates of that index have arrived.

    Attributes:
        examples_processed: total number of examples processed so far
        record_counter: number of examples processed for each index
        nested_len: expected number of flat examples for a given index
        flat_examples: stores flat examples
        nested_examples: stores nested examples
        collected_indices: indices of collected nested examples
        returned_indices: indices that have been returned
    """

    def __init__(self):
        self.examples_processed: int = 0
        self.record_counter: Dict[str, int] = defaultdict(int)
        self.nested_len: Dict[str, int] = dict()
        self.flat_examples: Dict[str, List[RecordFlatExample]] = defaultdict(list)
        self.nested_examples: Dict[str, RecordNestedExample] = dict()
        self.collected_indices: Set[str] = set()
        self.returned_indices: Set[str] = set()

    def add_flat_example(self, index: str, label: int, probability: float, entity: str):
        """Add a single flat example to the accumulator

        Args:
            index: example index
            label: example label (`-1` means that label is not available)
            probability: predicted probability
            entity: candidate entity
        """
        self.flat_examples[index].append(RecordFlatExample(index, label, probability, entity))
        if index not in self.nested_len:
            # The expected number of candidates is encoded in the index itself.
            self.nested_len[index] = self.get_expected_len(index)
        self.record_counter[index] += 1
        self.examples_processed += 1

    def ready_to_nest(self, index: str) -> bool:
        """Checks whether all the flat examples for a given index were collected at this point.

        Args:
            index: the index of the candidate nested example

        Returns:
            bool: indicates whether the collected flat examples can be combined into a nested example
        """
        return self.record_counter[index] == self.nested_len[index]

    def collect_nested_example(self, index: str):
        """Combines the flat examples of the given index into a single nested example,
        provided that all the necessary flat examples have been collected by this time.

        The prediction is the entity with the highest probability; the answers
        are the entities whose label equals 1.

        Args:
            index: the index of the candidate nested example
        """
        if not self.ready_to_nest(index):
            return
        example_list: List[RecordFlatExample] = self.flat_examples[index]
        entities: List[str] = [example.entity for example in example_list]
        probabilities: List[float] = [example.probability for example in example_list]
        answers: List[str] = [example.entity for example in example_list if example.label == 1]
        prediction = entities[np.argmax(probabilities)]
        self.nested_examples[index] = RecordNestedExample(index, prediction, answers)
        self.collected_indices.add(index)

    def return_examples(self) -> "List[RecordNestedExample]":
        """Determines which nested examples were not yet returned during the current
        evaluation cycle and returns them. May return an empty list if there are
        no new nested examples to return yet.

        Returns:
            List[RecordNestedExample]: zero or more nested examples, ordered by index
        """
        indices_to_return: Set[str] = self.collected_indices.difference(self.returned_indices)
        # Sorting makes the output order independent of set iteration order.
        examples_to_return = [self.nested_examples[index] for index in sorted(indices_to_return)]
        self.returned_indices.update(indices_to_return)
        log.debug(f'Returning {examples_to_return}')
        return examples_to_return

    @staticmethod
    def get_expected_len(index: str) -> int:
        """
        Calculates the total number of flat examples denoted by the given index

        Args:
            index: the index to calculate the number of examples for; the part after
                the last ``-`` must be an integer

        Returns:
            int: the expected number of examples for this index
        """
        return int(index.split("-")[-1])
def _pad(data: List[List[Union[int, float]]], value: Union[int, float] = 0):
    """Right-pad a ragged batch of numeric rows into a rectangular array.

    Args:
        data: batch of rows of possibly different lengths
        value: filler written to the positions past the end of each row; its
            Python type also determines the dtype of the result

    Returns:
        array of shape ``(len(data), longest_row_length)``; an empty batch
        yields an array of shape ``(0, 0)``
    """
    # ``default=0`` keeps an empty batch from raising ValueError in ``max``.
    max_len = max(map(len, data), default=0)
    res = np.ones([len(data), max_len], dtype=type(value)) * value
    for i, item in enumerate(data):
        res[i][:len(item)] = item
    return res
subtokens in subtokens_batch], add_special_tokens=False) return (tokens_batch, subtokens_batch, _pad(encoded['input_ids'], value=self.tokenizer.pad_token_id), _pad(startofword_markers_batch), _pad(encoded['attention_mask'])) ================================================ FILE: deeppavlov/models/ranking/__init__.py ================================================ ================================================ FILE: deeppavlov/models/ranking/metrics.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class ATLoss(nn.Module):
    """Adaptive-thresholding loss in which class 0 acts as the threshold class.

    Gold classes are trained to score above class 0, and class 0 is trained to
    score above every non-gold class.
    """

    def __init__(self):
        super().__init__()

    def forward(self, logits: Tensor, labels: Tensor) -> Tensor:
        """Compute the mean adaptive-thresholding loss over the batch.

        Args:
            logits: predicted scores (shape: batch size x num classes)
            labels: one-hot encoded true labels (shape: batch size x num classes);
                the tensor is left unmodified

        Returns:
            scalar tensor with the loss averaged over the batch
        """
        # Column 0 is zeroed below; do that on a private copy so the caller's
        # label tensor is not changed as a side effect of computing the loss.
        labels = labels.clone()

        # TH label: one-hot vector selecting the threshold class (column 0)
        th_label = torch.zeros_like(labels)
        th_label[:, 0] = 1.0
        labels[:, 0] = 0.0

        p_mask = labels + th_label  # = 1 for the gold labels and for class 0, 0 otherwise
        n_mask = 1 - labels  # = 0 for the gold labels, 1 otherwise

        # Rank positive classes to TH: classes outside p_mask are pushed down by 1e30,
        # which effectively removes them from the softmax
        logit1 = logits - (1 - p_mask) * 1e30
        loss1 = -(F.log_softmax(logit1, dim=-1) * labels).sum(1)

        # Rank TH to negative classes: gold classes are pushed down by 1e30
        logit2 = logits - (1 - n_mask) * 1e30
        loss2 = -(F.log_softmax(logit2, dim=-1) * th_label).sum(1)

        # Sum two parts
        loss = loss1 + loss2
        loss = loss.mean()
        return loss

    def get_label(self, logits: Tensor, num_labels: int = -1, threshold: float = None) -> Tensor:
        """Turn scores into a multi-hot prediction.

        Args:
            logits: predicted scores (shape: batch size x num classes)
            num_labels: if positive, keep at most the ``num_labels`` highest-scored
                classes per sample
            threshold: fixed score a class has to exceed; if ``None``, the score of
                class 0 of the same sample is used as the threshold

        Returns:
            tensor shaped like ``logits`` with 1.0 for predicted classes; column 0
            is 1.0 exactly for the samples where no other class was predicted
        """
        # ``is not None`` so that an explicit threshold of 0.0 is honoured; the threshold
        # tensor is created with the dtype and on the device of ``logits``.
        if threshold is not None:
            th_logit = torch.full((len(logits), 1), threshold, dtype=logits.dtype, device=logits.device)
        else:
            th_logit = logits[:, 0].unsqueeze(1)  # score of class 0 for every sample

        output = torch.zeros_like(logits)
        mask = (logits > th_logit)  # True where a class scores above the threshold

        if num_labels > 0:
            top_v, _ = torch.topk(logits, num_labels, dim=1)  # num_labels largest scores, sorted
            top_v = top_v[:, -1]  # the smallest of the kept scores for each sample
            mask = (logits >= top_v.unsqueeze(1)) & mask

        output[mask] = 1.0
        output[:, 0] = (output.sum(1) == 0.).to(logits)  # no relation if no label matched

        return output
None, return_probas: bool = False, threshold: Optional[float] = None, **kwargs ) -> None: """ Transformer-based model on PyTorch for relation extraction. It predicts a relation hold between entities in a text sample (one or several sentences). Args: n_classes: number of output classes num_ner_tags: number of NER tags pretrained_bert: key title of pretrained Bert model (e.g. "bert-base-uncased") return_probas: set this to `True` if you need the probabilities instead of raw answers threshold: manually set value for defining the positively predicted classes (instead of adaptive one) """ self.n_classes = n_classes self.return_probas = return_probas if self.n_classes == 0: raise ConfigError("Please provide a valid number of classes.") model = BertWithAdaThresholdLocContextPooling( n_classes=self.n_classes, pretrained_bert=pretrained_bert, bert_tokenizer_config_file=pretrained_bert, num_ner_tags=num_ner_tags, threshold=threshold, ) super().__init__(model, **kwargs) def train_on_batch( self, input_ids: List, attention_mask: List, entity_pos: List, entity_tags: List, labels: List ) -> float: """ Trains the relation extraction BERT model on the given batch. Returns: dict with loss and learning rate values. 
def __call__(
        self, input_ids: List, attention_mask: List, entity_pos: List, entity_tags: List
) -> Union[List[int], List[np.ndarray]]:
    """Get model predictions using features as input.

    Args:
        input_ids: batch of token ids, converted to a ``LongTensor`` on the model device
        attention_mask: batch of attention masks, converted the same way
        entity_pos: passed to the model unchanged as ``entity_pos``
        entity_tags: passed to the model unchanged as ``ner_tags``

    Returns:
        If ``return_probas`` is set: a float array with one one-hot row per sample
        marking the highest-scored class other than class 0. Otherwise the first
        output of the model converted to a numpy array. NaN values are replaced
        by 0 in both cases.
    """
    self.model.eval()

    _input = {
        'input_ids': torch.LongTensor(input_ids).to(self.device),
        'attention_mask': torch.LongTensor(attention_mask).to(self.device),
        'entity_pos': entity_pos,
        'ner_tags': entity_tags
    }

    with torch.no_grad():
        indices, probas = self.model(**_input)

    if self.return_probas:
        pred = probas.cpu().numpy()
        pred[np.isnan(pred)] = 0
        pred[:, 0] = 0.0  # eliminate no_relation predictions
        new_pred = np.argmax(pred, axis=1)
        # Build an independent row per sample: ``[[0.0] * n] * m`` would repeat one and
        # the same list m times, so setting a class in one row would set it in all rows.
        one_hot = [[0.0] * self.n_classes for _ in new_pred]
        for row, class_index in zip(one_hot, new_pred):
            row[class_index] = 1.0
        pred = np.array(one_hot)
    else:
        pred = indices.cpu().numpy()
        pred[np.isnan(pred)] = 0
    return pred
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import inspect import pickle from logging import getLogger from pathlib import Path from typing import List, Tuple, Union, Callable import numpy as np from scipy.sparse import issparse, csr_matrix from scipy.sparse import spmatrix from scipy.sparse import vstack, hstack from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register, cls_from_str from deeppavlov.core.models.estimator import Estimator log = getLogger(__name__) @register("sklearn_component") class SklearnComponent(Estimator): """ Class implements wrapper for sklearn components for feature extraction, feature selection, classification, regression etc. Args: model_class: string with full name of sklearn model to use, e.g. ``sklearn.linear_model:LogisticRegression`` save_path: save path for model, e.g. full name ``model_path/model.pkl`` \ or prefix ``model_path/model`` (still model will be saved to ``model_path/model.pkl``) load_path: load path for model, e.g. full name ``model_path/model.pkl`` \ or prefix ``model_path/model`` (still model will be loaded from ``model_path/model.pkl``) infer_method: string name of class method to use for infering model, \ e.g. ``predict``, ``predict_proba``, ``predict_log_proba``, ``transform`` ensure_list_output: whether to ensure that output for each sample is iterable (but not string) kwargs: dictionary with parameters for the sklearn model Attributes: model: sklearn model instance model_class: string with full name of sklearn model to use, e.g. 
def fit(self, *args) -> None:
    """
    Fit model on the given data

    Args:
        *args: list of x-inputs and, optionally, one y-input (the last one) to fit on.
            Possible input (x0, ..., xK, y) or (x0, ..., xK)
            where K is the number of input data elements (the length of list ``in`` from config). \
            In case of several inputs (K > 1) input features will be stacked. \
            For example, one has x0: (n_samples, n_features0), ..., xK: (n_samples, n_featuresK), \
            then model will be trained on x: (n_samples, n_features0 + ... + n_featuresK).

    Returns:
        None
    """
    n_inputs = len(self.pipe_params["in"]) if isinstance(self.pipe_params["in"], list) else 1
    x_features = self.compose_input_data(args[:n_inputs])
    if len(args) > n_inputs:
        y_ = np.squeeze(np.array(args[-1]))
    else:
        y_ = None

    log.info("Fitting model {}".format(self.model_class))
    try:
        self.model.fit(x_features, y_)
    # A tuple is required to catch both types: ``TypeError or ValueError`` evaluates
    # to ``TypeError`` alone, so a ValueError never reached the fallback below.
    except (TypeError, ValueError):
        # Retry once with the opposite representation of the features:
        # some models accept only dense input, others only sparse.
        if issparse(x_features):
            log.info("Converting input for model {} to dense array".format(self.model_class))
            self.model.fit(x_features.todense(), y_)
        else:
            log.info("Converting input for model {} to sparse array".format(self.model_class))
            self.model.fit(csr_matrix(x_features), y_)

    return
def load(self, fname: str = None) -> None:
    """
    Initialize ``self.model`` from a pickled model file, or from scratch if the file does not exist.

    After a successful load ``self.model_params`` is rebuilt from the attributes of the loaded model and
    ``self.model_class`` is rebuilt as ``<module>:<class name>`` of the loaded model. \
    If ``warm_start`` was set in the given parameters and the loaded model has a ``warm_start`` attribute, \
    ``warm_start`` is set to True in ``self.model_params``.

    Args:
        fname: string name of path to model to load from; ``self.load_path`` is used if not given. \
            The suffix of the path is replaced by ``.pkl``

    Returns:
        None
    """
    if fname is None:
        fname = self.load_path

    fname = Path(fname).with_suffix('.pkl')

    if fname.exists():
        log.debug("Loading model {} from {}".format(self.model_class, str(fname)))
        # NOTE(security): unpickling executes code stored in the file;
        # only load model files that come from a trusted source.
        with open(fname, "rb") as f:
            self.model = pickle.load(f)

        warm_start = self.model_params.get("warm_start", None)
        self.model_params = {param: getattr(self.model, param) for param in self.get_class_attributes(self.model)}
        # Keep the ``module:ClassName`` form used for ``model_class`` elsewhere in this component;
        # plain concatenation glued the module path and the class name together.
        self.model_class = "{}:{}".format(self.model.__module__, self.model.__class__.__name__)
        log.debug("Model {} loaded with parameters".format(self.model_class))

        if warm_start and "warm_start" in self.model_params.keys():
            self.model_params["warm_start"] = True
            log.debug("Fitting of loaded model can be continued because `warm_start` is set to True")
        else:
            log.warning("Fitting of loaded model can not be continued. Model can be fitted from scratch."
                        "If one needs to continue fitting, please, look at `warm_start` parameter")
    else:
        log.warning("Cannot load model from {}".format(str(fname)))
        self.init_from_scratch()

    return
@staticmethod
def get_function_params(f: Callable) -> List[str]:
    """
    Get list of names of given function's parameters

    Only the names reported in the ``args`` field of ``inspect.getfullargspec``
    are returned.

    Args:
        f: function

    Returns:
        list of names of given function's parameters
    """
    spec = inspect.getfullargspec(f)
    return spec.args
use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import csv import itertools from collections import defaultdict, Counter from heapq import heappop, heappushpop, heappush from logging import getLogger from math import log, exp from typing import List, Iterable, Tuple from tqdm import tqdm from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.models.estimator import Estimator from deeppavlov.vocabs.typos import StaticDictionary logger = getLogger(__name__) @register('spelling_error_model') class ErrorModel(Estimator): """Component that uses statistics based error model to find best candidates in a static dictionary. Based on An Improved Error Model for Noisy Channel Spelling Correction by Eric Brill and Robert C. 
Moore Args: dictionary: a :class:`~deeppavlov.vocabs.typos.StaticDictionary` object window: maximum context window size candidates_count: maximum number of replacement candidates to return for every token in the input Attributes: costs: logarithmic probabilities of character sequences replacements dictionary: a :class:`~deeppavlov.vocabs.typos.StaticDictionary` object window: maximum context window size candidates_count: maximum number of replacement candidates to return for every token in the input """ def __init__(self, dictionary: StaticDictionary, window: int = 1, candidates_count: int = 1, *args, **kwargs): super().__init__(*args, **kwargs) self.costs = defaultdict(itertools.repeat(float('-inf')).__next__) self.dictionary = dictionary self.window = window if self.window == 0: self.find_candidates = self._find_candidates_window_0 else: self.find_candidates = self._find_candidates_window_n self.costs[('', '')] = log(1) self.costs[('⟬', '⟬')] = log(1) self.costs[('⟭', '⟭')] = log(1) for c in self.dictionary.alphabet: self.costs[(c, c)] = log(1) # if self.ser_path.is_file(): self.load() self.candidates_count = candidates_count def _find_candidates_window_0(self, word, prop_threshold=1e-6): threshold = log(prop_threshold) d = {} prefixes_heap = [(0, {''})] candidates = [(float('-inf'), '') for _ in range(self.candidates_count)] word = '⟬{}⟭'.format(word.lower().replace('ё', 'е')) word_len = len(word) + 1 while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]: _, prefixes = heappop(prefixes_heap) for prefix in prefixes: res = [] for i in range(word_len): c = word[i - 1:i] res.append(max( (res[-1] + self.costs[('', c)]) if i else float('-inf'), d[prefix[:-1]][i] + self.costs[(prefix[-1], '')] if prefix else float( '-inf'), (d[prefix[:-1]][i - 1] + (self.costs[(prefix[-1], c)])) if prefix and i else float('-inf') ) if i or prefix else 0) d[prefix] = res if prefix in self.dictionary.words_set: heappushpop(candidates, (res[-1], prefix)) potential = max(res) if 
def _infer_instance(self, instance: List[str]) -> List[List[Tuple[float, str]]]:
    """Propose replacement candidates for every token of one sentence.

    A token is returned unchanged with score 0 when it contains a character
    outside of the dictionary alphabet or when no candidate was found for it.

    Args:
        instance: tokens of a sentence

    Returns:
        for every token, a list of ``(score, candidate)`` pairs
    """
    proposals = []
    for token in instance:
        unchanged = [(0, token)]
        if not all(char in self.dictionary.alphabet for char in token):
            proposals.append(unchanged)
            continue
        found = self.find_candidates(token, prop_threshold=1e-6)
        if not found:
            proposals.append(unchanged)
            continue
        # ``find_candidates`` yields (candidate, score); the output order is (score, candidate)
        proposals.append([(score, word) for word, score in found])
    return proposals
@staticmethod
def _distance_edits(seq1, seq2):
    """Compute an edit distance between two sequences together with the edits used.

    Deletions, insertions and substitutions are considered for every position,
    plus a swap of two adjacent elements. Matching elements are recorded as
    edits of cost 0. Cells of the first row and the first column carry their
    distance but no recorded edits.

    Args:
        seq1: source sequence
        seq2: target sequence

    Returns:
        tuple ``(distance, edits)`` where ``edits`` is a tuple of ``(from, to)`` pairs
    """
    rows, cols = len(seq1), len(seq2)

    table = [[(col, ()) for col in range(cols + 1)]]
    for row in range(1, rows + 1):
        table.append([(row, ())] + [(0, ())] * cols)

    for row in range(1, rows + 1):
        src = seq1[row - 1]
        for col in range(1, cols + 1):
            dst = seq2[col - 1]
            mismatch = src != dst

            del_cost, del_ops = table[row - 1][col]
            ins_cost, ins_ops = table[row][col - 1]
            sub_cost, sub_ops = table[row - 1][col - 1]
            # Order matters: on equal cost ``min`` keeps the earliest option.
            options = [
                (del_cost + 1, del_ops + ((src, ''),)),
                (ins_cost + 1, ins_ops + (('', dst),)),
                (sub_cost + mismatch, sub_ops + ((src, dst),)),
            ]

            swapped = row > 1 and col > 1 and src == seq2[col - 2] and seq1[row - 2] == dst
            if swapped:
                swap_cost, swap_ops = table[row - 2][col - 2]
                options.append((swap_cost + mismatch,
                                swap_ops + ((seq1[row - 2:row], seq2[col - 2:col]),)))

            table[row][col] = min(options, key=lambda option: option[0])

    return table[-1][-1]
def load(self):
    """Load replacements probabilities from a file

    Every row of the tab-separated file holds a replaced sequence, its replacement
    and the probability of the replacement; the logarithm of the probability is
    stored in ``self.costs``. A probability of zero is stored as ``-inf``.

    Raises:
        ConfigError: if ``load_path`` is not a file and its parent directory does not exist
    """
    if self.load_path:
        if self.load_path.is_file():
            logger.debug("loading error_model from `{}`".format(self.load_path))
            with open(self.load_path, 'r', newline='', encoding='utf8') as tsv_file:
                reader = csv.reader(tsv_file, delimiter='\t')
                for w, s, p in reader:
                    prob = float(p)
                    # ``save`` writes ``exp(cost)``, which is 0.0 for a cost of ``-inf``;
                    # ``log(0.0)`` would raise ValueError, so map zero back to ``-inf``.
                    self.costs[(w, s)] = log(prob) if prob > 0 else float('-inf')
        elif not self.load_path.parent.is_dir():
            raise ConfigError("Provided `load_path` for {} doesn't exist!".format(
                self.__class__.__name__))
    else:
        logger.warning('No load_path provided, initializing error model from scratch')
@register('kenlm_elector')
class KenlmElector(Component):
    """Component that chooses a candidate with the highest product of base and language model probabilities

    Args:
         load_path: path to the kenlm model file
         beam_size: beam size for highest probability search

    Attributes:
        lm: kenlm object
        beam_size: beam size for highest probability search
    """

    # End-of-sentence token scored after the last word; it is appended to every
    # hypothesis and stripped again before the sentence is returned.
    _EOS = '</s>'

    def __init__(self, load_path: Path, beam_size: int = 4, *args, **kwargs):
        self.lm = kenlm.Model(str(expand_path(load_path)))
        self.beam_size = beam_size

    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:
        """Choose the best candidate for every token

        Args:
            batch: batch of probabilities and string values of candidates for every token in a sentence

        Returns:
            batch of corrected tokenized sentences
        """
        return [self._infer_instance(candidates) for candidates in batch]

    def _infer_instance(self, candidates: List[List[Tuple[float, str]]]):
        """Run a beam search over the candidates of a single sentence.

        Args:
            candidates: for every token, a list of ``(score, candidate)`` pairs;
                a candidate may contain several space separated words

        Returns:
            the words of the highest scoring hypothesis
        """
        # The sentinel must contribute exactly one word, because the last word of
        # the winning hypothesis is dropped below.
        candidates = candidates + [[(0, self._EOS)]]
        state = kenlm.State()
        self.lm.BeginSentenceWrite(state)
        beam = [(0, state, [])]
        for sublist in candidates:
            new_beam = []
            for beam_score, beam_state, beam_words in beam:
                for score, candidate in sublist:
                    prev_state = beam_state
                    lm_score = 0
                    tokens = candidate.split()
                    for token in tokens:
                        next_state = kenlm.State()
                        lm_score += self.lm.BaseScore(prev_state, token, next_state)
                        prev_state = next_state
                    # prev_state is the hypothesis' own state even when the
                    # candidate produced no tokens at all
                    new_beam.append((beam_score + score + lm_score, prev_state, beam_words + tokens))
            new_beam.sort(reverse=True)
            beam = new_beam[:self.beam_size]
        _, _, words = beam[0]
        return words[:-1]
@register('top1_elector')
class TopOneElector(Component):
    """Component that picks, for every token, the candidate with the highest base probability."""

    def __init__(self, *args, **kwargs):
        pass

    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:
        """Select the best candidate for each token of each sentence.

        Args:
            batch: batch of probabilities and string values of candidates for every token in a sentence

        Returns:
            batch of corrected tokenized sentences
        """
        corrected = []
        for candidates in batch:
            sentence = []
            for sublist in candidates:
                # pairs are compared as tuples: by score, then by candidate string
                best = max(sublist)
                sentence.append(best[1])
            corrected.append(sentence)
        return corrected
class LevenshteinSearcher:
    """Finds dictionary words that are close to a query word with respect to a
    weighted Levenshtein-style distance.

    Args:
        alphabet: symbols allowed in words
        dictionary: a ``Trie`` or a collection of words to build one from
        operation_costs: costs of elementary transductions ``{up: {low: cost}}``;
            the default costs of ``SegmentTransducer`` are used when ``None``
        allow_spaces: whether a space may be inserted between dictionary words
        euristics: lookahead depth of the h-heuristic; ``0``, ``None``, ``'none'``
            or ``'None'`` disable it
    """

    def __init__(self, alphabet, dictionary, operation_costs=None,
                 allow_spaces=False, euristics='none'):
        self.alphabet = alphabet
        self.allow_spaces = allow_spaces
        if isinstance(euristics, int):
            if euristics < 0:
                raise ValueError("Euristics should be non-negative integer or None")
            else:
                self.euristics = euristics if euristics != 0 else None
        elif euristics in ["none", "None", None]:
            self.euristics = None
        else:
            raise ValueError("Euristics should be non-negative integer or None")
        if isinstance(dictionary, Trie):
            # the dictionary has already been passed as a trie
            self.dictionary = dictionary
        else:
            self.dictionary = make_trie(alphabet, dictionary, make_cashed=True,
                                        precompute_symbols=self.euristics,
                                        allow_spaces=self.allow_spaces)
        self.transducer = SegmentTransducer(
            alphabet, operation_costs=operation_costs, allow_spaces=allow_spaces)
        self._precompute_euristics()
        self._define_h_function()

    def __contains__(self, word):
        return word in self.dictionary

    def search(self, word, d, allow_spaces=True, return_cost=True):
        """Find all dictionary words within distance ``d`` from ``word``.

        Returns an empty list when ``word`` contains a symbol outside the
        alphabet (a space is accepted only if the searcher allows spaces).
        """
        if not all((c in self.alphabet
                    or (c == " " and self.allow_spaces)) for c in word):
            return []
        return self._trie_search(
            word, d, allow_spaces=allow_spaces, return_cost=return_cost)

    def _trie_search(self, word, d, transducer=None,
                     allow_spaces=True, return_cost=True):
        """Find all words in the trie whose distance to ``word`` under the given
        transducer does not exceed ``d``.

        Returns:
            ``[(word, cost)]`` sorted by cost if ``return_cost`` is true,
            otherwise the list of words in the same order
        """
        if transducer is None:
            transducer = self.transducer.inverse()
        allow_spaces &= self.allow_spaces
        trie = self.dictionary
        used_agenda_keys = set()
        # priority queue of partial results ordered by (cost, g, h)
        agenda = SortedListWithKey(key=(lambda x: x[1]))
        h = self.h_func(word, trie.root)
        key, value = ("", 0, trie.root), (0.0, 0.0, h)
        agenda.add((key, value))
        answer = dict()
        while len(agenda) > 0:
            key, value = agenda.pop(0)
            if key in used_agenda_keys:
                continue
            used_agenda_keys.add(key)
            low, pos, index = key
            # g is the cost so far, h is a lower bound for the remaining cost,
            # so the first element (g + h) bounds the total cost from below
            _, g, _ = value
            max_upperside_length = min(len(word) - pos, transducer.max_up_length)
            for upperside_length in range(max_upperside_length + 1):
                new_pos = pos + upperside_length
                curr_up = word[pos: new_pos]
                if curr_up not in transducer.operation_costs:
                    continue
                for curr_low, curr_cost in transducer.operation_costs[curr_up].items():
                    new_g = g + curr_cost
                    if new_g > d:  # no need to compute h when g alone exceeds d
                        continue
                    if curr_low == " ":
                        if allow_spaces and trie.is_final(index):
                            new_index = trie.root
                        else:
                            new_index = Trie.NO_NODE
                    else:
                        new_index = trie.descend(index, curr_low)
                    # compare by value: node indexes may be numpy integers,
                    # for which an identity check never succeeds
                    if new_index == Trie.NO_NODE:
                        continue
                    new_low = low + curr_low
                    new_h = self.h_func(word[new_pos:], new_index)
                    new_cost = new_g + new_h
                    if new_cost > d:
                        continue
                    new_key = (new_low, new_pos, new_index)
                    new_value = (new_cost, new_g, new_h)
                    if new_pos == len(word) and trie.is_final(new_index):
                        old_g = answer.get(new_low, None)
                        if old_g is None or new_g < old_g:
                            answer[new_low] = new_g
                    agenda.add((new_key, new_value))
        answer = sorted(answer.items(), key=(lambda x: x[1]))
        if return_cost:
            return answer
        else:
            return [elem[0] for elem in answer]

    def _precompute_euristics(self):
        """Precompute per-symbol operation costs and per-node absence costs used
        by the h-heuristic. Does nothing when the heuristic is disabled.
        """
        if self.euristics is None:
            return
        # minimal cost of an operation that removes / inserts a given symbol
        removal_costs = {a: np.inf for a in self.alphabet}
        insertion_costs = {a: np.inf for a in self.alphabet}
        if self.allow_spaces:
            removal_costs[' '] = np.inf
            insertion_costs[' '] = np.inf
        for up, costs in self.transducer.operation_costs.items():
            for low, cost in costs.items():
                if up == low:
                    continue
                # .get() is required: the operations may mention symbols (e.g. a
                # space in the default insertions) missing from the tables above
                if up != '':
                    removal_cost = cost / len(up)
                    for a in up:
                        removal_costs[a] = min(removal_costs.get(a, np.inf), removal_cost)
                if low != '':
                    insertion_cost = cost / len(low)
                    for a in low:
                        insertion_costs[a] = min(insertion_costs.get(a, np.inf), insertion_cost)
        # costs of a symbol being absent in the nodes of the trie
        self._absense_costs_by_node = _precompute_absense_costs(
            self.dictionary, removal_costs, insertion_costs,
            self.euristics, self.allow_spaces)
        # per-node cache of computed heuristic values
        self._temporary_euristics = [dict() for i in range(len(self.dictionary))]

    def _define_h_function(self):
        """Select the heuristic function: constant zero when disabled."""
        if self.euristics in [None, 0]:
            self.h_func = (lambda *x: 0.0)
        else:
            self.h_func = self._euristic_h_function

    def _euristic_h_function(self, suffix, index):
        """Compute the h-heuristic (Hulden, 2009) for a dictionary node.

        Args:
            suffix: the unread suffix of the input word
            index: index of the current node in the dictionary

        Returns:
            a lower bound for the cost of the replacements leading to the input
            word with the given suffix, provided the already read prefix has led
            to the node ``index``
        """
        if self.euristics > 0:
            suffix = suffix[:self.euristics]
        # results are cached per node and suffix
        index_temporary_euristics = self._temporary_euristics[index]
        cost = index_temporary_euristics.get(suffix, None)
        if cost is not None:
            return cost
        absense_costs = self._absense_costs_by_node[index]
        # costs[j] is the penalty estimate for a lookahead of j symbols
        costs = np.zeros(dtype=np.float64, shape=(self.euristics,))
        for i, a in enumerate(suffix):
            costs[i:] += absense_costs[a][i:]
        cost = max(costs)
        index_temporary_euristics[suffix] = cost
        return cost

    def _minimal_replacement_cost(self, first, second):
        """Return the smaller of the maximal numbers of symbols seen only in
        ``first`` and only in ``second`` while reading both strings in parallel.
        """
        first_symbols, second_symbols = set(), set()
        removal_cost, insertion_cost = 0, 0
        for a, b in itertools.zip_longest(first, second, fillvalue=None):
            if a is not None:
                first_symbols.add(a)
            if b is not None:
                second_symbols.add(b)
            removal_cost = max(removal_cost, len(first_symbols - second_symbols))
            insertion_cost = max(insertion_cost, len(second_symbols - first_symbols))
        return min(removal_cost, insertion_cost)


def _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,
                              allow_spaces=False):
    """Compute the minimal cost of a symbol appearing in the nodes of the dictionary.

    Args:
        dictionary: trie; ``dictionary.data[i]`` is indexed by lookahead position
            and holds the symbols reachable from node ``i`` at that position
        removal_costs: penalties for removing symbols
        insertion_costs: penalties for inserting symbols
        n: lookahead depth in the dictionary
        allow_spaces: whether a space is treated as an additional symbol

    Returns:
        a list of dicts of length ``len(dictionary.data)`` where ``answer[i][a][j]``
        is the minimal penalty for symbol ``a`` appearing at position ``j`` in node ``i``
    """
    answer = [dict() for node in dictionary.data]
    if n == 0:
        return answer
    curr_alphabet = copy.copy(dictionary.alphabet)
    if allow_spaces:
        curr_alphabet += [' ']
    for costs_in_node, node in zip(answer, dictionary.data):
        # minimal cost of removing one of the upcoming symbols
        curr_node_removal_costs = np.empty(dtype=np.float64, shape=(n,))
        if len(node[0]) > 0:
            curr_node_removal_costs[0] = min(removal_costs[symbol] for symbol in node[0])
            for j, symbols in enumerate(node[1:], 1):
                if len(symbols) == 0:
                    curr_node_removal_costs[j:] = curr_node_removal_costs[j - 1]
                    break
                curr_cost = min(removal_costs[symbol] for symbol in symbols)
                curr_node_removal_costs[j] = min(curr_node_removal_costs[j - 1], curr_cost)
        else:
            curr_node_removal_costs[:] = np.inf
        # minimal cost of inserting the symbol
        for a in curr_alphabet:
            curr_symbol_costs = np.empty(dtype=np.float64, shape=(n,))
            curr_symbol_costs.fill(insertion_costs[a])
            for j, symbols in enumerate(node):
                if a in symbols:
                    curr_symbol_costs[j:] = 0.0
                    break
                curr_symbol_costs[j] = min(curr_symbol_costs[j], curr_node_removal_costs[j])
            costs_in_node[a] = curr_symbol_costs
    return answer


class SegmentTransducer:
    """Weighted finite-state transducer performing replacements from a given
    list of operations.

    A transduction is a tuple of ``(up, low)`` pairs of strings.

    Args:
        alphabet: list of symbols
        operation_costs: dictionary ``{up: {low: cost}}`` or ``None`` to use unit
            costs for replacements, insertions, deletions and transpositions
        allow_spaces: whether the default operations involve a space
            (used only when ``operation_costs`` is ``None``)

    Raises:
        TypeError: if ``operation_costs`` is neither ``None`` nor a dictionary
    """

    def __init__(self, alphabet, operation_costs=None, allow_spaces=False):
        self.alphabet = alphabet
        if operation_costs is None:
            self._make_default_operation_costs(allow_spaces=allow_spaces)
        elif not isinstance(operation_costs, dict):
            raise TypeError("Operation costs must be a dictionary")
        else:
            self.operation_costs = operation_costs
        self._make_reversed_operation_costs()
        self._make_maximal_key_lengths()

    def get_operation_cost(self, up, low):
        """Return the cost of the elementary transduction ``up -> low``
        or ``np.inf`` if there is no such transduction.
        """
        up_costs = self.operation_costs.get(up, None)
        if up_costs is None:
            return np.inf
        cost = up_costs.get(low, np.inf)
        return cost

    def inverse(self):
        """Build the transducer performing the inverse transformation."""
        # TODO: simplify the inversion
        inversed_transducer = SegmentTransducer(self.alphabet, operation_costs=dict())
        inversed_transducer.operation_costs = self._reversed_operation_costs
        inversed_transducer._reversed_operation_costs = self.operation_costs
        inversed_transducer.max_low_length = self.max_up_length
        inversed_transducer.max_up_length = self.max_low_length
        inversed_transducer.max_low_lengths_by_up = self.max_up_lengths_by_low
        inversed_transducer.max_up_lengths_by_low = self.max_low_lengths_by_up
        return inversed_transducer

    def distance(self, first, second, return_transduction=False):
        """Compute the minimal cost of a transduction mapping ``first`` to ``second``.

        Args:
            first: upper string of the transduction
            second: lower string of the transduction
            return_transduction: whether to return transductions as well

        Returns:
            ``final_cost`` if ``return_transduction`` is false, otherwise a pair
            ``(final_cost, transductions)``; ``transductions`` is ``[None]`` when
            the cost is infinite
        """
        if return_transduction:
            add_pred = (lambda x, y: (y == np.inf or x < y))
        else:
            add_pred = (lambda x, y: (y == np.inf or x <= y))
        clear_pred = (lambda x, y: x < y < np.inf)
        update_func = lambda x, y: min(x, y)
        costs, backtraces = self._fill_levenshtein_table(first, second,
                                                         update_func, add_pred, clear_pred)
        final_cost = costs[-1][-1]
        if not return_transduction:
            return final_cost
        if final_cost == np.inf:
            transductions = [None]
        else:
            transductions = self._backtraces_to_transductions(first, second, backtraces,
                                                              final_cost, return_cost=False)
        return final_cost, transductions

    def transduce(self, first, second, threshold):
        """Return all transductions mapping ``first`` to ``second`` whose cost
        does not exceed ``threshold``.

        Returns:
            a list ``[(transduction, cost)]``
        """
        add_pred = (lambda x, y: x <= threshold)
        clear_pred = (lambda x, y: False)
        update_func = (lambda x, y: min(x, y))
        costs, backtraces = self._fill_levenshtein_table(first, second,
                                                         update_func, add_pred, clear_pred,
                                                         threshold=threshold)
        result = self._backtraces_to_transductions(first, second,
                                                   backtraces, threshold, return_cost=True)
        return result

    def lower_transductions(self, word, max_cost, return_cost=True):
        """Return all transductions with the upper string ``word`` whose cost
        does not exceed ``max_cost``.

        Returns:
            ``[(transduction, cost)]`` if ``return_cost`` is true, otherwise the
            list of transductions; both are sorted by increasing cost
        """
        prefixes = [[] for i in range(len(word) + 1)]
        prefixes[0].append(((), 0.0))
        for pos in range(len(prefixes)):
            # insertions before the next consumed symbol
            prefixes[pos] = self._perform_insertions(prefixes[pos], max_cost)
            max_upperside_length = min(len(word) - pos, self.max_up_length)
            for upperside_length in range(1, max_upperside_length + 1):
                up = word[pos: pos + upperside_length]
                for low, low_cost in self.operation_costs.get(up, dict()).items():
                    for transduction, cost in prefixes[pos]:
                        new_cost = cost + low_cost
                        if new_cost <= max_cost:
                            # a transduction is a tuple of (up, low) pairs
                            new_transduction = transduction + ((up, low),)
                            prefixes[pos + upperside_length].append((new_transduction, new_cost))
        answer = sorted(prefixes[-1], key=(lambda x: x[1]))
        if return_cost:
            return answer
        else:
            return [elem[0] for elem in answer]

    def lower(self, word, max_cost, return_cost=True):
        """Return all lower strings reachable from ``word`` with cost at most
        ``max_cost``, each with its minimal cost.

        Returns:
            ``[(low, cost)]`` sorted by cost if ``return_cost`` is true,
            otherwise the list of lower strings in the same order
        """
        transductions = self.lower_transductions(word, max_cost, return_cost=True)
        answer = dict()
        for transduction, cost in transductions:
            low = "".join(elem[1] for elem in transduction)
            curr_cost = answer.get(low, None)
            if curr_cost is None or cost < curr_cost:
                answer[low] = cost
        answer = sorted(answer.items(), key=(lambda x: x[1]))
        if return_cost:
            return answer
        else:
            return [elem[0] for elem in answer]

    def upper(self, word, max_cost, return_cost=True):
        """Same as :meth:`lower` for the inverse transducer."""
        inversed_transducer = self.inverse()
        return inversed_transducer.lower(word, max_cost, return_cost)

    def upper_transductions(self, word, max_cost, return_cost=True):
        """Same as :meth:`lower_transductions` for the inverse transducer."""
        inversed_transducer = self.inverse()
        return inversed_transducer.lower_transductions(word, max_cost, return_cost)

    def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pred,
                                threshold=None):
        """Dynamically fill the table of transduction costs.

        Args:
            first: upper string of the transduction
            second: lower string of the transduction
            update_func: ``update_func(x, y)`` returns the new value of a cell of
                ``costs`` given the candidate value ``x`` and the old value ``y``
            add_pred: ``add_pred(x, y)`` tells whether a back reference of cost
                ``x`` is added to a cell whose current cost is ``y``
            clear_pred: ``clear_pred(x, y)`` tells whether the back references of
                a cell with current cost ``y`` are dropped before adding the one
                of cost ``x``
            threshold: maximal allowed cost; if ``None``, twice the cost of
                mapping the symbols at equal positions into each other is used

        Returns:
            a pair ``(costs, backtraces)``; ``costs`` is a float array of shape
            ``(len(first) + 1, len(second) + 1)`` with ``costs[i][j]`` being the
            cost of mapping ``first[:i]`` to ``second[:j]``, ``backtraces[i][j]``
            is the list of cells ``(i', j')`` from which the cell was reached
        """
        m, n = len(first), len(second)
        if threshold is None:
            threshold = 0.0
            for a, b in zip(first, second):
                threshold += self.get_operation_cost(a, b)
            if m > n:
                for a in first[n:]:
                    threshold += self.get_operation_cost(a, '')
            elif m < n:
                for b in second[m:]:
                    threshold += self.get_operation_cost('', b)
            threshold *= 2
        costs = np.zeros(shape=(m + 1, n + 1), dtype=np.float64)
        costs[:] = np.inf
        backtraces = [None] * (m + 1)
        for i in range(m + 1):
            backtraces[i] = [[] for j in range(n + 1)]
        costs[0][0] = 0.0
        for i in range(m + 1):
            for i_right in range(i, min(i + self.max_up_length, m) + 1):
                up = first[i: i_right]
                max_low_length = self.max_low_lengths_by_up.get(up, -1)
                if max_low_length == -1:  # no up key in transduction
                    continue
                up_costs = self.operation_costs[up]
                for j in range(n + 1):
                    if costs[i][j] > threshold:
                        continue
                    if len(backtraces[i][j]) == 0 and i + j > 0:
                        continue  # the cell has not been reached
                    # an empty up requires a non-empty low to make progress
                    for j_right in range((j if i_right > i else j + 1),
                                         min(j + max_low_length, n) + 1):
                        low = second[j: j_right]
                        curr_cost = up_costs.get(low, np.inf)
                        old_cost = costs[i_right][j_right]
                        new_cost = costs[i][j] + curr_cost
                        if new_cost > threshold:
                            continue
                        if add_pred(new_cost, old_cost):
                            if clear_pred(new_cost, old_cost):
                                backtraces[i_right][j_right] = []
                            costs[i_right][j_right] = update_func(new_cost, old_cost)
                            backtraces[i_right][j_right].append((i, j))
        return costs, backtraces

    def _make_reversed_operation_costs(self):
        """Fill ``_reversed_operation_costs`` (``{low: {up: cost}}``) from
        ``operation_costs``.
        """
        _reversed_operation_costs = dict()
        for up, costs in self.operation_costs.items():
            for low, cost in costs.items():
                if low not in _reversed_operation_costs:
                    _reversed_operation_costs[low] = dict()
                _reversed_operation_costs[low][up] = cost
        self._reversed_operation_costs = _reversed_operation_costs

    def _make_maximal_key_lengths(self):
        """Compute the maximal lengths of ``up`` and ``low`` over all elementary
        transductions, and the maximal length of ``low`` for every ``up`` and of
        ``up`` for every ``low``; ``-1`` stands for "no transductions".
        """
        self.max_up_length = \
            (max(len(up) for up in self.operation_costs)
             if len(self.operation_costs) > 0 else -1)
        self.max_low_length = \
            (max(len(low) for low in self._reversed_operation_costs)
             if len(self._reversed_operation_costs) > 0 else -1)
        self.max_low_lengths_by_up, self.max_up_lengths_by_low = dict(), dict()
        for up, costs in self.operation_costs.items():
            self.max_low_lengths_by_up[up] = \
                max(len(low) for low in costs) if len(costs) > 0 else -1
        for low, costs in self._reversed_operation_costs.items():
            self.max_up_lengths_by_low[low] = \
                max(len(up) for up in costs) if len(costs) > 0 else -1

    def _backtraces_to_transductions(self, first, second, backtraces, threshold, return_cost=False):
        """Restore transductions from the table of back references.

        Args:
            first: upper string of the transduction
            second: lower string of the transduction
            backtraces: table of back references of shape
                ``(len(first) + 1, len(second) + 1)``
            threshold: only transductions of cost ``<= threshold`` are returned
            return_cost: whether to return the costs as well

        Returns:
            ``[(transduction, cost)]`` if ``return_cost`` is true, otherwise
            ``[transduction]``, for all transductions mapping ``first`` to
            ``second`` whose cost does not exceed ``threshold``
        """
        m, n = len(first), len(second)
        agenda = [None] * (m + 1)
        for i in range(m + 1):
            agenda[i] = [[] for j in range(n + 1)]
        agenda[m][n] = [((), 0.0)]
        for i_right in range(m, -1, -1):
            for j_right in range(n, -1, -1):
                current_agenda = agenda[i_right][j_right]
                if len(current_agenda) == 0:
                    continue
                for (i, j) in backtraces[i_right][j_right]:
                    up, low = first[i:i_right], second[j:j_right]
                    add_cost = self.operation_costs[up][low]
                    for elem, cost in current_agenda:
                        new_cost = cost + add_cost
                        if new_cost <= threshold:  # drop too expensive transductions
                            agenda[i][j].append((((up, low),) + elem, new_cost))
        if return_cost:
            return agenda[0][0]
        else:
            return [elem[0] for elem in agenda[0][0]]

    def _perform_insertions(self, initial, max_cost):
        """Extend transductions with insertions while the cost allows it.

        Args:
            initial: list of initial transductions ``[(transduction, cost)]``;
                it is not modified
            max_cost: maximal cost of a transduction

        Returns:
            a new list ``[(transduction, cost)]`` with the initial transductions
            followed by everything obtained from them by insertions with total
            cost ``<= max_cost``
        """
        insertion_costs = self.operation_costs.get("", {})
        final = list(initial)
        # `final` doubles as the FIFO queue: every appended item is visited later
        head = 0
        while head < len(final):
            transduction, cost = final[head]
            head += 1
            for string, string_cost in insertion_costs.items():
                new_cost = cost + string_cost
                if new_cost <= max_cost:
                    final.append((transduction + (("", string),), new_cost))
        return final

    def _make_default_operation_costs(self, allow_spaces=False):
        """Set 1.0 cost for every replacement, insertion, deletion and transposition."""
        self.operation_costs = dict()
        self.operation_costs[""] = {c: 1.0 for c in list(self.alphabet) + [' ']}
        for a in self.alphabet:
            current_costs = {c: 1.0 for c in self.alphabet}
            current_costs[a] = 0.0
            current_costs[""] = 1.0
            if allow_spaces:
                current_costs[" "] = 1.0
            self.operation_costs[a] = current_costs
        # transpositions
        for a, b in itertools.permutations(self.alphabet, 2):
            self.operation_costs[a + b] = {b + a: 1.0}
        # spaces
        if allow_spaces:
            self.operation_costs[" "] = {c: 1.0 for c in self.alphabet}
            self.operation_costs[" "][""] = 1.0
@register('spelling_levenshtein')
class LevenshteinSearcherComponent(Component):
    """Component proposing replacement candidates for tokens within a fixed
    Damerau-Levenshtein distance.

    Args:
        words: list of every correct word
        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates
        error_probability: assigned probability for every edit
        vocab_penalty: assigned probability of an out of vocabulary token being the correct one without changes

    Attributes:
        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates
        error_probability: assigned logarithmic probability for every edit
        vocab_penalty: assigned logarithmic probability of an out of vocabulary token being the correct one
            without changes
    """

    _punctuation = frozenset(string.punctuation)

    def __init__(self, words: Iterable[str], max_distance: int = 1, error_probability: float = 1e-4,
                 vocab_penalty: Optional[float] = None, **kwargs):
        # the vocabulary is deduplicated, stripped, lower-cased and has 'ё' folded into 'е'
        words = list({word.strip().lower().replace('ё', 'е') for word in words})
        alphabet = sorted({letter for word in words for letter in word})
        self.max_distance = max_distance
        self.error_probability = log10(error_probability)
        if vocab_penalty is None:
            self.vocab_penalty = self.error_probability
        else:
            self.vocab_penalty = log10(vocab_penalty)
        self.searcher = LevenshteinSearcher(alphabet, words, allow_spaces=True, euristics=2)

    def _infer_instance(self, tokens: Iterable[str]) -> List[List[Tuple[float, str]]]:
        """Build the list of ``(score, candidate)`` pairs for every token of a sentence."""
        candidates = []
        for word in tokens:
            if word in self._punctuation:
                # punctuation marks are passed through unchanged
                candidates.append([(0, word)])
                continue
            scored = {}
            for candidate, distance in self.searcher.search(word, d=self.max_distance):
                scored[candidate] = self.error_probability * distance
            # the token itself is always a candidate
            scored.setdefault(word, self.vocab_penalty)
            candidates.append([(score, candidate) for candidate, score in scored.items()])
        return candidates

    def __call__(self, batch: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:
        """Propose candidates for tokens in sentences

        Args:
            batch: batch of tokenized sentences

        Returns:
            batch of lists of probabilities and candidates for every token
        """
        result = []
        for tokens in batch:
            result.append(self._infer_instance(tokens))
        return result
class Trie:
    """
    Prefix trie (more precisely, a rooted directed acyclic graph) over a fixed alphabet.

    Attributes
    ----------
    alphabet: list, the alphabet
    alphabet_codes: dict, mapping symbol -> code
    compressed: bool, compression flag
    cashed: bool, flag telling whether calls to ``descend`` are cached
    root: int, index of the root node
    graph: array, type=int, shape=(number of nodes, alphabet size), matrix of children:
        graph[i][j] = k <-> node k is the child of node i along the edge labelled alphabet[j]
    data: array, type=object, shape=(number of nodes), data stored in the nodes
    final: array, type=bool, shape=(number of nodes), finality flags:
        final[i] = True <-> i is a final node
    """
    NO_NODE = -1
    SPACE_CODE = -1

    ATTRS = ['is_numpied', 'precompute_symbols', 'allow_spaces', 'is_terminated', 'to_make_cashed']

    def __init__(self, alphabet, make_sorted=True, make_alphabet_codes=True,
                 is_numpied=False, to_make_cashed=False,
                 precompute_symbols=None, allow_spaces=False, dict_storage=False):
        self.alphabet = sorted(alphabet) if make_sorted else alphabet
        self.alphabet_codes = ({a: i for i, a in enumerate(self.alphabet)}
                               if make_alphabet_codes else self.alphabet)
        # the space symbol always gets its own reserved code
        self.alphabet_codes[" "] = Trie.SPACE_CODE
        self.is_numpied = is_numpied
        self.to_make_cashed = to_make_cashed
        self.dict_storage = dict_storage
        self.precompute_symbols = precompute_symbols
        self.allow_spaces = allow_spaces
        self.initialize()

    def initialize(self):
        """Reset the trie to a single non-final root node."""
        self.root = 0
        self.graph = [self._make_default_node()]
        self.data, self.final = [None], [False]
        self.nodes_number = 1
        self.descend = self._descend_simple
        self.is_terminated = False

    def _make_default_node(self):
        """Create the children container of a fresh node according to the storage mode."""
        if self.dict_storage:
            return defaultdict(lambda: -1)
        elif self.is_numpied:
            return np.full(shape=(len(self.alphabet),),
                           fill_value=Trie.NO_NODE, dtype=int)
        else:
            return [Trie.NO_NODE] * len(self.alphabet)

    def save(self, outfile):
        """
        Save the trie to the text file ``outfile`` for later use
        """
        with open(outfile, "w", encoding="utf8") as fout:
            attr_values = [getattr(self, attr) for attr in Trie.ATTRS]
            # the extra last flag tells whether any node carries data
            attr_values.append(any(x is not None for x in self.data))
            fout.write("{}\n{}\t{}\n".format(
                " ".join("T" if x else "F" for x in attr_values),
                self.nodes_number, self.root))
            fout.write(" ".join(str(a) for a in self.alphabet) + "\n")
            for index, label in enumerate(self.final):
                letters = self._get_letters(index, return_indexes=True)
                children = self._get_children(index)
                fout.write("{}\t{}\n".format(
                    "T" if label else "F",
                    " ".join("{}:{}".format(*elem) for elem in zip(letters, children))))
            if self.precompute_symbols is not None:
                for elem in self.data:
                    fout.write(":".join(",".join(
                        map(str, symbols)) for symbols in elem) + "\n")

    def make_cashed(self):
        """
        Switch ``descend`` to the variant that caches its results
        """
        self._descendance_cash = [dict() for _ in self.graph]
        self.descend = self._descend_cashed

    def make_numpied(self):
        """Convert ``graph`` and ``final`` to numpy arrays."""
        self.graph = np.array(self.graph)
        self.final = np.asarray(self.final, dtype=bool)
        self.is_numpied = True

    def add(self, s):
        """
        Add the string s to the trie and return the trie itself
        """
        if self.is_terminated:
            raise TypeError("Impossible to add string to fitted trie")
        if s == "":
            self._set_final(self.root)
            # return self on this path too, as every other path does
            return self
        curr = self.root
        for i, a in enumerate(s):
            code = self.alphabet_codes[a]
            child = self.graph[curr][code]
            if child == Trie.NO_NODE:
                # the rest of the string is new: create a fresh chain of nodes for it
                curr = self._add_descendant(curr, s[i:])
                break
            else:
                curr = child
        self._set_final(curr)
        return self

    def fit(self, words):
        """Add every string from ``words`` and finish the construction."""
        for s in words:
            self.add(s)
        self.terminate()

    def terminate(self):
        """Finish the construction: optional numpy conversion, symbol precomputation and caching."""
        if self.is_numpied:
            self.make_numpied()
        # NOTE(review): this writes ``terminated``, not ``is_terminated`` (the flag read by ``add``
        # and listed in ``ATTRS``). It is kept as is on purpose: ``precompute_future_symbols`` below
        # returns early when ``is_terminated`` is set, and ``TrieMinimizer.minimize`` switches to
        # another code path, so the flag cannot be corrected in this class alone.
        self.terminated = True
        if self.precompute_symbols is not None:
            precompute_future_symbols(self, self.precompute_symbols,
                                      allow_spaces=self.allow_spaces)
        if self.to_make_cashed:
            self.make_cashed()

    def __contains__(self, s):
        if any(a not in self.alphabet for a in s):
            return False
        node = self.descend(self.root, s)
        return (node != Trie.NO_NODE) and self.is_final(node)

    def words(self):
        """
        Return an iterator over the words stored in the trie (depth-first, in edge order)
        """
        branch, word, indexes = [self.root], [], [0]
        letters_with_children = [self._get_children_and_letters(self.root)]
        while len(branch) > 0:
            if self.is_final(branch[-1]):
                yield "".join(word)
            # climb up while every outgoing edge of the current node has been explored
            while indexes[-1] == len(letters_with_children[-1]):
                indexes.pop()
                letters_with_children.pop()
                branch.pop()
                if len(indexes) == 0:
                    # the root is exhausted; a plain return ends the generator
                    # (raising StopIteration here becomes RuntimeError under PEP 479)
                    return
                word.pop()
            next_letter, next_child = letters_with_children[-1][indexes[-1]]
            indexes[-1] += 1
            indexes.append(0)
            word.append(next_letter)
            branch.append(next_child)
            letters_with_children.append(self._get_children_and_letters(branch[-1]))

    def is_final(self, index):
        """
        Arguments
        ---------
        index: int, node index

        Returns
        -------
        True: if index is the index of a final node
        """
        return self.final[index]

    def find_partitions(self, s, max_count=1):
        """
        Find all partitions s = s_1 ... s_m into dictionary words s_1, ..., s_m for m <= max_count
        """
        # every agenda item is (current node, borders of the words found so far, number of words)
        curr_agenda = [(self.root, [], 0)]
        for i, a in enumerate(s):
            next_agenda = []
            for curr, borders, cost in curr_agenda:
                if cost >= max_count:
                    continue
                child = self.graph[curr][self.alphabet_codes[a]]
                if child == Trie.NO_NODE:
                    continue
                next_agenda.append((child, borders, cost))
                if self.is_final(child):
                    # a word ends here: restart from the root with one more border
                    next_agenda.append((self.root, borders + [i + 1], cost + 1))
            curr_agenda = next_agenda
        answer = []
        for curr, borders, cost in curr_agenda:
            if curr == self.root:
                borders = [0] + borders
                answer.append([s[left:borders[i + 1]] for i, left in enumerate(borders[:-1])])
        return answer

    def __len__(self):
        return self.nodes_number

    def __repr__(self):
        parts = []
        for i, (final, data) in enumerate(zip(self.final, self.data)):
            letters, children = self._get_letters(i), self._get_children(i)
            parts.append("{0}".format(i))
            if final:
                parts.append("F")
            for a, index in zip(letters, children):
                parts.append(" {0}:{1}".format(a, index))
            parts.append("\n")
            if data is not None:
                parts.append("data:{0} {1}\n".format(len(data), " ".join(str(elem) for elem in data)))
        return "".join(parts)

    def _add_descendant(self, parent, s, final=False):
        """Append a chain of new nodes spelling s below ``parent`` and return the last one."""
        for a in s:
            code = self.alphabet_codes[a]
            parent = self._add_empty_child(parent, code, final)
        return parent

    def _add_empty_child(self, parent, code, final=False):
        """
        Add a child to the node ``parent`` along the symbol with the code ``code``
        """
        self.graph[parent][code] = self.nodes_number
        self.graph.append(self._make_default_node())
        self.data.append(None)
        self.final.append(final)
        self.nodes_number += 1
        return (self.nodes_number - 1)

    def _descend_simple(self, curr, s):
        """
        Descend from the node curr along the string s
        """
        for a in s:
            curr = self.graph[curr][self.alphabet_codes[a]]
            if curr == Trie.NO_NODE:
                break
        return curr

    def _descend_cashed(self, curr, s):
        """
        Descend from the node curr along the string s, caching the result
        """
        if s == "":
            return curr
        curr_cash = self._descendance_cash[curr]
        answer = curr_cash.get(s, None)
        if answer is not None:
            return answer
        # the descent loop is duplicated here on purpose, for speed
        res = curr
        for a in s:
            res = self.graph[res][self.alphabet_codes[a]]
            if res == Trie.NO_NODE:
                break
        curr_cash[s] = res
        return res

    def _set_final(self, curr):
        """
        Mark the state curr as final
        """
        self.final[curr] = True

    def _get_letters(self, index, return_indexes=False):
        """
        Extract the labels of all outgoing edges of the node number index
        """
        if self.dict_storage:
            # lookups of absent edges leave NO_NODE placeholders in the defaultdict; skip them
            answer = [code for code, child in self.graph[index].items() if child != Trie.NO_NODE]
        else:
            answer = [i for i, elem in enumerate(self.graph[index]) if elem != Trie.NO_NODE]
        if not return_indexes:
            answer = [(self.alphabet[i] if i >= 0 else " ") for i in answer]
        return answer

    def _get_children_and_letters(self, index, return_indexes=False):
        """
        Extract (edge label, child) pairs for all outgoing edges of the node number index
        """
        if self.dict_storage:
            edges = self.graph[index].items()
        else:
            edges = enumerate(self.graph[index])
        answer = [(code, child) for code, child in edges if child != Trie.NO_NODE]
        if not return_indexes:
            # a negative code is the space symbol, exactly as in _get_letters
            answer = [((self.alphabet[code] if code >= 0 else " "), child) for code, child in answer]
        return answer

    def _get_children(self, index):
        """
        Extract all children of the node number index
        """
        if self.dict_storage:
            # keep the result aligned with _get_letters: placeholders are skipped in both
            return [child for child in self.graph[index].values() if child != Trie.NO_NODE]
        else:
            return [elem for elem in self.graph[index] if elem != Trie.NO_NODE]
def generate_postorder(self, trie):
    """
    Return the node indices reachable from the root in post-order (reverse topological order)

    Every node is appended to the result only after all of its children, which is the order
    ``minimize`` needs to compute equivalence classes bottom-up.
    """
    order, stack = [], [trie.root]
    colors = ['white'] * len(trie)
    while len(stack) > 0:
        index = stack[-1]
        color = colors[index]
        if color == 'white':  # the node has not been processed yet
            colors[index] = 'grey'
            for child in trie._get_children(index):
                # push only the children that have not been visited before
                if child != Trie.NO_NODE and colors[child] == 'white':
                    stack.append(child)
        else:
            if color == 'grey':
                # all children are done, so the node itself can be emitted
                colors[index] = 'black'
                order.append(index)
            # pop in place: re-slicing the list copied the whole stack on every step
            stack.pop()
    return order
def make_trie(alphabet, words, compressed=True, is_numpied=False,
              make_cashed=False, precompute_symbols=False,
              allow_spaces=False, dict_storage=False):
    """
    Build a trie over ``alphabet`` containing ``words``

    Arguments
    ---------
    alphabet: iterable of symbols the words consist of
    words: iterable of strings to store
    compressed: bool, whether to minimize the fitted trie with ``TrieMinimizer``
    is_numpied: bool, whether to keep the graph as numpy arrays
    make_cashed: bool, whether to cache calls to ``descend``
    precompute_symbols: maximal length of the continuations precomputed for every node
    allow_spaces: bool, whether a space may follow a final node when precomputing symbols
    dict_storage: bool, whether to keep the children of a node in a dict

    Returns
    -------
    the fitted (and, if requested, minimized) trie
    """
    # allow_spaces has to reach the Trie itself: Trie.terminate passes its own
    # allow_spaces attribute on when precomputing symbols, so without it the
    # argument was silently ignored for uncompressed tries
    trie = Trie(alphabet, is_numpied=is_numpied, to_make_cashed=make_cashed,
                precompute_symbols=precompute_symbols, allow_spaces=allow_spaces,
                dict_storage=dict_storage)
    trie.fit(words)
    if compressed:
        tm = TrieMinimizer()
        trie = tm.minimize(trie, dict_storage=dict_storage, make_cashed=make_cashed,
                           make_numpied=is_numpied, precompute_symbols=precompute_symbols,
                           allow_spaces=allow_spaces)
    return trie
@register('lazy_tokenizer')
def lazy_tokenizer(batch):
    """Tokenize the batch with ``word_tokenize`` when its first element is a string.

    An empty batch, or a batch whose first element is not a string, is returned unchanged.
    """
    nothing_to_tokenize = len(batch) == 0 or not isinstance(batch[0], str)
    if nothing_to_tokenize:
        return batch
    return [word_tokenize(utterance) for utterance in batch]
@register("nltk_moses_tokenizer")
class NLTKMosesTokenizer(Component):
    """Class for splitting texts on tokens (and joining tokens back) using the Moses tokenizer from ``sacremoses``

    Attributes:
        escape: whether escape characters for use in html markup
        tokenizer: ``sacremoses.MosesTokenizer`` instance
        detokenizer: ``sacremoses.MosesDetokenizer`` instance

    Args:
        escape: whether escape characters for use in html markup
    """

    def __init__(self, escape: bool = False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize given batch of strings or detokenize given batch of lists of tokens

        The direction is chosen by the type of the first element of the batch.

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples; an empty list for an empty batch
        """
        # an empty batch has no first element to inspect and nothing to process
        if len(batch) == 0:
            return []
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]
@register("nltk_tokenizer")
class NLTKTokenizer(Component):
    """Component that splits texts into tokens with a function taken from ``nltk.tokenize``

    Args:
        tokenizer: name of the tokenization function to look up in ``nltk.tokenize``
        download: whether to call ``nltk.download()`` before the lookup

    Attributes:
        tokenizer: the callable found in ``nltk.tokenize`` under the requested name
    """

    def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
                 *args, **kwargs):
        if download:
            nltk.download()
        resolved = getattr(nltk.tokenize, tokenizer, None)
        self.tokenizer = resolved
        if not callable(resolved):
            raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))

    def __call__(self, batch: List[str]) -> List[List[str]]:
        """Tokenize given batch

        Args:
            batch: list of text samples

        Returns:
            list of lists of tokens
        """
        tokenize = self.tokenizer
        return [tokenize(text) for text in batch]
def _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):
    """Load a spaCy model by name, falling back to importing it as a module.

    ``spacy.load`` is tried first. If it raises ``OSError``, a module called ``model_name`` is
    imported and its ``load`` function is used instead. When the fallback fails for any reason,
    or yields something that is not a ``spacy.language.Language``, the original ``OSError`` is
    re-raised.

    Args:
        model_name: name of the spaCy model (or of the module providing it)
        disable: names of the pipeline elements to disable

    Returns:
        the loaded spaCy model
    """
    disabled = set(disable)
    try:
        return spacy.load(model_name, disable=disabled)
    except OSError as load_error:
        try:
            candidate = __import__(model_name).load(disable=disabled)
            if not isinstance(candidate, spacy.language.Language):
                raise RuntimeError(f'{model_name} is not a spacy model module')
        except Exception:
            # report the failure of the primary loading path, not of the fallback
            raise load_error
        return candidate
def __init__(self, disable: Optional[Iterable[str]] = None, filter_stopwords: bool = False,
             batch_size: Optional[int] = None, ngram_range: Optional[List[int]] = None,
             lemmas: bool = False, lowercase: Optional[bool] = None, alphas_only: Optional[bool] = None,
             spacy_model: str = 'en_core_web_sm', **kwargs):
    """Load the spaCy model and remember the tokenization settings."""
    # defaults are created per call, so no mutable object is shared between instances
    pipes_to_skip = ['parser', 'ner'] if disable is None else disable
    requested_range = [1, 1] if ngram_range is None else ngram_range

    self.model = _try_load_spacy_model(spacy_model, disable=pipes_to_skip)
    if filter_stopwords:
        self.stopwords = self.model.Defaults.stop_words
    else:
        self.stopwords = set()
    self.batch_size = batch_size
    self.ngram_range = tuple(requested_range)  # cast JSON array to tuple
    self.lemmas = lemmas
    self.lowercase = lowercase
    self.alphas_only = alphas_only
def _lemmatize(self, data: List[str], ngram_range: Optional[Tuple[int, int]] = None, batch_size: int = 10000,
               lowercase: bool = True) -> Generator[List[str], Any, None]:
    """Lemmatize a list of documents.

    Args:
        data: a list of documents to lemmatize
        ngram_range: size of ngrams to create; the instance-level ``ngram_range`` is used when omitted
        batch_size: a batch size for spaCy buffering; used only when the instance has no ``batch_size``
        lowercase: whether to lowercase the lemmas; used only when the instance-level
            ``lowercase`` is ``None``

    Yields:
        a list of ngramized lemmas for every document

    Returns:
        None
    """
    effective_batch_size = self.batch_size or batch_size
    effective_ngram_range = ngram_range or self.ngram_range
    to_lower = lowercase if self.lowercase is None else self.lowercase

    for parsed in self.model.pipe(data, batch_size=effective_batch_size):
        if to_lower:
            lemmas = [token.lemma_.lower() for token in parsed]
        else:
            lemmas = [token.lemma_ for token in parsed]
        # ngrams are checked against the unfiltered lemmatized text
        lemmatized_text = " ".join(lemmas)
        kept = self._filter(lemmas)
        yield from ngramize(kept, ngram_range=effective_ngram_range, doc=lemmatized_text)
@register("split_tokenizer")
class SplitTokenizer(Component):
    """
    Splits every utterance into tokens with plain python ``str.split()``.

    Takes no parameters.
    """

    def __init__(self, **kwargs) -> None:
        pass

    def __call__(self, batch: List[str]) -> List[List[str]]:
        """
        Tokenize given batch

        Args:
            batch: list (or tuple) of texts to tokenize

        Returns:
            tokenized batch

        Raises:
            NotImplementedError: if ``batch`` is neither a list nor a tuple
        """
        if not isinstance(batch, (list, tuple)):
            raise NotImplementedError('not implemented for types other than'
                                      ' list or tuple')
        return [text.split() for text in batch]
def ngramize(items: List[str], ngram_range=(1, 1), doc: str = None) -> Generator[List[str], Any, None]:
    """
    Make ngrams from a list of tokens/lemmas

    :param items: list of tokens, lemmas or other strings to form ngrams
    :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to
        (1, 2), for bigrams only should be set to (2, 2)
    :param doc: optional source text; when given, only the ngrams that occur in it (as is or in
        its lowercased version) are kept
    :return: generator yielding a single list with all ngrams (as strings)
    """
    shortest, longest = ngram_range[0], ngram_range[1]
    joined = []
    for size in range(shortest, longest + 1):
        # zipping the list with its own shifted copies gives every window of `size` items
        shifted = [items[offset:] for offset in range(size)]
        joined.extend(' '.join(window) for window in zip(*shifted))
    if doc is not None:
        lowered = doc.lower()
        joined = [gram for gram in joined if gram in doc or gram in lowered]
    yield joined
def forward(self, tags_batch: torch.LongTensor, y_masks: np.ndarray):
    """Accumulate tag bigram statistics and rebuild the transition matrix from them.

    For every sequence, each pair of neighbouring tags within its masked length is counted
    in ``self.stats``. ``self.transitions`` is then overwritten with the entry of
    ``self.zeros`` for every tag pair seen so far and the entry of ``self.neg`` for the rest.

    Args:
        tags_batch: tag indices for every sequence of the batch
        y_masks: mask whose row sums give the sequence lengths
    """
    lengths = np.sum(y_masks, axis=1)
    for length, tags in zip(lengths, tags_batch):
        if not length > 1:
            # a sequence shorter than two tags contains no transitions
            continue
        for position in range(length - 1):
            prev_tag = int(tags[position])
            next_tag = int(tags[position + 1])
            self.stats[prev_tag][next_tag] += 1.0
    with torch.no_grad():
        seen = self.stats > 0
        self.transitions.copy_(torch.where(seen, self.zeros, self.neg))
class FocalLoss(nn.Module):
    """Non weighted version of Focal Loss.

    The element-wise base loss is ``CrossEntropyLoss`` when ``categorical_loss`` is set and
    ``BCEWithLogitsLoss`` otherwise. Each element is scaled by ``(1 - exp(-loss)) ** gamma``
    and by a balancing factor looked up in the two-element table ``[alpha, 1 - alpha]``
    using the target cast to an integer index, so targets are expected to be 0 or 1.

    Args:
        alpha: balancing factor used for target 0; ``1 - alpha`` is used for target 1
        gamma: exponent of the modulating term
        categorical_loss: use cross entropy instead of binary cross entropy with logits
        weight: optional weight tensor forwarded to the base loss
    """

    def __init__(self, alpha=.5, gamma=2, categorical_loss=False, weight=None):
        super(FocalLoss, self).__init__()
        # Created without a forced device: forward() moves it next to the loss, so the
        # module can be built and used on machines without CUDA.
        self.alpha = torch.tensor([alpha, 1 - alpha])
        self.gamma = gamma
        self.categorical = categorical_loss
        self.weight = weight

    def forward(self, inputs, targets):
        """Return the mean focal loss of ``inputs`` (logits) against ``targets``."""
        if self.categorical:
            loss = CrossEntropyLoss(weight=self.weight, reduction='none')(inputs, targets)
        else:
            loss = BCEWithLogitsLoss(weight=self.weight, reduction='none')(inputs, targets)
        targets = targets.type(torch.long)
        alpha = self.alpha.to(loss.device)
        # Give the balancing factors the shape of the loss; a flat vector multiplied with
        # an (N, 1) loss would silently broadcast to an N x N matrix.
        at = alpha.gather(0, targets.data.view(-1)).reshape(loss.shape)
        pt = torch.exp(-loss)
        F_loss = at * (1 - pt) ** self.gamma * loss
        return F_loss.mean()


def SoftCrossEntropyLoss(inputs, targets):
    """Cross entropy between logits and soft targets.

    Log-softmax is taken over dimension 1 of ``inputs``; the result is the negated sum of
    ``targets * log_probs`` divided by the size of the first dimension of ``inputs``.
    """
    logprobs = torch.nn.functional.log_softmax(inputs, dim=1)
    return -(targets * logprobs).sum() / inputs.shape[0]


def we_transform_input(name):
    """Tell whether inputs of task type ``name`` are flattened to 2D before use.

    True for 'sequence_labeling' and 'multiple_choice', False for any other value.
    """
    return name in ['sequence_labeling', 'multiple_choice']
def __init__(self, tasks_num_classes, multilabel, task_types, weights, backbone_model='bert_base_uncased',
             dropout=None, new_model=False, focal=False, max_seq_len=320, model_takes_token_type_ids=True):
    """Load the backbone and attach one linear output head per task.

    Args:
        tasks_num_classes: per-task number of classes; sets the output width of each head
        multilabel: per-task multilabel flags, stored for later use
        task_types: per-task type names; 'multiple_choice', 'regression' and 'binary_head'
            tasks get a single-output head
        weights: per-task loss weights, stored for later use
        backbone_model: name or path given to ``AutoConfig`` / ``AutoModel``
        dropout: dropout probability; when None the first of the config attributes
            ``hidden_dropout_prob``, ``seq_classif_dropout``, ``dropout`` that exists is
            used, falling back to 0
        new_model: when truthy the projection layer is attached as ``bert.pooling_layer``
            instead of ``bert.pooler``; any truthy value other than 2 also doubles the
            input width of the heads
        focal: stored for later use
        max_seq_len: stored for later use
        model_takes_token_type_ids: stored for later use
    """
    super(BertForMultiTask, self).__init__()
    config = AutoConfig.from_pretrained(backbone_model, output_hidden_states=True, output_attentions=True)
    self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path=backbone_model, config=config)

    self.classes = tasks_num_classes  # classes for every task
    self.weights = weights
    self.multilabel = multilabel
    self.new_model = new_model
    self.model_takes_token_type_ids = model_takes_token_type_ids

    if dropout is not None:
        dropout_rate = dropout
    else:
        config_attrs = ('hidden_dropout_prob', 'seq_classif_dropout', 'dropout')
        dropout_rate = next((getattr(config, attr) for attr in config_attrs if hasattr(config, attr)), 0)
    self.dropout = nn.Dropout(dropout_rate)

    self.max_seq_len = max_seq_len
    self.activation = nn.Tanh()
    self.task_types = task_types
    self.focal = focal

    head_width = config.hidden_size
    if self.new_model and self.new_model != 2:
        head_width = head_width * 2

    single_output_types = ('multiple_choice', 'regression', 'binary_head')
    heads = []
    for idx, num_labels in enumerate(self.classes):
        out_features = 1 if self.task_types[idx] in single_output_types else num_labels
        heads.append(nn.Linear(head_width, out_features))
    self.bert.final_classifier = nn.ModuleList(heads)

    # Created after the heads so that parameter initialisation order is unchanged.
    projection = nn.Linear(head_width, head_width)
    if self.new_model:
        self.bert.pooling_layer = projection
    else:
        self.bert.pooler = projection
def predict_on_top(self, task_id, last_hidden_state, labels=None):
    """Apply the head of task ``task_id`` to the backbone output and optionally compute the loss.

    Args:
        task_id: index of the task in ``self.task_types``
        last_hidden_state: backbone output for the batch
        labels: optional targets; when given, a loss is computed

    Returns:
        ``logits`` when ``labels`` is None, otherwise the tuple ``(loss, logits)``

    Raises:
        Exception: if the task type is not supported, or if for a 'multiple_choice' task
            the number of logits rows differs from the number of labels
    """
    name = self.task_types[task_id]

    def loss_weight(reference):
        """Return the task weight as a tensor matching ``reference``, or None if unset."""
        raw_weight = self.weights[task_id]
        if raw_weight is None:
            return None
        # Built on the device/dtype of the logits instead of a hard-coded .cuda(),
        # so weighted losses also work on CPU-only hosts.
        return torch.tensor([raw_weight], dtype=reference.dtype, device=reference.device)

    if name == 'sequence_labeling':
        # last hidden state is all token tensor
        final_output = self.dropout(last_hidden_state)
        logits = self.bert.final_classifier[task_id](final_output)
        if labels is None:
            return logits
        active_logits = logits.view(-1, self.classes[task_id])
        if self.multilabel[task_id]:
            loss = BCEWithLogitsLoss()(active_logits, labels)
        else:
            loss = CrossEntropyLoss()(active_logits, labels.view(-1))
        return loss, logits

    elif name in ['classification', 'regression', 'multiple_choice']:
        # last hidden state is a first token tensor
        if self.new_model:
            pooled_output = self.bert.pooling_layer(last_hidden_state)
        else:
            pooled_output = self.bert.pooler(last_hidden_state)
        pooled_output = self.activation(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.bert.final_classifier[task_id](pooled_output)
        if name == 'multiple_choice':
            # One score per choice was produced; regroup them per example.
            logits = logits.view((-1, self.classes[task_id]))
            if labels is not None:
                l1, l2 = len(logits), len(labels)
                if l1 != l2:
                    raise Exception(f'Len of logits {l1} and labels {l2} not match')
        if labels is None:
            return logits
        if name == "regression":
            loss = MSELoss()(logits, labels.unsqueeze(1))
        elif self.multilabel[task_id]:
            loss = BCEWithLogitsLoss()(logits, labels)
        else:
            weight = loss_weight(logits)
            if self.focal:
                # NOTE(review): FocalLoss defaults to its BCE formulation; confirm it
                # accepts class-index labels of this shape.
                loss_fct = FocalLoss() if weight is None else FocalLoss(weight=weight)
            else:
                loss_fct = CrossEntropyLoss(weight=weight)
            loss = loss_fct(logits, labels.view(-1))
        return loss, logits

    elif name == 'binary_head':
        # NOTE(review): this branch always uses `bert.pooler`, also when `new_model`
        # is set and the projection lives in `bert.pooling_layer` - confirm intended.
        last_hidden_state = self.dropout(last_hidden_state)
        pooled_output = self.bert.pooler(last_hidden_state)
        pooled_output = self.activation(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.bert.final_classifier[task_id](pooled_output)
        if labels is None:
            return logits
        weight = loss_weight(logits)
        if self.focal:
            loss_fct = FocalLoss() if weight is None else FocalLoss(weight=weight)
        else:
            loss_fct = BCEWithLogitsLoss(weight=weight)
        if len(labels.shape) == 1 and len(logits.shape) == 2:
            # The head emits a column; give the labels the same layout.
            labels = labels.unsqueeze(1)
        loss = loss_fct(logits, labels)
        return loss, logits

    else:
        raise Exception(f'Unsupported name {name}')
def __init__(
        self,
        tasks: Dict[str, Dict],
        max_seq_len: int = 320,
        gradient_accumulation_steps: Optional[int] = 1,
        steps_per_epoch: Optional[int] = None,
        backbone_model: str = "bert-base-cased",
        focal: bool = False,
        return_probas: bool = False,
        freeze_embeddings: bool = False,
        new_model=False,
        dropout: Optional[float] = None,
        binary_threshold: float = 0.5,
        seed: int = 42,
        *args,
        **kwargs,
) -> None:
    """Collect the per-task settings, seed torch and build the multitask network.

    For every entry of ``tasks`` the keys 'type' (required), 'options' (default 1),
    'weight' (default None), 'multilabel' (default False) and 'type_to_cache'
    (default -1) are read. Remaining keyword arguments go to the parent constructor.
    """
    task_settings = list(tasks.values())

    self.return_probas = return_probas
    self.max_seq_len = max_seq_len
    self.task_names = list(tasks)
    self.tasks_num_classes = [settings.get('options', 1) for settings in task_settings]
    weights = [settings.get('weight', None) for settings in task_settings]
    self.task_types = [settings['type'] for settings in task_settings]
    self.multilabel = [settings.get('multilabel', False) for settings in task_settings]
    self.types_to_cache = [settings.get('type_to_cache', -1) for settings in task_settings]

    if self.return_probas and 'sequence_labeling' in self.task_types:
        log.warning(f'Return_probas for sequence_labeling not supported yet. Returning ids for this task')

    self.n_tasks = len(tasks)
    self.train_losses = [[] for _ in self.task_names]
    self.gradient_accumulation_steps = gradient_accumulation_steps
    self.steps_per_epoch = steps_per_epoch
    self.steps_taken = 0
    self.prev_id = None
    self.printed = False
    self.freeze_embeddings = freeze_embeddings
    self.binary_threshold = binary_threshold
    self._reset_cache()

    # Seed before the network is created so that head initialisation is reproducible.
    torch.manual_seed(seed)
    network = BertForMultiTask(
        backbone_model=backbone_model,
        tasks_num_classes=self.tasks_num_classes,
        weights=weights,
        multilabel=self.multilabel,
        task_types=self.task_types,
        new_model=new_model,
        focal=focal,
        dropout=dropout)
    super().__init__(network, **kwargs)
def __call__(self, *args):
    """Make prediction for given features (texts).

    Args:
        features: batch of InputFeatures for all tasks, one positional argument per task;
            tasks whose argument is empty are skipped

    Returns:
        predicted classes or probabilities of each class. When a single argument is
        passed, the predictions of the first task are returned directly; otherwise a
        list with one entry per argument, where skipped tasks yield an empty list.
    """
    # IMPROVE ARGS CHECKING AFTER DEBUG
    log.debug(f'Calling {args}')
    self.validation_predictions = [None for _ in range(len(args))]
    try:
        for task_id in range(len(self.task_names)):
            if not len(args[task_id]):
                continue
            _input, batch_input_size = self._make_input(task_features=args[task_id], task_id=task_id)
            if 'input_ids' not in _input:
                raise Exception(f'No input_ids in _input {_input}')

            network = self.model.module if self.is_data_parallel else self.model
            cache_key = self.types_to_cache[task_id]
            if cache_key != -1 and self.preds_cache[cache_key] is not None:
                # Another task with the same cache key already encoded this batch.
                last_hidden_state = self.preds_cache[cache_key]
            else:
                with torch.no_grad():
                    last_hidden_state = network.get_logits(task_id, **_input)
                if cache_key != -1:
                    self.preds_cache[cache_key] = last_hidden_state
            with torch.no_grad():
                logits = network.predict_on_top(task_id, last_hidden_state)

            task_type = self.task_types[task_id]
            if task_type == 'sequence_labeling':
                y_mask = _input['token_type_ids'].cpu()
                logits = token_from_subtoken(logits.cpu(), y_mask)
                predicted_ids = torch.argmax(logits, dim=-1).int().tolist()
                seq_lengths = torch.sum(y_mask, dim=1).int().tolist()
                pred = [prediction[:seq_len] for seq_len, prediction in zip(seq_lengths, predicted_ids)]
            elif task_type in ['regression', 'binary_head']:
                pred = logits[:, 0]
                if task_type == 'binary_head':
                    pred = torch.sigmoid(logits).squeeze(1)
                    if not self.return_probas:
                        pred = (pred > self.binary_threshold).int()
                pred = pred.cpu().numpy()
            elif self.multilabel[task_id]:
                probs = torch.sigmoid(logits)
                if self.return_probas:
                    pred = probs.cpu().numpy()
                else:
                    # Collect, per sample, the indices of all classes above the threshold.
                    numbers_of_sample, numbers_of_class = (probs > self.binary_threshold).nonzero(as_tuple=True)
                    numbers_of_sample = numbers_of_sample.cpu().numpy()
                    numbers_of_class = numbers_of_class.cpu().numpy()
                    pred = [[] for _ in range(len(logits))]
                    for sample_num, class_num in zip(numbers_of_sample, numbers_of_class):
                        pred[sample_num].append(int(class_num))
            else:
                if self.return_probas:
                    pred = torch.softmax(logits, dim=-1)
                else:
                    pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy()
            self.validation_predictions[task_id] = pred
    finally:
        # Cached hidden states belong to the current batch only. Reset them on every
        # exit path (including the single-argument return and errors), otherwise the
        # next call would reuse the encoding of a previous batch.
        self._reset_cache()

    if len(args) == 1:
        return self.validation_predictions[0]
    self.validation_predictions = [[] if pred is None else pred for pred in self.validation_predictions]
    log.debug(self.validation_predictions)
    return self.validation_predictions
Given args {args}') elif len(ids_to_iterate) > 1: raise Exception('Samples from more than 1 task in train_on_batch') task_id = ids_to_iterate[0] _input, batch_size = self._make_input(task_features=args[task_id], task_id=task_id, labels=args[task_id + self.n_tasks]) if _input == {}: raise Exception('Empty input!') if self.prev_id is None: self.prev_id = task_id elif self.prev_id != task_id and not self.printed: log.info('Seen samples from different tasks') self.printed = True if 'token_type_ids' not in _input: _input['token_type_ids'] = None loss, logits = self.model(task_id=task_id, **_input) if self.is_data_parallel: loss = loss.mean() loss = loss / self.gradient_accumulation_steps loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. if self.clip_norm: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm) if (self.steps_taken + 1) % self.gradient_accumulation_steps == 0 or ( self.steps_per_epoch is not None and (self.steps_taken + 1) % self.steps_per_epoch == 0): self.optimizer.step() self.optimizer.zero_grad() self.train_losses[task_id] = loss.item() self.steps_taken += 1 log.debug(f'train {task_id} {logits}') return {"losses": self.train_losses} ================================================ FILE: deeppavlov/models/torch_bert/torch_bert_ranker.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def __init__(self, pretrained_bert: str = None,
             bert_config_file: Optional[str] = None,
             n_classes: int = 2,
             return_probas: bool = True,
             **kwargs) -> None:
    """Build the sequence classification model used for ranking.

    The model is loaded from ``pretrained_bert`` when it is given; its output layer is
    re-created with random weights if ``n_classes`` differs from the loaded number of
    labels. Otherwise an untrained model is created from ``bert_config_file``.

    Raises:
        RuntimeError: if ``return_probas`` is set together with ``n_classes == 1``
        ConfigError: if neither a pretrained model nor an existing config file is given
    """
    self.return_probas = return_probas
    if self.return_probas and n_classes == 1:
        raise RuntimeError('Set return_probas to False for regression task!')

    if not pretrained_bert:
        if not (bert_config_file and expand_path(bert_config_file).is_file()):
            raise ConfigError("No pre-trained BERT model is given.")
        self.bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
        model = AutoModelForSequenceClassification.from_config(config=self.bert_config)
    else:
        log.debug(f"From pretrained {pretrained_bert}.")
        # Prefer a local checkpoint when the expanded path exists on disk.
        if Path(expand_path(pretrained_bert)).exists():
            pretrained_bert = str(expand_path(pretrained_bert))
        config = AutoConfig.from_pretrained(pretrained_bert,
                                            # num_labels=self.n_classes,
                                            output_attentions=False,
                                            output_hidden_states=False)
        model = AutoModelForSequenceClassification.from_pretrained(pretrained_bert, config=config)

        def reinit_output_layer(layer, in_features):
            """Replace the parameters of ``layer`` so that it emits ``n_classes`` values."""
            layer.weight = torch.nn.Parameter(torch.randn(n_classes, in_features))
            layer.bias = torch.nn.Parameter(torch.randn(n_classes))
            layer.out_features = n_classes
            model.num_labels = n_classes

        # TODO: make better exception handling here and at
        # deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel.load
        try:
            # Classification heads exposing a separate `out_proj` layer.
            output_layer = model.classifier.out_proj
            hidden_size = output_layer.in_features
            if n_classes != model.num_labels:
                reinit_output_layer(output_layer, hidden_size)
        except AttributeError:
            # Heads where `classifier` itself is the output layer.
            output_layer = model.classifier
            hidden_size = output_layer.in_features
            if n_classes != model.num_labels:
                reinit_output_layer(output_layer, hidden_size)

    super().__init__(model, **kwargs)
def __call__(self, features_li: List[List[InputFeatures]]) -> Union[List[int], List[List[float]]]:
    """Calculate scores for the given context over candidate responses.

    Args:
        features_li: list of elements where each element contains the batch of features
            for contexts with particular response candidates

    Returns:
        predicted scores for contexts over response candidates: the scores of the only
        element when ``features_li`` has one element, otherwise an array with one column
        per element. A single element holding a single sample is rejected and a list
        with an explanatory message is returned instead.
    """
    single_candidate = len(features_li) == 1
    if single_candidate and len(features_li[0]) == 1:
        msg = f"It is not intended to use the {self.__class__} in the interact mode."
        log.error(msg)
        return [msg]

    per_candidate = []
    for candidate_features in features_li:
        ids_batch = torch.cat([feat.input_ids for feat in candidate_features], dim=0).to(self.device)
        mask_batch = torch.cat([feat.attention_mask for feat in candidate_features], dim=0).to(self.device)

        with torch.no_grad():
            # Forward pass; the first output element holds the logits
            logits = self.model(ids_batch, token_type_ids=None, attention_mask=mask_batch)[0]

        if self.return_probas:
            # Softmax value at class index 1 is used as the score
            scores = torch.nn.functional.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy()
        else:
            scores = np.argmax(logits.detach().cpu().numpy(), axis=1)
        per_candidate.append(scores)

    if single_candidate:
        return per_candidate[0]
    return np.hstack([np.expand_dims(column, 1) for column in per_candidate])
def __init__(self, n_classes, pretrained_bert,
             multilabel: bool = False,
             return_probas: bool = False,
             attention_probs_keep_prob: Optional[float] = None,
             hidden_keep_prob: Optional[float] = None,
             bert_config_file: Optional[str] = None,
             is_binary: Optional[bool] = False,
             num_special_tokens: int = None,
             **kwargs) -> None:
    """Build the classification model.

    The model is loaded from ``pretrained_bert`` when it is given (a binary head model if
    ``is_binary`` is set); for the non-binary model the output layer is re-created with
    random weights if ``n_classes`` differs from the loaded number of labels. Otherwise an
    untrained model is created from ``bert_config_file``, with dropout probabilities
    overridden by ``1 - attention_probs_keep_prob`` / ``1 - hidden_keep_prob`` when given.
    If ``num_special_tokens`` is set, the token embeddings are resized to the tokenizer
    length plus that number.

    Raises:
        RuntimeError: for ``multilabel`` without ``return_probas``, or for
            ``return_probas`` with ``n_classes == 1``
        ConfigError: if neither a pretrained model nor an existing config file is given
    """
    self.return_probas = return_probas
    self.multilabel = multilabel
    self.n_classes = n_classes
    self.is_binary = is_binary

    if self.multilabel and not self.return_probas:
        raise RuntimeError('Set return_probas to True for multilabel classification!')

    if self.return_probas and self.n_classes == 1:
        raise RuntimeError('Set return_probas to False for regression task!')

    if pretrained_bert:
        log.debug(f"From pretrained {pretrained_bert}.")
        config = AutoConfig.from_pretrained(pretrained_bert,
                                            # num_labels=self.n_classes,
                                            output_attentions=False,
                                            output_hidden_states=False)

        if self.is_binary:
            config.add_pooling_layer = False
            model = AutoModelForBinaryClassification(pretrained_bert, config)
        else:
            model = AutoModelForSequenceClassification.from_pretrained(pretrained_bert, config=config)

            # TODO need a better solution here and at
            # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load
            try:
                # Classification heads exposing a separate `out_proj` layer.
                hidden_size = model.classifier.out_proj.in_features

                if self.n_classes != model.num_labels:
                    model.classifier.out_proj.weight = torch.nn.Parameter(torch.randn(self.n_classes, hidden_size))
                    model.classifier.out_proj.bias = torch.nn.Parameter(torch.randn(self.n_classes))
                    model.classifier.out_proj.out_features = self.n_classes
                    model.num_labels = self.n_classes

            except AttributeError:
                # Heads where `classifier` itself is the output layer.
                hidden_size = model.classifier.in_features

                if self.n_classes != model.num_labels:
                    model.classifier.weight = torch.nn.Parameter(torch.randn(self.n_classes, hidden_size))
                    model.classifier.bias = torch.nn.Parameter(torch.randn(self.n_classes))
                    model.classifier.out_features = self.n_classes
                    model.num_labels = self.n_classes

    # Check the same expanded path that is loaded below.
    elif bert_config_file and expand_path(bert_config_file).is_file():
        bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
        if attention_probs_keep_prob is not None:
            bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
        model = AutoModelForSequenceClassification.from_config(config=bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")

    if num_special_tokens is not None:
        # The tokenizer is only needed for its length, so load it only when the
        # embeddings really have to be resized; loading it unconditionally also broke
        # the config-file path where `pretrained_bert` is empty.
        tokenizer = AutoTokenizer.from_pretrained(pretrained_bert)
        model.resize_token_embeddings(len(tokenizer) + num_special_tokens)

    super().__init__(model, **kwargs)
# TODO move to the super class
@property
def accepted_keys(self) -> Tuple[str]:
    """Names known to the ``forward`` function of the underlying network.

    Taken from ``forward.__code__.co_varnames``, i.e. the parameter names followed by the
    names of the function's local variables. When ``self.is_data_parallel`` is set, the
    network is read from ``self.model.module``, otherwise ``self.model`` is used.
    """
    network = self.model.module if self.is_data_parallel else self.model
    return network.forward.__code__.co_varnames
class BinaryClassificationHead(torch.nn.Module):
    """Head producing one logit per sample from the features at the first sequence position.

    The features pass through dropout, a square dense layer, tanh, dropout again and a
    final projection to a single output.
    """

    def __init__(self, config):
        """Create the layers from ``config.hidden_size`` and ``config.hidden_dropout_prob``."""
        super().__init__()
        self.config = config
        width = config.hidden_size
        self.dense = torch.nn.Linear(width, width)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = torch.nn.Linear(width, 1)

    def init_weights(self):
        """Re-draw the dense weights from N(0, ``config.initializer_range``) and zero its bias.

        Only ``dense`` is touched; ``out_proj`` keeps its initial parameters.
        """
        dense = self.dense
        dense.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if dense.bias is None:
            return
        dense.bias.data.zero_()

    def forward(self, features, **kwargs):
        """Return the logits for ``features``, which is indexed as ``features[:, 0, :]``."""
        first_position = features[:, 0, :]
        hidden = torch.tanh(self.dense(self.dropout(first_position)))
        return self.out_proj(self.dropout(hidden))
@register('torch_transformers_el_ranker')
class TorchTransformersElRanker(TorchModel):
    """Class for ranking of entities by context and description

    Args:
        encoder_save_path: path to save the encoder checkpoint
        bilinear_save_path: path to save bilinear layer checkpoint
        block_size: size of block in bilinear layer
        emb_size: entity embedding size
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased");
            the same value is also forwarded to the model as ``bert_config_file``
        return_probas: set this to `True` if you need the probabilities instead of raw answers
    """

    def __init__(
            self,
            encoder_save_path: str,
            bilinear_save_path: str,
            block_size: int,
            emb_size: int,
            pretrained_bert: str = None,
            return_probas: bool = False,
            **kwargs
    ):
        self.return_probas = return_probas

        model = SiameseBertElModel(
            pretrained_bert=pretrained_bert,
            encoder_save_path=encoder_save_path,
            bilinear_save_path=bilinear_save_path,
            bert_config_file=pretrained_bert,
            block_size=block_size,
            emb_size=emb_size
        )
        super().__init__(model, **kwargs)

    def _build_model_input(self, q_features: List[Dict], c_features: List[Dict],
                           entity_tokens_pos: List[int]) -> Dict:
        """Pack context (``q_``) and description (``c_``) features into model keyword arguments."""
        model_input = {'entity_tokens_pos': entity_tokens_pos}
        for prefix, features in (('q', q_features), ('c', c_features)):
            for elem in ('input_ids', 'attention_mask'):
                values = [getattr(f, elem) for f in features]
                model_input[f"{prefix}_{elem}"] = torch.LongTensor(values).to(self.device)
        return model_input

    def train_on_batch(self, q_features: List[Dict],
                       c_features: List[Dict],
                       entity_tokens_pos: List[int],
                       labels: List[int]) -> float:
        """

        Args:
            q_features: batch of indices of text subwords
            c_features: batch of indices of entity description subwords
            entity_tokens_pos: list of indices of special tokens
            labels: 1 if entity is appropriate to context, 0 - otherwise

        Returns:
            the value of loss
        """
        _input = self._build_model_input(q_features, c_features, entity_tokens_pos)
        _input['labels'] = labels

        self.model.train()
        self.model.zero_grad()
        self.optimizer.zero_grad()  # zero the parameter gradients

        loss, _ = self.model(**_input)
        self._make_step(loss)

        return loss.item()

    def __call__(self, q_features: List[Dict],
                 c_features: List[Dict],
                 entity_tokens_pos: List[int]) -> Union[List[int], List[np.ndarray]]:
        """ Predicts entity labels (1 if the entity description is appropriate to the context, 0 - otherwise)

        Args:
            q_features: batch of indices of text subwords
            c_features: batch of indices of entity description subwords
            entity_tokens_pos: list of indices of special tokens

        Returns:
            Scores produced by the model if ``return_probas`` is set, otherwise the index of
            the highest score for every sample
        """
        self.model.eval()
        _input = self._build_model_input(q_features, c_features, entity_tokens_pos)

        with torch.no_grad():
            softmax_scores = self.model(**_input)

        if self.return_probas:
            # NOTE(review): the scores are returned as the torch tensor produced by the model,
            # without conversion to numpy - confirm that callers expect a tensor here.
            pred = softmax_scores
        else:
            pred = torch.argmax(softmax_scores, dim=1).cpu().numpy()

        return pred

    def save(self, fname: Optional[str] = None, *args, **kwargs) -> None:
        """Save model and optimizer state to ``<fname>.pth.tar`` and let the model save its parts.

        Args:
            fname: target path (string or ``Path``); ``self.save_path`` is used when omitted

        Raises:
            ConfigError: if the parent directory of the target path does not exist
        """
        if fname is None:
            fname = self.save_path
        # convert first, so that plain string paths are accepted as the annotation promises
        fname = Path(fname)
        if not fname.parent.is_dir():
            raise ConfigError("Provided save path is incorrect!")
        weights_path = fname.with_suffix(".pth.tar")
        log.info(f"Saving model to {weights_path}.")
        torch.save({
            "model_state_dict": self.model.cpu().state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "epochs_done": self.epochs_done
        }, weights_path)
        # state_dict was taken on CPU, so the model is moved back to the working device
        self.model.to(self.device)
        self.model.save()
class BilinearRanking(nn.Module):
    """Class for calculation of bilinear form of two vectors

    Args:
        n_classes: number of classes for classification
        emb_size: entity embedding size
        block_size: size of block in bilinear layer
    """

    def __init__(self, n_classes: int = 2, emb_size: int = 768, block_size: int = 8):
        super().__init__()
        self.n_classes = n_classes
        self.emb_size = emb_size
        self.block_size = block_size
        self.bilinear = nn.Linear(self.emb_size * self.block_size, self.n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text1: Tensor, text2: Tensor):
        """Score pairs of embeddings.

        Both inputs are viewed as ``(-1, emb_size // block_size, block_size)``; the outer
        product of matching blocks is flattened and fed to the linear layer.

        Returns:
            tuple ``(softmax over classes, log-softmax over classes)``
        """
        b1 = text1.view(-1, self.emb_size // self.block_size, self.block_size)
        b2 = text2.view(-1, self.emb_size // self.block_size, self.block_size)
        bl = (b1.unsqueeze(3) * b2.unsqueeze(2)).view(-1, self.emb_size * self.block_size)
        logits = self.bilinear(bl)
        softmax_logits = self.softmax(logits)
        log_softmax = F.log_softmax(logits, dim=-1)
        return softmax_logits, log_softmax


class SiameseBertElModel(nn.Module):
    """Class with model for ranking of entities by context and description

    Args:
        emb_size: entity embedding size
        block_size: size of block in bilinear layer
        encoder_save_path: path to save the encoder checkpoint
        bilinear_save_path: path to save bilinear layer checkpoint
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name
        device: device to use
    """

    def __init__(
            self,
            emb_size: int,
            block_size: int,
            encoder_save_path: str,
            bilinear_save_path: str,
            pretrained_bert: str = None,
            bert_config_file: str = None,
            device: torch.device = torch.device('cpu')
    ):
        super().__init__()
        self.pretrained_bert = pretrained_bert
        self.encoder_save_path = encoder_save_path
        self.bilinear_save_path = bilinear_save_path
        self.bert_config_file = bert_config_file
        self.device = device

        self.encoder = TextEncoder(pretrained_bert=self.pretrained_bert, device=self.device)
        # Keyword arguments are required: the first positional parameter of BilinearRanking
        # is ``n_classes``, so positional passing would build a layer of the wrong shape.
        self.bilinear_ranker = BilinearRanking(emb_size=emb_size, block_size=block_size)

    def forward(
            self,
            q_input_ids: Tensor,
            q_attention_mask: Tensor,
            c_input_ids: Tensor,
            c_attention_mask: Tensor,
            entity_tokens_pos: List,
            labels: List[int] = None
    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:
        """Encode contexts and descriptions and score every pair.

        Returns:
            ``(loss, softmax_scores)`` if ``labels`` are given, otherwise ``softmax_scores``
        """
        entity_emb = self.encoder(input_ids=q_input_ids, attention_mask=q_attention_mask,
                                  entity_tokens_pos=entity_tokens_pos)
        c_cls_emb = self.encoder(input_ids=c_input_ids, attention_mask=c_attention_mask)
        softmax_scores, log_softmax = self.bilinear_ranker(entity_emb, c_cls_emb)

        if labels is None:
            return softmax_scores

        # Mean negative log-likelihood of the gold class. Targets are created on the device
        # of the scores so that the loss does not depend on ``self.device`` being up to date.
        target = torch.as_tensor(labels, dtype=torch.long, device=log_softmax.device)
        loss = F.nll_loss(log_softmax, target, reduction="mean")
        return loss, softmax_scores

    def save(self) -> None:
        """Save encoder and bilinear layer weights to their own ``.pth.tar`` files."""
        encoder_weights_path = expand_path(self.encoder_save_path).with_suffix(".pth.tar")
        log.info(f"Saving encoder to {encoder_weights_path}.")
        torch.save({"model_state_dict": self.encoder.cpu().state_dict()}, encoder_weights_path)
        bilinear_weights_path = expand_path(self.bilinear_save_path).with_suffix(".pth.tar")
        log.info(f"Saving bilinear weights to {bilinear_weights_path}.")
        torch.save({"model_state_dict": self.bilinear_ranker.cpu().state_dict()}, bilinear_weights_path)
        # both parts were moved to CPU for serialization; put them back
        self.encoder.to(self.device)
        self.bilinear_ranker.to(self.device)
self.encoder.load_state_dict(encoder_checkpoint["model_state_dict"]) self.encoder.to(self.device) self.bilinear_ranking = BilinearRanking(emb_size=self.emb_size, block_size=self.block_size) bilinear_checkpoint = torch.load(self.bilinear_weights_path, map_location=self.device) self.bilinear_ranking.load_state_dict(bilinear_checkpoint["model_state_dict"]) self.bilinear_ranking.to(self.device) self.special_token_id = special_token_id self.batch_size = batch_size def __call__(self, contexts_batch: List[str], candidate_entities_batch: List[List[str]], candidate_entities_descr_batch: List[List[str]]): entity_emb_batch = [] num_batches = len(contexts_batch) // self.batch_size + int(len(contexts_batch) % self.batch_size > 0) for ii in range(num_batches): contexts_list = contexts_batch[ii * self.batch_size:(ii + 1) * self.batch_size] context_features = self.preprocessor(contexts_list) context_input_ids = context_features["input_ids"].to(self.device) context_attention_mask = context_features["attention_mask"].to(self.device) special_tokens_pos = [] for input_ids_list in context_input_ids: found_n = -1 for n, input_id in enumerate(input_ids_list): if input_id == self.special_token_id: found_n = n break if found_n == -1: found_n = 0 special_tokens_pos.append(found_n) cur_entity_emb_batch = self.encoder(input_ids=context_input_ids, attention_mask=context_attention_mask, entity_tokens_pos=special_tokens_pos) entity_emb_batch += cur_entity_emb_batch.detach().cpu().numpy().tolist() scores_batch = [] for entity_emb, candidate_entities_list, candidate_entities_descr_list in \ zip(entity_emb_batch, candidate_entities_batch, candidate_entities_descr_batch): if candidate_entities_list: entity_emb = [entity_emb for _ in candidate_entities_list] entity_emb = torch.Tensor(entity_emb).to(self.device) descr_features = self.preprocessor(candidate_entities_descr_list) descr_input_ids = descr_features["input_ids"].to(self.device) descr_attention_mask = 
@register('torch_transformers_multiplechoice')
class TorchTransformersMultiplechoiceModel(TorchModel):
    """Transformer-based model for multiple choice tasks on PyTorch.

    The wrapped model is ``AutoModelForMultipleChoice``.

    Args:
        n_classes: number of classes
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        multilabel: set True if it is multi-label classification
        return_probas: set True if return class probabilites instead of most probable label needed
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
        hidden_keep_prob: keep_prob for Bert hidden layers
        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)
    """

    def __init__(self, n_classes,
                 pretrained_bert,
                 multilabel: bool = False,
                 return_probas: bool = False,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 **kwargs) -> None:
        self.return_probas = return_probas
        self.multilabel = multilabel
        self.n_classes = n_classes

        if self.multilabel and not self.return_probas:
            raise RuntimeError('Set return_probas to True for multilabel classification!')

        if self.return_probas and self.n_classes == 1:
            raise RuntimeError('Set return_probas to False for regression task!')

        if pretrained_bert:
            log.debug(f"From pretrained {pretrained_bert}.")
            config = AutoConfig.from_pretrained(pretrained_bert, num_labels=self.n_classes,
                                                output_attentions=False, output_hidden_states=False)

            model = AutoModelForMultipleChoice.from_pretrained(pretrained_bert, config=config)

        elif bert_config_file and Path(bert_config_file).is_file():
            # AutoConfig loads configurations through ``from_pretrained``, which also accepts
            # a path to a saved configuration JSON file.
            bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
            if attention_probs_keep_prob is not None:
                bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
            if hidden_keep_prob is not None:
                bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
            model = AutoModelForMultipleChoice.from_config(config=bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        super().__init__(model, **kwargs)

    def _filter_forward_kwargs(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Keep only the entries whose key is a variable name of the model's ``forward``."""
        # under DataParallel the real model (and its ``forward``) lives in ``.module``
        model = self.model.module if self.is_data_parallel else self.model
        accepted_keys = model.forward.__code__.co_varnames
        return {key: value for key, value in inputs.items() if key in accepted_keys}

    def train_on_batch(self, features: Dict[str, torch.tensor], y: Union[List[int], List[List[int]]]) -> Dict:
        """Train model on given batch.
        This method calls train_op using features and y (labels).

        Args:
            features: batch of InputFeatures
            y: batch of labels (class id or one-hot encoding)

        Returns:
            dict with the loss value
        """

        _input = {key: value.to(self.device) for key, value in features.items()}

        _input["labels"] = torch.tensor(y).long().to(self.device)

        self.optimizer.zero_grad()

        tokenized = self._filter_forward_kwargs(_input)

        loss = self.model(**tokenized).loss
        if self.is_data_parallel:
            # DataParallel gathers one loss per replica
            loss = loss.mean()
        self._make_step(loss)

        return {'loss': loss.item()}

    def __call__(self, features: Dict[str, torch.tensor]) -> Union[List[int], List[List[float]]]:
        """Make prediction for given features (texts).

        Args:
            features: batch of InputFeatures

        Returns:
            predicted classes or probabilities of each class

        """

        _input = {key: value.to(self.device) for key, value in features.items()}

        with torch.no_grad():
            tokenized = self._filter_forward_kwargs(_input)

            # Forward pass, calculate logit predictions
            logits = self.model(**tokenized)
            logits = logits[0]

        if self.return_probas:
            if not self.multilabel:
                pred = torch.nn.functional.softmax(logits, dim=-1)
            else:
                pred = torch.sigmoid(logits)
            pred = pred.detach().cpu().numpy()
        elif self.n_classes > 1:
            logits = logits.detach().cpu().numpy()
            pred = np.argmax(logits, axis=1)
        else:  # regression
            pred = logits.squeeze(-1).detach().cpu().numpy()

        return pred
@register('torch_transformers_nll_ranker')
class TorchTransformersNLLRanker(TorchModel):
    """Class for ranking of relations using the model trained with NLL loss

    Args:
        pretrained_bert: pretrained transformer checkpoint path or key title (e.g. "bert-base-uncased")
        encoder_save_path: path to save the encoder checkpoint
        linear_save_path: path to save linear layer checkpoint
        return_probas: set this to `True` if you need the probabilities instead of raw answers
    """

    def __init__(
            self,
            pretrained_bert: str = None,
            encoder_save_path: str = None,
            linear_save_path: str = None,
            return_probas: bool = False,
            **kwargs
    ):
        self.return_probas = return_probas
        model = NLLRanking(
            pretrained_bert=pretrained_bert,
            encoder_save_path=encoder_save_path,
            linear_save_path=linear_save_path,
            bert_tokenizer_config_file=pretrained_bert,
        )
        super().__init__(model, **kwargs)

    def _to_model_input(self, input_features: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the three feature arrays to ``LongTensor`` on the working device."""
        return {key: torch.LongTensor(input_features[key]).to(self.device)
                for key in ("input_ids", "attention_mask", "token_type_ids")}

    def train_on_batch(self, input_features: Dict[str, Any], positive_idx: List[int]) -> float:
        """Make one optimization step on the batch and return the loss value."""
        _input = self._to_model_input(input_features)
        _input['positive_idx'] = positive_idx

        self.model.train()
        self.model.zero_grad()
        self.optimizer.zero_grad()  # zero the parameter gradients

        loss, _ = self.model(**_input)
        loss.backward()

        # Clip the norm of the gradients to prevent the "exploding gradients" problem.
        # This must happen before the optimizer step, otherwise the update already used
        # the unclipped gradients.
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)
        self.optimizer.step()

        return loss.item()

    def __call__(self, input_features: Dict[str, Any]) -> Union[List[int], List[np.ndarray]]:
        """Score the candidates of every sample.

        Returns:
            the scores as nested lists if ``return_probas`` is set, otherwise the index of the
            best candidate for every sample
        """
        self.model.eval()
        _input = self._to_model_input(input_features)

        with torch.no_grad():
            output = self.model(**_input)
            if isinstance(output, tuple) and len(output) == 2:
                _, softmax_scores = output
            else:
                softmax_scores = output

        if self.return_probas:
            return softmax_scores.cpu().numpy().tolist()
        return torch.argmax(softmax_scores, dim=1).cpu().numpy()


class NLLRanking(nn.Module):
    """Class which implements the relation ranking model

    Args:
        pretrained_bert: pretrained transformer checkpoint path or key title (e.g. "bert-base-uncased")
        encoder_save_path: path to save the encoder checkpoint
        linear_save_path: path to save linear layer checkpoint
        bert_tokenizer_config_file: path to configuration file of transformer tokenizer
        device: cpu or gpu
    """

    def __init__(
            self,
            pretrained_bert: str = None,
            encoder_save_path: str = None,
            linear_save_path: str = None,
            bert_tokenizer_config_file: str = None,
            device: str = "gpu"
    ):
        super().__init__()
        self.pretrained_bert = pretrained_bert
        self.encoder_save_path = encoder_save_path
        self.linear_save_path = linear_save_path
        self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")

        # initialize parameters that would be filled later
        self.encoder, self.config, self.bert_config = None, None, None
        self.load()

        # ``bert_tokenizer_config_file`` is optional, so it must not be passed to Path() blindly
        if bert_tokenizer_config_file and Path(bert_tokenizer_config_file).is_file():
            vocab_file = str(expand_path(bert_tokenizer_config_file))
            # AutoTokenizer cannot be instantiated directly, only through ``from_pretrained``
            # NOTE(review): confirm that loading from a single tokenizer file is supported
            # for the tokenizers used with this model.
            tokenizer = AutoTokenizer.from_pretrained(vocab_file)
        else:
            tokenizer = AutoTokenizer.from_pretrained(pretrained_bert)
        self.encoder.resize_token_embeddings(len(tokenizer) + 7)

    def forward(
            self,
            input_ids: Tensor,
            attention_mask: Tensor,
            token_type_ids: Tensor,
            positive_idx: List[List[int]] = None
    ) -> Union[Tuple[Any, Tensor], Tuple[Tensor]]:
        """Score ``samples_num`` candidates for each of ``bs`` samples.

        Inputs are three-dimensional ``(bs, samples_num, seq_len)`` tensors.

        Returns:
            ``(loss, log-softmax scores)`` if ``positive_idx`` is given, otherwise raw scores
            of shape ``(bs, samples_num)``
        """
        bs, samples_num, _ = input_ids.size()
        input_ids = input_ids.reshape(bs * samples_num, -1)
        attention_mask = attention_mask.reshape(bs * samples_num, -1)
        token_type_ids = token_type_ids.reshape(bs * samples_num, -1)
        if hasattr(self.config, "type_vocab_size"):
            encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask,
                                          token_type_ids=token_type_ids)
        else:
            encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_emb = encoder_output.last_hidden_state[:, :1, :].squeeze(1)
        scores = self.fc(cls_emb)
        scores = scores.reshape(bs, samples_num)

        if positive_idx is None:
            return scores

        scores = F.log_softmax(scores, dim=1)
        # The passed indices only switch the loss on: the target is always candidate 0.
        # NOTE(review): presumably the positive sample is always placed first - confirm.
        target = torch.zeros(bs, dtype=torch.long, device=scores.device)
        loss = F.nll_loss(scores, target, reduction="mean")
        return loss, scores

    def load(self) -> None:
        """Create the encoder and the scoring layer from ``pretrained_bert``.

        Raises:
            ConfigError: if ``pretrained_bert`` is not given
        """
        if self.pretrained_bert:
            log.info(f"From pretrained {self.pretrained_bert}.")
            self.config = AutoConfig.from_pretrained(
                self.pretrained_bert, output_hidden_states=True
            )
            self.encoder = AutoModel.from_pretrained(self.pretrained_bert, config=self.config)
            self.fc = nn.Linear(self.config.hidden_size, 1)
        else:
            raise ConfigError("No pre-trained BERT model is given.")
        self.encoder.to(self.device)
        self.fc.to(self.device)
def token_from_subtoken(units: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """ Assemble token level units from subtoken level units

    Args:
        units: torch.Tensor of shape [batch_size, SUBTOKEN_seq_length, n_features]
        mask: mask of token beginnings. For example: for tokens

                [[``[CLS]`` ``My``, ``capybara``, ``[SEP]``],
                [``[CLS]`` ``Your``, ``aar``, ``##dvark``, ``is``, ``awesome``, ``[SEP]``]]

            the mask will be

                [[0, 1, 1, 0, 0, 0, 0],
                [0, 1, 1, 0, 1, 1, 0]]

    Returns:
        word_level_units: Units assembled from ones in the mask. For the
            example above this units will correspond to the following

                [[``My``, ``capybara``],
                [``Your``, ``aar``, ``is``, ``awesome``,]]

            the shape of this tensor will be [batch_size, TOKEN_seq_length, n_features];
            rows of shorter samples are padded with zeros and the result is float64
    """
    shape = units.size()
    batch_size = shape[0]
    nf = shape[2]

    # number of tokens in each sample, total number of tokens and the longest sample
    token_seq_lengths = torch.sum(mask, 1).to(torch.int64)
    n_words = torch.sum(token_seq_lengths)
    max_token_seq_len = torch.max(token_seq_lengths)

    # (sample index, subtoken position) of every token beginning
    idxs = torch.stack(torch.nonzero(mask, as_tuple=True), dim=1)

    # ``a`` marks the tokens whose sample index differs from the previous token's one
    sample_ids_in_batch = torch.nn.functional.pad(input=idxs[:, 0], pad=[1, 0])
    a = torch.logical_not(torch.eq(sample_ids_in_batch[1:], sample_ids_in_batch[:-1]).to(torch.int64))

    # position of every token inside its own sample
    q = a * torch.arange(n_words).to(torch.int64)
    count_to_substract = torch.nn.functional.pad(torch.masked_select(q, q.to(torch.bool)), [1, 0])
    new_word_indices = torch.arange(n_words).to(torch.int64) - torch.gather(
        count_to_substract, dim=0, index=torch.cumsum(a, 0))

    # flat positions in the [batch_size * max_token_seq_len] output taken by tokens / by padding
    n_total_word_elements = (batch_size * max_token_seq_len).to(torch.int32)
    word_indices_flat = (idxs[:, 0] * max_token_seq_len + new_word_indices).to(torch.int64)
    x_mask = torch.sum(torch.nn.functional.one_hot(word_indices_flat, n_total_word_elements), 0)
    x_mask = x_mask.to(torch.bool)
    full_range = torch.arange(batch_size * max_token_seq_len).to(torch.int64)
    nonword_indices_flat = torch.masked_select(full_range, torch.logical_not(x_mask))

    # Select the units of token beginnings with one index tensor per dimension. Indexing with
    # a nested Python list relies on deprecated non-tuple sequence indexing.
    elements = units[idxs[:, 0].long(), idxs[:, 1].long()]

    sh = tuple(torch.stack([torch.sum(max_token_seq_len - token_seq_lengths), torch.tensor(nf)], 0).numpy())
    paddings = torch.zeros(sh, dtype=torch.float64)

    def dynamic_stitch(indices, data):
        # https://discuss.pytorch.org/t/equivalent-of-tf-dynamic-partition/53735/2
        n = sum(idx.numel() for idx in indices)
        res = [None] * n
        for i, data_ in enumerate(data):
            idx = indices[i].view(-1)
            if idx.numel() > 0:
                d = data_.view(idx.numel(), -1)
                for k, idx_ in enumerate(idx):
                    res[idx_] = d[k].to(torch.float64)
        return res

    tensor_flat = torch.stack(dynamic_stitch([word_indices_flat, nonword_indices_flat], [elements, paddings]))
    tensor = torch.reshape(tensor_flat, (batch_size, max_token_seq_len.item(), nf))

    return tensor


def token_labels_to_subtoken_labels(labels, y_mask, input_mask):
    """Spread token labels over subtokens.

    Args:
        labels: one label per token
        y_mask: mask of token beginnings over the subtoken sequence
        input_mask: attention mask; its sum is the number of subtokens including the two
            special ones at the ends

    Returns:
        list of ``len(input_mask)`` labels: 0 for the first position and for everything after
        the last regular subtoken, the token label for a token beginning, and the label of the
        preceding token for every other subtoken
    """
    n_tokens_with_special = int(np.sum(input_mask))

    subtoken_labels = []
    labels_ind = 0
    # skip the leading special subtoken and stop before the trailing one
    for el in y_mask[1:n_tokens_with_special - 1]:
        if el == 1:
            subtoken_labels.append(labels[labels_ind])
            labels_ind += 1
        else:
            subtoken_labels.append(labels[labels_ind - 1])

    n_tail = len(input_mask) - n_tokens_with_special + 1
    return [0] + subtoken_labels + [0] * n_tail
"bert-base-uncased") bert_config_file: path to Bert configuration file, or None, if `pretrained_bert` is a string name attention_probs_keep_prob: keep_prob for Bert self-attention layers hidden_keep_prob: keep_prob for Bert hidden layers use_crf: whether to use Conditional Ramdom Field to decode tags """ def __init__(self, n_tags: int, pretrained_bert: str, bert_config_file: Optional[str] = None, attention_probs_keep_prob: Optional[float] = None, hidden_keep_prob: Optional[float] = None, use_crf: bool = False, **kwargs) -> None: if pretrained_bert: config = AutoConfig.from_pretrained(pretrained_bert, num_labels=n_tags, output_attentions=False, output_hidden_states=False) model = AutoModelForTokenClassification.from_pretrained(pretrained_bert, config=config) elif bert_config_file and Path(bert_config_file).is_file(): bert_config = AutoConfig.from_json_file(str(expand_path(bert_config_file))) if attention_probs_keep_prob is not None: bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob if hidden_keep_prob is not None: bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob model = AutoModelForTokenClassification(config=bert_config) else: raise ConfigError("No pre-trained BERT model is given.") self.crf = CRF(n_tags) if use_crf else None super().__init__(model, **kwargs) def train_on_batch(self, input_ids: Union[List[List[int]], np.ndarray], input_masks: Union[List[List[int]], np.ndarray], y_masks: Union[List[List[int]], np.ndarray], y: List[List[int]], *args, **kwargs) -> Dict[str, float]: """ Args: input_ids: batch of indices of subwords input_masks: batch of masks which determine what should be attended args: arguments passed to _build_feed_dict and corresponding to additional input and output tensors of the derived class. kwargs: keyword arguments passed to _build_feed_dict and corresponding to additional input and output tensors of the derived class. 
Returns: dict with fields 'loss', 'head_learning_rate', and 'bert_learning_rate' """ b_input_ids = torch.from_numpy(input_ids).to(self.device) b_input_masks = torch.from_numpy(input_masks).to(self.device) subtoken_labels = [token_labels_to_subtoken_labels(y_el, y_mask, input_mask) for y_el, y_mask, input_mask in zip(y, y_masks, input_masks)] b_labels = torch.from_numpy(np.array(subtoken_labels)).to(torch.int64).to(self.device) self.optimizer.zero_grad() loss = self.model(input_ids=b_input_ids, attention_mask=b_input_masks, labels=b_labels).loss if self.crf is not None: self.crf(y, y_masks) if self.is_data_parallel: loss = loss.mean() self._make_step(loss) return {'loss': loss.item()} def __call__(self, input_ids: Union[List[List[int]], np.ndarray], input_masks: Union[List[List[int]], np.ndarray], y_masks: Union[List[List[int]], np.ndarray]) -> Tuple[List[List[int]], List[np.ndarray]]: """ Predicts tag indices for a given subword tokens batch Args: input_ids: indices of the subwords input_masks: mask that determines where to attend and where not to y_masks: mask which determines the first subword units in the the word Returns: Label indices or class probabilities for each token (not subtoken) """ b_input_ids = torch.from_numpy(input_ids).to(self.device) b_input_masks = torch.from_numpy(input_masks).to(self.device) with torch.no_grad(): # Forward pass, calculate logit predictions logits = self.model(b_input_ids, attention_mask=b_input_masks) # Move logits and labels to CPU and to numpy arrays logits = token_from_subtoken(logits[0].detach().cpu(), torch.from_numpy(y_masks)) probas = torch.nn.functional.softmax(logits, dim=-1) probas = probas.detach().cpu().numpy() if self.crf is not None: logits = logits.transpose(1, 0).to(self.device) pred = self.crf.decode(logits) else: logits = logits.detach().cpu().numpy() pred = np.argmax(logits, axis=-1) seq_lengths = np.sum(y_masks, axis=1) pred = [p[:l] for l, p in zip(seq_lengths, pred)] return pred, probas def load(self, 
fname=None): super().load(fname) if self.crf is not None: self.crf = self.crf.to(self.device) if self.load_path: weights_path_crf = Path(f"{self.load_path}_crf").resolve() weights_path_crf = weights_path_crf.with_suffix(".pth.tar") if weights_path_crf.exists(): checkpoint = torch.load(weights_path_crf, map_location=self.device) self.crf.load_state_dict(checkpoint["model_state_dict"], strict=False) else: log.warning(f"Init from scratch. Load path {weights_path_crf} does not exist.") def save(self, fname: Optional[str] = None, *args, **kwargs) -> None: super().save(fname, *args, **kwargs) if self.crf is not None: if fname is None: fname = self.save_path weights_path_crf = Path(f"{fname}_crf").resolve() weights_path_crf = weights_path_crf.with_suffix(".pth.tar") torch.save({"model_state_dict": self.crf.cpu().state_dict()}, weights_path_crf) self.crf.to(self.device) ================================================ FILE: deeppavlov/models/torch_bert/torch_transformers_squad.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class PassageReaderClassifier(torch.nn.Module):
    """Transformer encoder followed by two linear layers.

    ``qa_outputs`` maps the hidden state of every position to two values that are used as
    answer start and answer end logits; ``qa_classifier`` maps the hidden state of the first
    position to a single value that defines the probability of the paragraph to contain the
    answer.

    Args:
        config: Transformer configuration object (it is passed to ``AutoModel.from_config``
            and must provide ``hidden_size``)
    """

    # Result type of ``forward``. It is created once here instead of on every forward pass,
    # so no class is re-generated per batch and all results share one type.
    _outputs = namedtuple("outputs", "start_logits end_logits rank_logits")

    def __init__(self, config):
        super().__init__()
        self.encoder = AutoModel.from_config(config=config)
        self.qa_outputs = torch.nn.Linear(config.hidden_size, 2)
        self.qa_classifier = torch.nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Compute start, end and paragraph-ranking logits.

        Args:
            input_ids: passed to the encoder as ``input_ids``
            attention_mask: passed to the encoder as ``attention_mask``
            token_type_ids: passed to the encoder as ``token_type_ids``

        Returns:
            namedtuple ``outputs`` with fields ``start_logits`` and ``end_logits`` (the two
            ``qa_outputs`` channels with the last axis squeezed) and ``rank_logits`` (the
            ``qa_classifier`` output for the first position).
        """
        out = self.encoder(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        hidden = out[0]

        # Split the two output channels into separate start / end logits.
        start_logits, end_logits = self.qa_outputs(hidden).split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        # Only the first position of every sequence is used for ranking.
        rank_logits = self.qa_classifier(hidden[:, 0, :])

        return self._outputs(start_logits=start_logits, end_logits=end_logits, rank_logits=rank_logits)
def __init__(self,
             pretrained_bert: str,
             attention_probs_keep_prob: Optional[float] = None,
             hidden_keep_prob: Optional[float] = None,
             bert_config_file: Optional[str] = None,
             psg_cls: bool = False,
             batch_size: int = 10,
             **kwargs) -> None:
    """Build the underlying question-answering network and initialize the parent model.

    Args:
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased");
            when empty, ``bert_config_file`` is used instead
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
            (applied only when the model is built from ``bert_config_file``)
        hidden_keep_prob: keep_prob for Bert hidden layers
            (applied only when the model is built from ``bert_config_file``)
        bert_config_file: path to Bert configuration file, or None, if `pretrained_bert`
            is a string name
        psg_cls: whether to use a separate linear layer to define if a passage contains
            the answer to the question
        batch_size: batch size for inference of squad model
        **kwargs: forwarded to the parent constructor

    Raises:
        ConfigError: if neither ``pretrained_bert`` nor an existing ``bert_config_file`` is given
    """
    self.batch_size = batch_size
    self.psg_cls = psg_cls

    if pretrained_bert:
        logger.debug(f"From pretrained {pretrained_bert}.")
        config = AutoConfig.from_pretrained(pretrained_bert, output_attentions=False,
                                            output_hidden_states=False)
        if self.psg_cls:
            model = PassageReaderClassifier(config=config)
        else:
            model = AutoModelForQuestionAnswering.from_pretrained(pretrained_bert, config=config)
    elif bert_config_file and Path(bert_config_file).is_file():
        # AutoConfig is loaded through `from_pretrained`, which accepts a path to a JSON file.
        bert_config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
        if attention_probs_keep_prob is not None:
            bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
        # The configuration lives in the local `bert_config`; `self.bert_config` is never set.
        if self.psg_cls:
            model = PassageReaderClassifier(config=bert_config)
        else:
            # Auto classes cannot be instantiated directly, `from_config` has to be used.
            model = AutoModelForQuestionAnswering.from_config(bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")

    super().__init__(model, **kwargs)
def __call__(self, features_batch: List[List[InputFeatures]]) -> Tuple[
    List[List[int]], List[List[int]], List[List[float]], List[List[float]], List[int]]:
    """Get predictions using features as input.

    Every sample of the batch is a list of features (paragraph pieces). All pieces are run
    through the model in chunks of ``self.batch_size``; for every sample the piece with the
    highest answer logit is selected.

    Args:
        features_batch: batch of InputFeatures instances

    Returns:
        start_pred_batch: answer start positions
        end_pred_batch: answer end positions
        logits_batch: answer logits
        scores_batch: answer confidences
        ind_batch: indices of paragraph pieces where the answer was found
    """
    # sample index -> list of (start, end, logit, score), one tuple per paragraph piece
    predictions = {}
    # TODO: refactor batchification
    # Flatten the pieces of all samples; `indices` remembers the sample each piece belongs to.
    indices, input_ids, input_masks, input_type_ids = [], [], [], []
    for n, features_list in enumerate(features_batch):
        for f in features_list:
            input_ids.append(f.input_ids)
            input_masks.append(f.attention_mask)
            input_type_ids.append(f.token_type_ids)
            indices.append(n)

    # Number of chunks, rounded up.
    num_batches = len(indices) // self.batch_size + int(len(indices) % self.batch_size > 0)
    for i in range(num_batches):
        b_input_ids = torch.cat(input_ids[i * self.batch_size:(i + 1) * self.batch_size], dim=0).to(self.device)
        b_input_masks = torch.cat(input_masks[i * self.batch_size:(i + 1) * self.batch_size],
                                  dim=0).to(self.device)
        b_input_type_ids = torch.cat(input_type_ids[i * self.batch_size:(i + 1) * self.batch_size],
                                     dim=0).to(self.device)
        input_ = {
            'input_ids': b_input_ids,
            'attention_mask': b_input_masks,
            'token_type_ids': b_input_type_ids,
            'return_dict': True
        }

        with torch.no_grad():
            # Keep only the arguments whose names appear in `self.accepted_keys`.
            input_ = {arg_name: arg_value for arg_name, arg_value in input_.items()
                      if arg_name in self.accepted_keys}
            # Forward pass, calculate logit predictions
            outputs = self.model(**input_)

            logits_st = outputs.start_logits
            logits_end = outputs.end_logits

            bs = b_input_ids.size()[0]
            seq_len = b_input_ids.size()[-1]
            # `mask` has ones in the first column only; added to the token type ids it keeps
            # position 0 selectable in addition to the positions where token_type_ids is non-zero
            # (presumably the context tokens - TODO confirm against the preprocessor).
            mask = torch.cat([torch.ones(bs, 1, dtype=torch.int32),
                              torch.zeros(bs, seq_len - 1, dtype=torch.int32)], dim=-1).to(self.device)
            logit_mask = b_input_type_ids + mask
            # softmax_mask adds -1e30 * (1 - mask) to the logits, so positions where the mask
            # is 0 get (almost) zero probability after the softmax.
            logits_st = softmax_mask(logits_st, logit_mask)
            logits_end = softmax_mask(logits_end, logit_mask)

            start_probs = torch.nn.functional.softmax(logits_st, dim=-1)
            end_probs = torch.nn.functional.softmax(logits_end, dim=-1)
            if self.psg_cls:
                # A dedicated ranking output is used as the confidence.
                scores = outputs.rank_logits.squeeze(1)
            else:
                # 1 - P(start at position 0) * P(end at position 0)
                scores = torch.tensor(1) - start_probs[:, 0] * end_probs[:, 0]

            # outer[b, s, e] = start_probs[b, s] * end_probs[b, e]
            outer = torch.matmul(start_probs.view(*start_probs.size(), 1),
                                 end_probs.view(end_probs.size()[0], 1, end_probs.size()[1]))
            # outer_logits[b, s, e] = exp(logits_st[b, s] + logits_end[b, e])
            outer_logits = torch.exp(logits_st.view(*logits_st.size(), 1) + logits_end.view(
                logits_end.size()[0], 1, logits_end.size()[1]))

            # Largest per-row sum of token type ids in the chunk, capped at 20.
            context_max_len = torch.max(torch.sum(b_input_type_ids, dim=1)).to(torch.int64)

            max_ans_length = torch.min(torch.tensor(20).to(self.device), context_max_len).to(torch.int64).item()

            # Zero out spans with end < start and spans with end - start >= seq_len - max_ans_length.
            # NOTE(review): the second diagonal is `seq_len - max_ans_length`, not
            # `max_ans_length`, so the kept band is wider than `max_ans_length` for long
            # sequences - confirm that this is intended.
            outer = torch.triu(outer, diagonal=0) - torch.triu(outer, diagonal=outer.size()[1] - max_ans_length)
            outer_logits = torch.triu(outer_logits, diagonal=0) - torch.triu(
                outer_logits, diagonal=outer_logits.size()[1] - max_ans_length)

            # Best start: row holding the largest value; best end: column holding the largest value.
            start_pred = torch.argmax(torch.max(outer, dim=2)[0], dim=1)
            end_pred = torch.argmax(torch.max(outer, dim=1)[0], dim=1)
            # Largest remaining value of outer_logits for every piece.
            logits = torch.max(torch.max(outer_logits, dim=2)[0], dim=1)[0]

        # Move logits and labels to CPU and to numpy arrays
        start_pred = start_pred.detach().cpu().numpy()
        end_pred = end_pred.detach().cpu().numpy()
        logits = logits.detach().cpu().numpy().tolist()
        scores = scores.detach().cpu().numpy().tolist()

        # Regroup the per-piece predictions by the sample they came from.
        for j, (start_pred_elem, end_pred_elem, logits_elem, scores_elem) in \
                enumerate(zip(start_pred, end_pred, logits, scores)):
            ind = indices[i * self.batch_size + j]
            if ind in predictions:
                predictions[ind] += [(start_pred_elem, end_pred_elem, logits_elem, scores_elem)]
            else:
                predictions[ind] = [(start_pred_elem, end_pred_elem, logits_elem, scores_elem)]

    start_pred_batch, end_pred_batch, logits_batch, scores_batch, ind_batch = [], [], [], [], []
    for ind in sorted(predictions.keys()):
        prediction = predictions[ind]
        # For every sample take the piece with the highest answer logit.
        max_ind = np.argmax([pred[2] for pred in prediction])
        start_pred_batch.append(prediction[max_ind][0])
        end_pred_batch.append(prediction[max_ind][1])
        logits_batch.append(prediction[max_ind][2])
        scores_batch.append(prediction[max_ind][3])
        ind_batch.append(max_ind)

    return start_pred_batch, end_pred_batch, logits_batch, scores_batch, ind_batch
class Biaffine(nn.Module):
    """Pairwise bilinear layer applied to inputs extended with a constant feature of ones.

    Both inputs get one extra feature equal to 1 before they are passed to the bilinear
    layer; the weight and the bias of that layer are initialized with zeros.

    Args:
        in1_features: size of the last axis of the first input
        in2_features: size of the last axis of the second input
        out_features: number of output features of the bilinear layer
    """

    def __init__(self, in1_features: int, in2_features: int, out_features: int):
        super().__init__()
        # "+ 1" accounts for the column of ones appended in `forward`.
        bilinear = PairwiseBilinear(in1_features + 1, in2_features + 1, out_features)
        bilinear.weight.data.zero_()
        bilinear.bias.data.zero_()
        self.bilinear = bilinear

    @staticmethod
    def _append_ones(features: torch.Tensor) -> torch.Tensor:
        """Concatenate a feature of ones to the last axis of ``features``."""
        ones = features.new_ones(*features.size()[:-1], 1)
        return torch.cat([features, ones], dim=features.dim() - 1)

    def forward(self, input1: torch.Tensor, input2: torch.Tensor) -> torch.Tensor:
        """Apply the bilinear layer to both inputs after appending the ones feature."""
        return self.bilinear(self._append_ones(input1), self._append_ones(input2))
@torch.no_grad()
def mask_arc(lengths: torch.Tensor, mask_diag: bool = True) -> Optional[torch.Tensor]:
    """Build a 0/1 mask of valid (token, head candidate) pairs.

    The mask has the shape ``(b, n, n + 1)`` where ``b`` is the number of sequences and ``n``
    is the largest length. For a sequence of length ``l`` the cells ``[:l, :l + 1]`` are
    valid. The head axis has one extra position because the caller in this file prepends an
    additional embedding at index 0 of the head candidates, so head candidate ``j + 1``
    corresponds to token ``j``.

    Args:
        lengths: 1-D tensor with the number of tokens in every sequence
        mask_diag: whether to additionally forbid a token to be its own head,
            i.e. to zero the cells ``[i, i + 1]``

    Returns:
        float tensor of shape ``(b, n, n + 1)`` with ones on valid cells, or None when all
        sequences have the same length and ``mask_diag`` is False (nothing has to be masked)
    """
    b, n = lengths.numel(), int(lengths.max())
    if torch.all(lengths == n):
        if not mask_diag:
            return None
        mask = torch.ones(b, n, n + 1)
    else:
        mask = torch.zeros(b, n, n + 1)
        for i, length in enumerate(lengths):
            mask[i, :length, :length + 1] = 1
    if mask_diag:
        # A square `torch.eye(n)` mask cannot be broadcast to (b, n, n + 1); besides, because
        # of the extra position 0 on the head axis, the self-arc of token i is the cell
        # [i, i + 1] and not [i, i].
        token_ids = torch.arange(n)
        mask[:, token_ids, token_ids + 1] = 0
    return mask
def __init__(self,
             n_deps: int,
             pretrained_bert: str,
             encoder_layer_ids: List[int] = (-1,),
             bert_config_file: Optional[str] = None,
             attention_probs_keep_prob: Optional[float] = None,
             hidden_keep_prob: Optional[float] = None,
             state_size: int = 256,
             device: str = "gpu"):
    """Create the Transformer encoder, the dense layers and the two biaffine layers.

    Args:
        n_deps: number of syntax dependencies (output size of the dependency biaffine layer)
        pretrained_bert: pretrained checkpoint path or key title; when empty,
            ``bert_config_file`` is used instead
        encoder_layer_ids: indexes of encoder layers whose hidden states are used
        bert_config_file: path to a configuration file used when ``pretrained_bert`` is empty
        attention_probs_keep_prob: keep_prob for self-attention layers
            (applied only when the encoder is built from ``bert_config_file``)
        hidden_keep_prob: keep_prob for hidden layers
            (applied only when the encoder is built from ``bert_config_file``)
        state_size: output size of the dense layers that follow the encoder
        device: "gpu" selects CUDA when it is available, anything else selects CPU

    Raises:
        ConfigError: if neither ``pretrained_bert`` nor an existing ``bert_config_file`` is given
    """
    super().__init__()
    self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu")
    self.n_deps = n_deps
    self.encoder_layer_ids = encoder_layer_ids
    self.state_size = state_size

    # Both branches bind the configuration to `config`, because the layers created below
    # read `config.hidden_size` and `config.hidden_dropout_prob`.
    if pretrained_bert:
        logger.debug(f"From pretrained {pretrained_bert}.")
        config = AutoConfig.from_pretrained(pretrained_bert, output_attentions=False,
                                            output_hidden_states=False)
        self.encoder = AutoModel.from_pretrained(pretrained_bert, config=config)
    elif bert_config_file and Path(bert_config_file).is_file():
        # AutoConfig is loaded through `from_pretrained`, which accepts a path to a JSON file.
        config = AutoConfig.from_pretrained(str(expand_path(bert_config_file)))
        if attention_probs_keep_prob is not None:
            config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            config.hidden_dropout_prob = 1.0 - hidden_keep_prob
        # Auto classes cannot be instantiated directly, `from_config` has to be used.
        self.encoder = AutoModel.from_config(config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")

    # Two pairs of projections: the first pair feeds the head biaffine layer,
    # the second pair feeds the dependency biaffine layer.
    self.head_embs1 = torch.nn.Linear(config.hidden_size, state_size)
    self.dep_embs1 = torch.nn.Linear(config.hidden_size, state_size)
    self.head_embs2 = torch.nn.Linear(config.hidden_size, state_size)
    self.dep_embs2 = torch.nn.Linear(config.hidden_size, state_size)

    # Trainable vectors that `forward` prepends to the `dep` projections.
    self.zero_emb1 = torch.nn.Parameter(torch.randn(state_size, ), requires_grad=True)
    self.zero_emb2 = torch.nn.Parameter(torch.randn(state_size, ), requires_grad=True)

    self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    self.biaf_head = Biaffine(state_size, state_size, 1)
    self.biaf_dep = Biaffine(state_size, state_size, n_deps)
subtoken_mask = torch.from_numpy(subtoken_mask) outputs = self.encoder(input_ids, attention_mask, output_hidden_states=True) hidden_states = outputs.hidden_states layer_output_list = [] for layer_id in self.encoder_layer_ids: layer_id = layer_id + 1 if layer_id != -1 else layer_id layer_output_list.append(hidden_states[layer_id]) layer_output = torch.stack(layer_output_list) layer_output = torch.sum(layer_output, dim=0) layer_output = token_from_subtoken(layer_output, subtoken_mask) bs, seq_len, dim = layer_output.size() layer_output = layer_output.float().to(self.device) lengths = torch.sum(subtoken_mask, dim=-1) head1 = self.head_embs1(layer_output) dep1 = self.dep_embs1(layer_output) dep1_zero = [self.zero_emb1 for _ in range(bs)] dep1_zero = torch.stack(dep1_zero).unsqueeze(1).to(self.device) dep1 = torch.cat([dep1_zero, dep1], dim=1) head2 = self.head_embs2(layer_output) dep2 = self.dep_embs2(layer_output) dep2_zero = [self.zero_emb2 for _ in range(bs)] dep2_zero = torch.stack(dep2_zero).unsqueeze(1).to(self.device) dep2 = torch.cat([dep2_zero, dep2], dim=1) head1 = self.dropout(head1) dep1 = self.dropout(dep1) head2 = self.dropout(head2) dep2 = self.dropout(dep2) logits_head_init = self.biaf_head(head1, dep1).squeeze_(3) logits_deprel = self.biaf_dep(head2, dep2) mask = mask_arc(lengths, mask_diag=False) if mask is not None: logits_head_init.masked_fill_(mask.logical_not().to(logits_head_init.device), -10.0) logits_head = F.softmax(logits_head_init, dim=-1) head_loss, dep_loss = None, None if y_heads is not None: y_heads = tuple(torch.LongTensor(yh).to(self.device) for yh in y_heads) y_heads_pd = nn.utils.rnn.pad_sequence(y_heads, batch_first=True, padding_value=-1) logits_head_flatten = logits_head.contiguous().view(-1, logits_head.size(-1)) y_heads_flatten = y_heads_pd.contiguous().view(-1) head_loss = F.cross_entropy(logits_head_flatten, y_heads_flatten, ignore_index=-1, reduction="sum") head_loss.div_((y_heads_flatten != -1).sum()) y_dep = 
tuple(torch.LongTensor(ydp).to(self.device) for ydp in y_dep) y_dep_pd = nn.utils.rnn.pad_sequence(y_dep, batch_first=True, padding_value=-1) y_heads_new = y_heads_pd.masked_fill(y_heads_pd == -1, 0) gather_index = y_heads_new.view(*y_heads_new.size(), 1, 1).expand(-1, -1, -1, logits_deprel.size(-1)) logits_deprel = torch.gather(logits_deprel, dim=2, index=gather_index) logits_deprel_flatten = logits_deprel.contiguous().view(-1, logits_deprel.size(-1)) y_dep_flatten = y_dep_pd.contiguous().view(-1) dep_loss = F.cross_entropy(logits_deprel_flatten, y_dep_flatten, ignore_index=-1, reduction="sum") dep_loss.div_((y_dep_flatten != -1).sum()) else: logits_head = logits_head.detach().cpu().numpy() head_ids = np.argmax(logits_head, axis=-1).tolist() head_ids_new = torch.LongTensor(head_ids) steps = torch.arange(head_ids_new.size(1)) logits_deprel = [logits_deprel[i, steps, heads] for i, heads in enumerate(head_ids_new)] logits_deprel = torch.stack(logits_deprel, dim=0) deprels = logits_deprel.argmax(dim=2).detach().cpu().numpy().tolist() head_probas = [head_probas_list[:l, :l + 1] for l, head_probas_list in zip(lengths, logits_head)] deprels = [deprel[:l] for l, deprel in zip(lengths, deprels)] if y_heads is not None: return head_loss + dep_loss else: return head_probas, deprels @register('torch_transformers_syntax_parser') class TorchTransformersSyntaxParser(TorchModel): """Transformer-based model on PyTorch for syntax parsing. It predicts probabilities of heads and dependency ids for text tokens. Args: pretrained_bert: pretrained Bert checkpoint path or key title (e.g. 
def train_on_batch(self,
                   input_ids: Union[List[List[int]], np.ndarray],
                   input_masks: Union[List[List[int]], np.ndarray],
                   y_masks: Union[List[List[int]], np.ndarray],
                   y_heads: List[List[int]],
                   y_dep: List[List[int]]) -> Dict:
    """Run one optimization step on a batch.

    Args:
        input_ids: indices of the subwords
        input_masks: mask that determines where to attend and where not to
        y_masks: mask which determines the first subword units in the word
        y_heads: for each token - id of the token which is its head in the syntax tree
        y_dep: syntax dependencies for each token

    Returns:
        dict with the key 'loss' holding the loss value of the batch
    """
    optimizer = self.optimizer
    optimizer.zero_grad()

    # With gold heads and dependencies supplied, the model call returns the loss tensor.
    batch_loss = self.model(input_ids, input_masks, y_masks, y_heads, y_dep)
    batch_loss.backward()

    # Gradient norm clipping helps to prevent the "exploding gradients" problem;
    # it is skipped when `clip_norm` is falsy.
    max_norm = self.clip_norm
    if max_norm:
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)

    optimizer.step()
    return {'loss': batch_loss.item()}
def hash_(token: str, hash_size: int) -> int:
    """Map a token to a bucket index below ``hash_size``.

    Args:
        token: a word
        hash_size: hash size (number of buckets)

    Returns:
        int, the positive 32-bit murmurhash3 value of the token taken modulo ``hash_size``
    """
    unsigned_hash = murmurhash3_32(token, positive=True)
    return unsigned_hash % hash_size
def __call__(self, questions: List[str]) -> Sparse:
    """Transform input list of documents to tfidf vectors.

    Args:
        questions: a list of input strings

    Returns:
        transformed documents as a csr_matrix with shape [n_documents X :attr:`hash_size`];
        an empty input list gives a matrix with zero rows
    """
    if len(questions) == 0:
        # Stacking an empty list of blocks raises, so the empty batch is answered directly.
        return Sparse((0, self.hash_size), dtype="float32")

    # The number of indexed documents does not change inside the loop.
    size = len(self.doc_index)

    sp_tfidfs = []
    for question in questions:
        ngrams = list(self.tokenizer([question]))
        hashes = [hash_(ngram, self.hash_size) for ngram in ngrams[0]]

        hashes_unique, q_hashes = np.unique(hashes, return_counts=True)

        if len(q_hashes) == 0:
            # No ngrams were produced for the question: it becomes an all-zero row.
            sp_tfidfs.append(Sparse((1, self.hash_size)))
            continue

        tfs = np.log1p(q_hashes)

        # Number of documents containing each hashed term of the question.
        Ns = self.term_freqs[hashes_unique]
        idfs = np.log((size - Ns + 0.5) / (Ns + 0.5))
        # Negative idf values (terms found in more than half of the documents) are clipped.
        idfs[idfs < 0] = 0

        tfidf = np.multiply(tfs, idfs).astype("float32")

        # A single CSR row: the column indices are the unique hashes.
        indptr = np.array([0, len(hashes_unique)])
        sp_tfidf = Sparse((tfidf, hashes_unique, indptr), shape=(1, self.hash_size))
        sp_tfidfs.append(sp_tfidf)

    transformed = sp.sparse.vstack(sp_tfidfs)
    return transformed
@staticmethod
def get_tfidf_matrix(count_matrix: Sparse) -> Tuple[Sparse, np.array]:
    """Turn a term-by-document count matrix into a tfidf matrix.

    Args:
        count_matrix: a count matrix

    Returns:
        a tuple of tfidf matrix and term frequences (for every row, the number of
        columns with a positive count)
    """
    n_docs = count_matrix.shape[1]

    # In how many documents every term occurs at least once.
    presence = (count_matrix > 0).astype(int)
    term_freqs = np.array(presence.sum(1)).squeeze()

    idf_values = np.log((n_docs - term_freqs + 0.5) / (term_freqs + 0.5))
    idf_values[idf_values < 0] = 0

    # A diagonal idf matrix multiplied from the left scales every term row of log(1 + count).
    idf_diagonal = sp.sparse.diags(idf_values, 0)
    log_counts = count_matrix.log1p()
    return idf_diagonal.dot(log_counts), term_freqs
def load(self) -> Tuple[Sparse, Dict]:
    """Load a tfidf matrix as csr_matrix.

    Returns:
        a tuple of tfidf matrix and the options dict stored under the 'opts' key
        of the archive.

    Raises:
        FileNotFoundError if :attr:`load_path` doesn't exist.

    Todo:
        * implement loading from URL
    """
    if not self.load_path.exists():
        raise FileNotFoundError("HashingTfIdfVectorizer path doesn't exist!")

    logger.debug("Loading tfidf matrix from {}".format(self.load_path))
    # NOTE(security): 'opts' is stored as a pickled object array, therefore allow_pickle=True
    # is required here; unpickling can execute arbitrary code, so only trusted files
    # must be loaded.
    # The archive object keeps the file open until it is closed, hence the context manager.
    with np.load(self.load_path, allow_pickle=True) as loader:
        matrix = Sparse((loader['data'], loader['indices'],
                         loader['indptr']), shape=loader['shape'])
        opts = loader['opts'].item(0)
    return matrix, opts
Args: docs: a list of input documents doc_ids: a list of document ids corresponding to input documents doc_nums: a list of document integer ids as they appear in a database Returns: None """ self.doc_index = {} self.rows = [] self.cols = [] self.data = [] return self.partial_fit(docs, doc_ids, doc_nums) ================================================ FILE: deeppavlov/paramsearch.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import sys from copy import deepcopy from itertools import product from logging import getLogger from pathlib import Path import numpy as np from sklearn.model_selection import train_test_split from deeppavlov.core.commands.train import train_evaluate_model_from_config, get_iterator_from_config, \ read_data_by_config from deeppavlov.core.commands.utils import parse_config from deeppavlov.core.common.cross_validation import calc_cv_score from deeppavlov.core.common.file import save_json, find_config, read_json from deeppavlov.core.common.params_search import ParamsSearch p = (Path(__file__) / ".." 
def get_best_params(combinations, scores, param_names, target_metric):
    """Pick the parameter combination with the highest score.

    Args:
        combinations: sequence of parameter value combinations
        scores: score of every combination, in the same order
        param_names: names of the parameters, in the order of values inside a combination
        target_metric: key under which the best score is stored in the result

    Returns:
        dict mapping every parameter name to its value in the best combination,
        plus ``target_metric`` mapped to the best score
    """
    # np.argmax returns the first index when several scores share the maximum.
    winner = np.argmax(scores)
    best_params = {name: value for name, value in zip(param_names, combinations[winner])}
    best_params[target_metric] = scores[winner]
    return best_params
is_loo: # CV for model evaluation score_dict = calc_cv_score(config, data=data, n_folds=n_folds, is_loo=is_loo) score = score_dict[next(iter(score_dict))] else: # train/valid for model evaluation data_to_evaluate = data.copy() if len(data_to_evaluate['valid']) == 0: data_to_evaluate['train'], data_to_evaluate['valid'] = train_test_split(data_to_evaluate['train'], test_size=0.2) iterator = get_iterator_from_config(config, data_to_evaluate) score = train_evaluate_model_from_config(config, iterator=iterator)['valid'][target_metric] scores.append(score) # get model with best score best_params_dict = get_best_params(combinations, scores, param_names, target_metric) log.info('Best model params: {}'.format(best_params_dict)) else: raise NotImplementedError('Not implemented this type of search') # save config best_config = config_init for i, param_name in enumerate(best_params_dict.keys()): if param_name != target_metric: params_helper.insert_value_or_dict_into_config(best_config, param_paths[i], best_params_dict[param_name]) best_model_filename = pipeline_config_path.with_suffix('.cvbest.json') save_json(best_config, best_model_filename) log.info('Best model saved in json-file: {}'.format(best_model_filename)) # try to run: # --config_path path_to_config.json --folds 2 if __name__ == "__main__": main() ================================================ FILE: deeppavlov/requirements/datasets.txt ================================================ datasets>=1.16.0,<2.5.0;python_version<="3.10" datasets==2.2.*;python_version=="3.11.*" ================================================ FILE: deeppavlov/requirements/dependency_decoding.txt ================================================ ufal.chu-liu-edmonds ================================================ FILE: deeppavlov/requirements/en_core_web_sm.txt ================================================ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl spacy 
================================================ FILE: deeppavlov/requirements/faiss.txt ================================================ faiss-cpu==1.7.2;python_version<="3.10" faiss-cpu==1.7.4;python_version=="3.11.*" ================================================ FILE: deeppavlov/requirements/fasttext.txt ================================================ fasttext==0.9.* ================================================ FILE: deeppavlov/requirements/hdt.txt ================================================ hdt==2.3 ================================================ FILE: deeppavlov/requirements/kenlm.txt ================================================ pypi-kenlm==0.1.20220713;python_version<="3.10" kenlm==0.2.*;python_version=="3.11.*" ================================================ FILE: deeppavlov/requirements/lxml.txt ================================================ lxml==4.9.* ================================================ FILE: deeppavlov/requirements/opt_einsum.txt ================================================ opt-einsum==3.3.* ================================================ FILE: deeppavlov/requirements/protobuf.txt ================================================ protobuf<=3.20 ================================================ FILE: deeppavlov/requirements/pytorch.txt ================================================ torch>=1.6.0,<1.14.0 ================================================ FILE: deeppavlov/requirements/rapidfuzz.txt ================================================ rapidfuzz==2.1.* ================================================ FILE: deeppavlov/requirements/razdel.txt ================================================ razdel==0.5.0 ================================================ FILE: deeppavlov/requirements/ru_core_news_sm.txt ================================================ https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl spacy 
================================================ FILE: deeppavlov/requirements/sacremoses.txt ================================================ sacremoses==0.0.53 ================================================ FILE: deeppavlov/requirements/sentencepiece.txt ================================================ sentencepiece==0.2.0 ================================================ FILE: deeppavlov/requirements/slovnet.txt ================================================ slovnet==0.5.* navec ================================================ FILE: deeppavlov/requirements/sortedcontainers.txt ================================================ sortedcontainers==2.4.* ================================================ FILE: deeppavlov/requirements/torchcrf.txt ================================================ pytorch-crf==0.7.* ================================================ FILE: deeppavlov/requirements/transformers.txt ================================================ transformers>=4.13.0,<4.25.0;python_version<"3.8" transformers==4.30.0;python_version>="3.8" ================================================ FILE: deeppavlov/requirements/udapi.txt ================================================ udapi==0.3.* ================================================ FILE: deeppavlov/requirements/whapi.txt ================================================ bs4 whapi==0.6.* ================================================ FILE: deeppavlov/settings.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def main():
    """Run the DeepPavlov settings console utility.

    Without ``--default`` the current settings path is printed. With ``--default`` the
    settings directory is repopulated with default files and the outcome is reported.
    """
    cli_args = parser.parse_args()
    settings_path = get_settings_path()

    if not cli_args.default:
        print(f'Current DeepPavlov settings path: {settings_path}')
        return

    if populate_settings_dir(force=True):
        message = f'Populated {settings_path} with default settings files'
    else:
        message = f'{settings_path} is already a default settings directory'
    print(message)
import argparse from collections import defaultdict from logging import getLogger import numpy as np from tqdm import tqdm from deeppavlov import build_model from deeppavlov.core.commands.train import read_data_by_config, get_iterator_from_config from deeppavlov.core.commands.utils import parse_config, expand_path from deeppavlov.core.common.file import save_jsonl log = getLogger(__name__) parser = argparse.ArgumentParser() parser.add_argument('config_path', help='path to a pipeline json config', type=str) parser.add_argument('benchmark_name', help='benchmark name to be submitted', choices=['glue', 'superglue', 'russian_superglue']) parser.add_argument('-o', '--output-file', default=None, help='path to save output', type=str) parser.add_argument('-d', '--download', action='store_true', help='download model components') GLUE_TASKS = { 'cola': 'CoLA', 'mnli-m': 'MNLI-m', 'mnli-mm': 'MNLI-mm', 'mrpc': 'MRPC', 'qnli': 'QNLI', 'qqp': 'QQP', 'rte': 'RTE', 'sst2': 'SST-2', 'stsb': 'STS-B', 'wnli': 'WNLI' } SUPER_GLUE_TASKS = { 'copa': 'COPA', 'multirc': 'MultiRC', 'boolq': 'BoolQ', 'record': 'ReCoRD', 'wic': 'WiC' } RSG_TASKS = { 'lidirus': 'LiDiRus', 'rcb': 'RCB', 'parus': 'PARus', 'muserc': 'MuSeRC', 'terra': 'TERRa', 'russe': 'RUSSE', 'rwsd': 'RWSD', 'danetqa': 'DaNetQA', 'rucos': 'RuCoS' } def split_config(config_path, download): """Gets model, data iterator and a task name from the configuration file. Args: config_path: Path to the model configuration file. download: If True, the model will be downloaded from the DeepPavlov server. 
def get_predictions(model, data_gen, replace_word=None, round_res=False):
    """Collect model predictions in the column layout used for GLUE submissions.

    Args:
        model: The model itself; its ``compute`` method is called on every batch.
        data_gen: Iterator over ``(x, y)`` batches to be submitted.
        replace_word: Model output that is mapped to 1, every other output is mapped to 0.
            If it is falsy (e.g. None), model outputs are left as they are.
        round_res: If True, model outputs are rounded to 3 digits (used in stsb).

    Returns:
        dict with two aligned lists: ``'index'`` and ``'prediction'``.
    """
    row_indices = []
    row_predictions = []
    for row_idx, (features, _) in enumerate(tqdm(data_gen)):
        answer = model.compute(features)[0]
        if replace_word:
            answer = 1 if answer == replace_word else 0
        if round_res:
            answer = round(answer, 3)
        row_indices.append(row_idx)
        row_predictions.append(answer)
    # Key order matters: callers use the keys as the header row of the submission file.
    return {'index': row_indices, 'prediction': row_predictions}
def multi_sentence_comprehension_prediction(model, data_gen):
    """Common part for MultiRC and MuSeRC tasks that gets their predictions in needed format.

    Every sample carries one (paragraph, question, answer) triple. Predictions are grouped
    into one record per paragraph, each holding its questions, each holding its answers.
    Every answer is recorded exactly once.

    Args:
        model: The model itself; called as ``model([contexts], [answers], indices)``.
        data_gen: Iterator with data to be submitted.

    Returns:
        list of per-paragraph dicts of the form
        ``{'idx': ..., 'passage': {'questions': [{'idx': ..., 'answers': [{'idx': ..., 'label': ...}]}]}}``.
    """
    output = {}
    for x, _ in tqdm(data_gen):
        contexts, answers, indices = x[0]
        prediction = model([contexts], [answers], indices)

        paragraph_idx = indices['paragraph']
        question_idx = indices['question']
        answer = {'idx': indices['answer'], 'label': int(prediction[0] == 'True')}

        # Create the paragraph record empty, so the first answer is added by the same
        # code path as all the others (previously it was added twice).
        paragraph = output.setdefault(paragraph_idx, {'idx': paragraph_idx, 'passage': {'questions': []}})
        questions = paragraph['passage']['questions']

        question = next((el for el in questions if el['idx'] == question_idx), None)
        if question is None:
            questions.append({'idx': question_idx, 'answers': [answer]})
        else:
            question['answers'].append(answer)

    submission = list(output.values())
    return submission
""" model, data_gen, task_name = split_config(config_path, download) submission = [] if task_name == 'record': submission = commonsense_reasoning_prediction(model, data_gen) elif task_name == 'copa': for idx, (x, _) in enumerate(tqdm(data_gen)): prediction = model.compute(x)[0] label = int(prediction == 'choice2') submission.append({'idx': idx, 'label': label}) elif task_name == 'multirc': submission = multi_sentence_comprehension_prediction(model, data_gen) elif task_name in SUPER_GLUE_TASKS: for idx, (x, _) in enumerate(tqdm(data_gen)): prediction = model.compute(x) while isinstance(prediction, list): prediction = prediction[0] submission.append({'idx': idx, 'label': prediction}) else: raise ValueError(f'Unexpected SuperGLUE task name: {task_name}') save_path = output_path if output_path is not None else f'{SUPER_GLUE_TASKS[task_name]}.jsonl' save_path = expand_path(save_path) save_path.parent.mkdir(parents=True, exist_ok=True) save_jsonl(submission, save_path) log.info(f'Prediction saved to {save_path}') def submit_rsg(config_path, output_path, download): """Creates submission file for the Russian SuperGLUE tasks. Args: config_path: Path to the model configuration file. output_path: Path to output file. If None, file name is selected according corresponding task name. download: If True, the model will be downloaded from the DeepPavlov server. 
def main():
    """Parse command line arguments and build a submission file for the chosen benchmark."""
    args = parser.parse_args()
    submitters = {
        'glue': submit_glue,
        'superglue': submit_superglue,
        'russian_superglue': submit_rsg,
    }
    submit = submitters.get(args.benchmark_name)
    # An unknown benchmark name is silently ignored.
    if submit is not None:
        submit(args.config_path, args.output_file, args.download)
class DialogLogger:
    """DeepPavlov dialog logging facility.

    DialogLogger is an entity which provides tools for dialogs logging.

    Args:
        enabled: DialogLogger on/off flag.
        logger_name: Dialog logger name that is used for organising log files.

    Attributes:
        config: Logger settings read from the dialog logger config file.
        enabled: True if logging is switched on by the argument or by the config.
        logger_name: Dialog logger name which is used for organising log files.
        log_max_size: Maximum size of log file, kb.
        log_file: Current log file object.

    Note:
        ``logger_name``, ``log_max_size`` and ``log_file`` are set only when the logger is enabled.
    """

    def __init__(self, enabled: bool = False, logger_name: Optional[str] = None) -> None:
        self.config: dict = read_json(get_settings_path() / LOGGER_CONFIG_FILENAME)
        self.enabled: bool = enabled or self.config['enabled']

        if self.enabled:
            self.logger_name: str = logger_name or self.config['logger_name']
            self.log_max_size: int = self.config['logfile_max_size_kb']
            self.log_file = self._get_log_file()
            self.log_file.write('"Dialog logger initiated"\n')

    @staticmethod
    def _get_timestamp_utc_str() -> str:
        """Returns str converted current UTC timestamp.

        Returns:
            utc_timestamp_str: str converted current UTC timestamp.
        """
        utc_timestamp_str = datetime.strftime(datetime.utcnow(), LOG_TIMESTAMP_FORMAT)
        return utc_timestamp_str

    def _get_log_file(self):
        """Returns opened file object for writing dialog logs.

        The file is created inside ``<log_path>/<logger_name>`` and its name starts with
        the current UTC timestamp.

        Returns:
            log_file: opened Python file object.
        """
        log_dir: Path = Path(self.config['log_path']).expanduser().resolve() / self.logger_name
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file_path = Path(log_dir, f'{self._get_timestamp_utc_str()}_{self.logger_name}.log')
        log_file = open(log_file_path, 'a', buffering=1, encoding='utf8')
        return log_file

    def _log(self, utterance: Any, direction: str, dialog_id: Optional[Hashable] = None):
        """Logs single dialog utterance to current dialog log file.

        When the current log file has reached the size limit, a new log file is started
        and the utterance is written to it.

        Args:
            utterance: Dialog utterance.
            direction: 'in' or 'out' utterance direction.
            dialog_id: Dialog ID.
        """
        if isinstance(utterance, str):
            pass
        elif isinstance(utterance, (list, dict)):
            utterance = jsonify_data(utterance)
        else:
            utterance = str(utterance)

        dialog_id = str(dialog_id) if not isinstance(dialog_id, str) else dialog_id

        # Rotate first, then always write: the utterance that triggers rotation must not be lost.
        if self.log_file.tell() >= self.log_max_size * 1024:
            self.log_file.close()
            self.log_file = self._get_log_file()

        try:
            log_msg = {}
            log_msg['timestamp'] = self._get_timestamp_utc_str()
            log_msg['dialog_id'] = dialog_id
            log_msg['direction'] = direction
            log_msg['message'] = utterance
            log_str = json.dumps(log_msg, ensure_ascii=self.config['ensure_ascii'])
            self.log_file.write(f'{log_str}\n')
        except IOError:
            log.error('Failed to write dialog log.')

    def log_in(self, utterance: Any, dialog_id: Optional[Hashable] = None) -> None:
        """Wraps _log method for all input utterances.

        Args:
            utterance: Dialog utterance.
            dialog_id: Dialog ID.
        """
        if self.enabled:
            self._log(utterance, 'in', dialog_id)

    def log_out(self, utterance: Any, dialog_id: Optional[Hashable] = None) -> None:
        """Wraps _log method for all output utterances.

        Args:
            utterance: Dialog utterance.
            dialog_id: Dialog ID.
        """
        if self.enabled:
            self._log(utterance, 'out', dialog_id)
def install_from_config(config: [str, Path, dict]):
    """Install every requirement listed in the requirements files of a config.

    Requirement lines are stripped of all whitespace; empty lines, comment lines and
    duplicates are skipped. Requirements are installed one by one, in the order found.

    Args:
        config: pipeline config or a path to it.
    """
    requirements_files = get_config_requirements(config)
    if not requirements_files:
        log.warning('No requirements found in config')
        return

    requirements = []
    for requirements_file in requirements_files:
        with expand_path(requirements_file).open(encoding='utf8') as fin:
            for raw_line in fin:
                requirement = re.sub(r'\s', '', raw_line.strip())
                if not requirement or requirement.startswith('#') or requirement in requirements:
                    continue
                requirements.append(requirement)

    for requirement in requirements:
        install(requirement)
class PrometheusMiddleware(BaseHTTPMiddleware):
    """Middleware that updates Prometheus metrics for every request to a non-ignored path.

    Args:
        app: ASGI application to wrap.
        ignore_paths: URL paths that are passed through without updating any metric.
    """

    def __init__(self, app: ASGIApp, ignore_paths: Tuple = ()) -> None:
        super().__init__(app)
        self.ignore_paths = ignore_paths

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Process the request and update the in-progress gauge, request counter and latency histogram.

        Latency is observed only for responses with status code 200. A request whose
        handler raises is counted with status code 500.
        """
        path = request.url.path
        if path in self.ignore_paths:
            return await call_next(request)

        REQUESTS_IN_PROGRESS.labels(endpoint=path).inc()
        started_at = time.perf_counter()
        # Stays 500 if the downstream handler raises before a response is produced.
        code = 500
        try:
            response = await call_next(request)
            code = response.status_code
        finally:
            succeeded = code == 200
            if succeeded:
                elapsed = time.perf_counter() - started_at
                REQUESTS_LATENCY.labels(endpoint=path).observe(elapsed)
            REQUESTS_COUNT.labels(endpoint=path, status_code=code).inc()
            REQUESTS_IN_PROGRESS.labels(endpoint=path).dec()
        return response
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import asyncio import os from collections import namedtuple from logging import getLogger from pathlib import Path from ssl import PROTOCOL_TLSv1_2 from typing import Dict, List, Optional, Union import uvicorn from fastapi import Body, FastAPI, HTTPException from fastapi.utils import generate_operation_id_for_path from pydantic import BaseConfig, BaseModel from pydantic.fields import Field, ModelField from pydantic.main import ModelMetaclass from starlette.middleware.cors import CORSMiddleware from starlette.responses import RedirectResponse from deeppavlov.core.commands.infer import build_model from deeppavlov.core.commands.utils import parse_config from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.file import read_json from deeppavlov.core.common.log import log_config from deeppavlov.core.common.paths import get_settings_path from deeppavlov.core.data.utils import check_nested_dict_keys, jsonify_data from deeppavlov.utils.connector import DialogLogger from deeppavlov.utils.server.metrics import metrics, PrometheusMiddleware SERVER_CONFIG_PATH = get_settings_path() / 'server_config.json' SSLConfig = namedtuple('SSLConfig', ['version', 'keyfile', 'certfile']) log = getLogger(__name__) dialog_logger = DialogLogger(logger_name='rest_api') COMPATIBILITY_MODE = os.getenv('COMPATIBILITY_MODE', False) if COMPATIBILITY_MODE is not False: log.warning('DeepPavlov riseapi mode will use the old model response data format used up and including 1.0.0rc1.\n' 'COMPATIBILITY_MODE will be removed in the DeepPavlov 1.2.0.\n' 'Please, update your client 
def get_server_params(model_config: Union[str, Path]) -> Dict:
    """Build server parameters for a model from the server config and the model config.

    Common defaults are taken first. Truthy values from the model-specific defaults
    (selected by ``metadata.server_utils`` of the model config) then override them.

    Args:
        model_config: path to the model config.

    Returns:
        dict of server parameters; ``model_endpoint`` defaults to ``'/model'`` and
        ``model_args_names`` is always a list (falls back to the chainer inputs).
    """
    server_config = read_json(SERVER_CONFIG_PATH)
    model_config = parse_config(model_config)
    server_params = server_config['common_defaults']

    if check_nested_dict_keys(model_config, ['metadata', 'server_utils']):
        model_tag = model_config['metadata']['server_utils']
        if check_nested_dict_keys(server_config, ['model_defaults', model_tag]):
            overrides = server_config['model_defaults'][model_tag]
            # Falsy model defaults do not override the common ones.
            server_params.update({name: value for name, value in overrides.items() if value})

    server_params.setdefault('model_endpoint', '/model')

    args_names = server_params['model_args_names'] or model_config['chainer']['in']
    server_params['model_args_names'] = [args_names] if isinstance(args_names, str) else args_names
    return server_params
def interact(model: Chainer, payload: Dict[str, Optional[List]]) -> List:
    """Validate a request payload, run the model on it and return a JSON-compatible result.

    Args:
        model: model to infer.
        payload: mapping from model argument names to batches; a batch may be None.

    Returns:
        jsonified model prediction.

    Raises:
        HTTPException: with status code 400 if all arguments are None, some argument is
            an empty array or arguments have different batch sizes.
    """
    dialog_logger.log_in(payload)
    raw_args = payload.values()

    batch_sizes = {len(raw_arg) for raw_arg in raw_args if raw_arg is not None}
    if not batch_sizes:
        error_msg = 'got empty request'
    elif 0 in batch_sizes:
        error_msg = 'got empty array as model argument'
    elif len(batch_sizes) > 1:
        error_msg = 'got several different batch sizes'
    else:
        error_msg = None

    if error_msg is not None:
        log.error(error_msg)
        raise HTTPException(status_code=400, detail=error_msg)

    batch_size = next(iter(batch_sizes))
    # Missing arguments are replaced with a batch of None values of the common size.
    model_args = [raw_arg or [None] * batch_size for raw_arg in raw_args]
    prediction = model(*model_args)

    # TODO: remove in 1.2.0
    if COMPATIBILITY_MODE is not False:
        if len(model.out_params) == 1:
            prediction = [prediction]
        prediction = list(zip(*prediction))

    result = jsonify_data(prediction)
    dialog_logger.log_out(result)
    return result
port = port or server_params['port'] model_endpoint = server_params['model_endpoint'] model_args_names = server_params['model_args_names'] ssl_config = get_ssl_params(server_params, https, ssl_key=ssl_key, ssl_cert=ssl_cert) model = build_model(model_config) def batch_decorator(cls: ModelMetaclass) -> ModelMetaclass: cls.__annotations__ = {arg_name: list for arg_name in model_args_names} cls.__fields__ = {arg_name: ModelField(name=arg_name, type_=list, class_validators=None, model_config=BaseConfig, required=False, field_info=Field(None)) for arg_name in model_args_names} return cls @batch_decorator class Batch(BaseModel): pass redirect_root_to_docs(app, 'answer', model_endpoint, 'post') model_endpoint_post_example = {arg_name: ['string'] for arg_name in model_args_names} @app.post(model_endpoint, summary='A model endpoint') async def answer(item: Batch = Body(..., example=model_endpoint_post_example)) -> List: loop = asyncio.get_event_loop() return await loop.run_in_executor(None, interact, model, item.dict()) @app.post('/probe', include_in_schema=False) async def probe(item: Batch) -> List[str]: loop = asyncio.get_event_loop() return await loop.run_in_executor(None, test_interact, model, item.dict()) @app.get('/api', summary='Model argument names') async def api() -> Dict[str, List[str]]: if COMPATIBILITY_MODE is not False: return model_args_names return { 'in': model.in_x, 'out': model.out_params } uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version, ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile, timeout_keep_alive=20) ================================================ FILE: deeppavlov/utils/settings/__init__.py ================================================ ================================================ FILE: deeppavlov/utils/settings/dialog_logger_config.json ================================================ { "enabled": false, "logger_name": "default", "log_path": "~/.deeppavlov/dialog_logs", 
"logfile_max_size_kb": 10240, "ensure_ascii": false } ================================================ FILE: deeppavlov/utils/settings/log_config.json ================================================ { "version": 1, "disable_existing_loggers": false, "loggers": { "deeppavlov": { "level": "INFO", "handlers": [ "stderr" ], "propagate": true }, "uvicorn.access": { "level": "INFO", "handlers": [ "uvicorn_handler" ], "propagate": true }, "uvicorn.error": { "level": "INFO", "handlers": [ "uvicorn_handler" ], "propagate": true }, "train_report": { "level": "INFO", "handlers": [ "train_handler" ], "propagate": true }, "filelock": { "level": "WARNING", "handlers": [ "stdout" ], "propagate": true } }, "formatters": { "default": { "format": "%(asctime)s.%(msecs)d %(levelname)s in '%(name)s'['%(module)s'] at line %(lineno)d: %(message)s", "datefmt": "%Y-%m-%d %H:%M:%S" }, "uvicorn_fmt": { "format": "%(asctime)s %(message)s", "datefmt": "%Y-%m-%d %H:%M:%S" }, "message": { "format": "%(message)s" } }, "handlers": { "file": { "class": "logging.FileHandler", "level": "DEBUG", "formatter": "default", "filename": "~/.deeppavlov/log.log" }, "stdout": { "class": "logging.StreamHandler", "level": "DEBUG", "formatter": "default", "stream": "ext://sys.stdout" }, "stderr": { "class": "logging.StreamHandler", "level": "DEBUG", "formatter": "default", "stream": "ext://sys.stderr" }, "uvicorn_handler": { "class": "logging.StreamHandler", "level": "INFO", "formatter": "uvicorn_fmt", "stream": "ext://sys.stdout", "filters": ["probeFilter"] }, "train_handler": { "class": "logging.StreamHandler", "level": "INFO", "formatter": "message", "stream": "ext://sys.stdout" } }, "filters": { "probeFilter": { "()": "deeppavlov.core.common.log.ProbeFilter" } } } ================================================ FILE: deeppavlov/utils/settings/server_config.json ================================================ { "common_defaults": { "host": "0.0.0.0", "port": 5000, "model_args_names": [], "https": false, 
"https_cert_path": "", "https_key_path": "", "socket_type": "TCP", "unix_socket_file": "/tmp/deeppavlov_socket.s", "socket_launch_message": "launching socket server at" } } ================================================ FILE: deeppavlov/utils/socket/__init__.py ================================================ from .socket import encode, start_socket_server ================================================ FILE: deeppavlov/utils/socket/socket.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import asyncio import json from logging import getLogger from pathlib import Path from struct import pack, unpack from typing import Any, List, Optional, Tuple, Union from deeppavlov.core.commands.infer import build_model from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.data.utils import jsonify_data from deeppavlov.utils.connector import DialogLogger from deeppavlov.utils.server import get_server_params HEADER_FORMAT = ' bytes: """Сonverts data to the socket server input formatted bytes array. Serializes ``data`` to the JSON formatted bytes array and adds 4 bytes to the beginning of the array - packed to bytes length of the JSON formatted bytes array. 
Header format is ">> from deeppavlov.utils.socket import encode >>> encode({'a':1}) b'\x08\x00\x00\x00{"a": 1} >>> encode([42]) b'\x04\x00\x00\x00[42]' """ json_data = jsonify_data(data) bytes_data = json.dumps(json_data).encode() response = pack(HEADER_FORMAT, len(bytes_data)) + bytes_data return response class SocketServer: """Creates socket server that sends the received data to the DeepPavlov model and returns model response. The server receives bytes array consists of the `header` and the `body`. The `header` is the first 4 bytes of the array - `body` length in bytes represented by a packed unsigned int (byte order is little-endian). `body` is dictionary serialized to JSON formatted bytes array that server sends to the model. The dictionary keys should match model arguments names, the values should be lists or tuples of inferenced values. Socket server request creation example: >>> from deeppavlov.utils.socket import encode >>> request = encode({"context":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]}) >>> request b'I\x00\x00\x00{"x": ["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]}' Socket server response, like the request, consists of the header and the body. Response body is dictionary {'status': status, 'payload': payload} serialized to a JSON formatted byte array, where: status (str): 'OK' if the model successfully processed the data, else - error message. payload: (Optional[List[Tuple]]): The model result if no error has occurred, otherwise None. """ _launch_msg: str _loop: asyncio.AbstractEventLoop _model: Chainer _model_args_names: List def __init__(self, model_config: Path, socket_type: str, port: Optional[int] = None, socket_file: Optional[Union[str, Path]] = None) -> None: """Initializes socket server. Args: model_config: Path to the config file. socket_type: Socket family. "TCP" for the AF_INET socket server, "UNIX" for UNIX Domain Socket server. port: Port number for the AF_INET address family. 
If parameter is not defined, the port number from the utils/settings/server_config.json is used. socket_file: Path to the file to which UNIX Domain Socket server connects. If parameter is not defined, the path from the utils/settings/server_config.json is used. Raises: ValueError: If ``socket_type`` parameter is neither "TCP" nor "UNIX". """ server_params = get_server_params(model_config) socket_type = socket_type or server_params['socket_type'] self._loop = asyncio.get_event_loop() if socket_type == 'TCP': host = server_params['host'] port = port or server_params['port'] self._launch_msg = f'{server_params["socket_launch_message"]} http://{host}:{port}' self._loop.create_task(asyncio.start_server(self._handle_client, host, port)) elif socket_type == 'UNIX': socket_file = socket_file or server_params['unix_socket_file'] socket_path = Path(socket_file).resolve() if socket_path.exists(): socket_path.unlink() self._launch_msg = f'{server_params["socket_launch_message"]} {socket_file}' self._loop.create_task(asyncio.start_unix_server(self._handle_client, socket_file)) else: raise ValueError(f'socket type "{socket_type}" is not supported') self._model = build_model(model_config) self._model_args_names = server_params['model_args_names'] def start(self) -> None: """Launches socket server""" log.info(self._launch_msg) try: self._loop.run_forever() except KeyboardInterrupt: pass except Exception as e: log.error(f'got exception {e} while running server') finally: self._loop.close() async def _handle_client(self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None: """Handles connection from a client. Validates requests, sends request body to DeepPavlov model, sends responses to client. 
""" addr = writer.get_extra_info('peername') log.info(f'handling connection from {addr}') while True: header = await reader.read(4) if not header: log.info(f'closing connection from {addr}') writer.close() break elif len(header) != 4: error_msg = f'header "{header}" length less than 4 bytes' log.error(error_msg) response = self._response(error_msg) else: data_len = unpack(HEADER_FORMAT, header)[0] request_body = await reader.read(data_len) try: data = json.loads(request_body) response = await self._interact(data) except ValueError: error_msg = f'request "{request_body}" type is not json' log.error(error_msg) response = self._response(error_msg) writer.write(response) await writer.drain() async def _interact(self, data: dict) -> bytes: dialog_logger.log_in(data) model_args = [] for param_name in self._model_args_names: param_value = data.get(param_name) if param_value is None or (isinstance(param_value, list) and len(param_value) > 0): model_args.append(param_value) else: error_msg = f"nonempty array expected but got '{param_name}'={repr(param_value)}" log.error(error_msg) return self._response(error_msg) lengths = {len(i) for i in model_args if i is not None} if not lengths: error_msg = 'got empty request' log.error(error_msg) return self._response(error_msg) elif len(lengths) > 1: error_msg = f'got several different batch sizes: {lengths}' log.error(error_msg) return self._response(error_msg) batch_size = list(lengths)[0] model_args = [arg or [None] * batch_size for arg in model_args] # in case when some parameters were not described in model_args model_args += [[None] * batch_size for _ in range(len(self._model.in_x) - len(model_args))] prediction = await self._loop.run_in_executor(None, self._model, *model_args) if len(self._model.out_params) == 1: prediction = [prediction] prediction = list(zip(*prediction)) dialog_logger.log_out(prediction) return self._response(payload=prediction) @staticmethod def _response(status: str = 'OK', payload: Optional[List[Tuple]] 
= None) -> bytes: """Puts arguments into dict and serialize it to JSON formatted byte array with header. Args: status: Response status. 'OK' if no error has occurred, otherwise error message. payload: DeepPavlov model result if no error has occurred, otherwise None. Returns: dict({'status': status, 'payload': payload}) serialized to a JSON formatted byte array starting with the 4-byte header - the length of serialized dict in bytes. """ return encode({'status': status, 'payload': payload}) def start_socket_server(model_config: Path, socket_type: str, port: Optional[int], socket_file: Optional[Union[str, Path]]) -> None: server = SocketServer(model_config, socket_type, port, socket_file) server.start() ================================================ FILE: deeppavlov/vocabs/__init__.py ================================================ ================================================ FILE: deeppavlov/vocabs/typos.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import shutil from collections import defaultdict from logging import getLogger from pathlib import Path import requests from lxml import html from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.file import load_pickle, save_pickle from deeppavlov.core.common.registry import register from deeppavlov.core.data.utils import is_done, mark_done log = getLogger(__name__) @register('static_dictionary') class StaticDictionary: """Trie vocabulary used in spelling correction algorithms Args: data_dir: path to the directory where the built trie will be stored. Relative paths are interpreted as relative to pipeline's data directory dictionary_name: logical name of the dictionary raw_dictionary_path: path to the source file with the list of words Attributes: dict_name: logical name of the dictionary alphabet: set of all the characters used in this dictionary words_set: set of all the words words_trie: trie structure of all the words """ def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs): data_dir = expand_path(data_dir) / dictionary_name alphabet_path = data_dir / 'alphabet.pkl' words_path = data_dir / 'words.pkl' words_trie_path = data_dir / 'words_trie.pkl' if not is_done(data_dir): log.debug('Trying to build a dictionary in {}'.format(data_dir)) if data_dir.is_dir(): shutil.rmtree(str(data_dir)) data_dir.mkdir(parents=True) words = self._get_source(data_dir, *args, **kwargs) words = {self._normalize(word) for word in words} alphabet = {c for w in words for c in w} alphabet.remove('⟬') alphabet.remove('⟭') save_pickle(alphabet, alphabet_path) save_pickle(words, words_path) words_trie = defaultdict(set) for word in words: for i in range(len(word)): words_trie[word[:i]].add(word[:i + 1]) words_trie[word] = set() words_trie = {k: sorted(v) for k, v in words_trie.items()} save_pickle(words_trie, words_trie_path) mark_done(data_dir) log.debug('built') else: log.debug('Loading a dictionary from 
{}'.format(data_dir)) self.alphabet = load_pickle(alphabet_path) self.words_set = load_pickle(words_path) self.words_trie = load_pickle(words_trie_path) @staticmethod def _get_source(data_dir, raw_dictionary_path, *args, **kwargs): raw_path = expand_path(raw_dictionary_path) with raw_path.open(newline='', encoding='utf8') as f: data = [line.strip().split('\t')[0] for line in f] return data @staticmethod def _normalize(word): return '⟬{}⟭'.format(word.strip().lower().replace('ё', 'е')) @register('russian_words_vocab') class RussianWordsVocab(StaticDictionary): """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data from https://github.com/danakt/russian-words/ Args: data_dir: path to the directory where the built trie will be stored. Relative paths are interpreted as relative to pipeline's data directory Attributes: dict_name: logical name of the dictionary alphabet: set of all the characters used in this dictionary words_set: set of all the words words_trie: trie structure of all the words """ def __init__(self, data_dir: [Path, str] = '', *args, **kwargs): kwargs['dictionary_name'] = 'russian_words_vocab' super().__init__(data_dir, *args, **kwargs) @staticmethod def _get_source(*args, **kwargs): log.debug('Downloading russian vocab from https://github.com/danakt/russian-words/') url = 'https://github.com/danakt/russian-words/raw/master/russian.txt' page = requests.get(url) return [word.strip() for word in page.content.decode('cp1251').strip().split('\n')] @register('wikitionary_100K_vocab') class Wiki100KDictionary(StaticDictionary): """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data from `Wikitionary `__ Args: data_dir: path to the directory where the built trie will be stored. 
Relative paths are interpreted as relative to pipeline's data directory Attributes: dict_name: logical name of the dictionary alphabet: set of all the characters used in this dictionary words_set: set of all the words words_trie: trie structure of all the words """ def __init__(self, data_dir: [Path, str] = '', *args, **kwargs): kwargs['dictionary_name'] = 'wikipedia_100K_vocab' super().__init__(data_dir, *args, **kwargs) @staticmethod def _get_source(*args, **kwargs): words = [] log.debug('Downloading english vocab from Wiktionary') for i in range(1, 100000, 10000): k = 10000 + i - 1 url = 'https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2005/08/{}-{}'.format(i, k) page = requests.get(url) tree = html.fromstring(page.content) words += tree.xpath('//div[@class="mw-parser-output"]/p/a/text()') return words ================================================ FILE: deeppavlov/vocabs/wiki_sqlite.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from logging import getLogger from typing import List, Any, Optional, Union from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.dataset_iterators.sqlite_iterator import SQLiteDataIterator logger = getLogger(__name__) @register('wiki_sqlite_vocab') class WikiSQLiteVocab(SQLiteDataIterator, Component): """Get content from SQLite database by document ids. 
Args: load_path: a path to local DB file join_docs: whether to join extracted docs with ' ' or not shuffle: whether to shuffle data or not Attributes: join_docs: whether to join extracted docs with ' ' or not """ def __init__(self, load_path: str, join_docs: bool = True, shuffle: bool = False, **kwargs) -> None: SQLiteDataIterator.__init__(self, load_path=load_path, shuffle=shuffle) self.join_docs = join_docs def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[Union[str, List[str]]]: """Get the contents of files, stacked by space or as they are. Args: doc_ids: a batch of lists of ids to get contents for Returns: a list of contents / list of lists of contents """ all_contents = [] if not doc_ids: logger.warning('No doc_ids are provided in WikiSqliteVocab, return all docs') doc_ids = [self.get_doc_ids()] for ids in doc_ids: contents = [self.get_doc_content(doc_id) for doc_id in ids] if self.join_docs: contents = ' '.join(contents) all_contents.append(contents) return all_contents ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = -WT SPHINXBUILD = sphinx-build SPHINXPROJ = DeepPavlov SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/_static/deeppavlov.css ================================================ .wy-side-nav-search { background-color: #0176bd; } .wy-nav-content { max-width: 1000px; } .wy-side-nav-search>div.version { color: #ffffff; } ================================================ FILE: docs/_static/my_blocks.css ================================================ button.copybtn svg { width: 1.3em; height: 1.3em; padding: 0.1em; } button.copybtn { top: 0.2em; width: 1.4em; height: 1.4em; } .rst-content .linenodiv pre, .rst-content div[class^=highlight] pre, .rst-content pre.literal-block { font-size: 13px; line-height: 1.4; } ================================================ FILE: docs/_templates/footer.html ================================================ {#{% extends '!footer.html' %}#}
{% if (theme_prev_next_buttons_location == 'bottom' or theme_prev_next_buttons_location == 'both') and (next or prev) %} {% endif %}
{%- block extrafooter %}

Problem? Ask a Question or try our Demo

medium twitter youtube medium

{% endblock %}

{%- if show_copyright %} {%- if hasdoc('copyright') %} {% set path = pathto('copyright') %} {% set copyright = copyright|e %} © {% trans %}Copyright{% endtrans %} {{ copyright }} {%- else %} {% set copyright = copyright|e %} © {% trans %}Copyright{% endtrans %} {{ copyright }} {%- endif %} {%- endif %} {%- if build_id and build_url %} {# Translators: Build is a noun, not a verb #} {% trans %}Build{% endtrans %} {{ build_id }}. {%- elif commit %} {% trans %}Revision{% endtrans %} {{ commit }}. {%- elif last_updated %} {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %} {%- endif %}

{%- if show_sphinx %} {% set sphinx_web = 'Sphinx' %} {% set readthedocs_web = 'Read the Docs' %} {% trans sphinx_web=sphinx_web, readthedocs_web=readthedocs_web %}Built with {{ sphinx_web }} using a{% endtrans %} {% trans %}theme{% endtrans %} {% trans %}provided by {{ readthedocs_web }}{% endtrans %}. {%- endif %}
================================================ FILE: docs/apiref/core/commands.rst ================================================ deeppavlov.core.commands ======================== Basic training and inference functions. .. automodule:: deeppavlov.core.commands.infer :members: .. automodule:: deeppavlov.core.commands.train :members: ================================================ FILE: docs/apiref/core/common.rst ================================================ deeppavlov.core.common ====================== Registration and classes initialization functionality, class method decorators. .. autoclass:: deeppavlov.core.common.chainer.Chainer :members: .. automethod:: __call__ .. autoclass:: deeppavlov.core.common.base.Element .. automethod:: __init__ .. autoclass:: deeppavlov.core.common.base.Model .. automethod:: __init__ .. automodule:: deeppavlov.core.common.metrics_registry :members: .. automodule:: deeppavlov.core.common.params :members: .. automodule:: deeppavlov.core.common.registry :members: ================================================ FILE: docs/apiref/core/data.rst ================================================ deeppavlov.core.data ==================== DatasetReader, Vocab, DataLearningIterator and DataFittingIterator classes. .. autoclass:: deeppavlov.core.data.dataset_reader.DatasetReader .. autoclass:: deeppavlov.core.data.data_fitting_iterator.DataFittingIterator .. autoclass:: deeppavlov.core.data.data_learning_iterator.DataLearningIterator .. autoclass:: deeppavlov.core.data.simple_vocab.SimpleVocabulary ================================================ FILE: docs/apiref/core/models.rst ================================================ deeppavlov.core.models ====================== Abstract model classes and interfaces. .. autoclass:: deeppavlov.core.models.component.Component .. autoclass:: deeppavlov.core.models.serializable.Serializable .. autoclass:: deeppavlov.core.models.estimator.Estimator .. 
autoclass:: deeppavlov.core.models.nn_model.NNModel .. autoclass:: deeppavlov.core.models.torch_model.TorchModel ================================================ FILE: docs/apiref/core/trainers.rst ================================================ deeppavlov.core.trainers ======================== Trainer classes. .. autoclass:: deeppavlov.core.trainers.FitTrainer :members: .. autoclass:: deeppavlov.core.trainers.NNTrainer :members: :inherited-members: ================================================ FILE: docs/apiref/core.rst ================================================ core ==== DeepPavlov Core .. automodule:: deeppavlov.core :members: .. toctree:: :glob: :caption: Core core/* ================================================ FILE: docs/apiref/dataset_iterators.rst ================================================ dataset_iterators ================= Concrete DatasetIterator classes. .. autoclass:: deeppavlov.dataset_iterators.basic_classification_iterator.BasicClassificationDatasetIterator :members: .. autoclass:: deeppavlov.dataset_iterators.siamese_iterator.SiameseIterator .. autoclass:: deeppavlov.dataset_iterators.sqlite_iterator.SQLiteDataIterator .. autoclass:: deeppavlov.dataset_iterators.squad_iterator.SquadIterator .. automodule:: deeppavlov.dataset_iterators.typos_iterator :members: .. automodule:: deeppavlov.dataset_iterators.multitask_iterator :members: ================================================ FILE: docs/apiref/dataset_readers.rst ================================================ dataset_readers =============== Concrete DatasetReader classes. .. autoclass:: deeppavlov.dataset_readers.basic_classification_reader.BasicClassificationDatasetReader :members: .. autoclass:: deeppavlov.dataset_readers.conll2003_reader.Conll2003DatasetReader .. autoclass:: deeppavlov.dataset_readers.faq_reader.FaqDatasetReader :members: .. autoclass:: deeppavlov.dataset_readers.line_reader.LineReader :members: .. 
autoclass:: deeppavlov.dataset_readers.paraphraser_reader.ParaphraserReader .. autoclass:: deeppavlov.dataset_readers.squad_dataset_reader.SquadDatasetReader :members: .. automodule:: deeppavlov.dataset_readers.typos_reader :members: .. automodule:: deeppavlov.dataset_readers.ubuntu_v2_reader :members: .. automodule:: deeppavlov.dataset_readers.multitask_reader :members: ================================================ FILE: docs/apiref/metrics.rst ================================================ metrics ======= Different Metric functions. .. automodule:: deeppavlov.metrics :members: .. autofunction:: deeppavlov.metrics.accuracy.sets_accuracy .. autofunction:: deeppavlov.metrics.fmeasure.round_f1 .. autofunction:: deeppavlov.metrics.fmeasure.round_f1_macro .. autofunction:: deeppavlov.metrics.fmeasure.round_f1_weighted .. autofunction:: deeppavlov.metrics.fmeasure.ner_f1 .. autofunction:: deeppavlov.metrics.fmeasure.ner_token_f1 .. autofunction:: deeppavlov.metrics.log_loss.sk_log_loss .. autofunction:: deeppavlov.metrics.roc_auc_score.roc_auc_score ================================================ FILE: docs/apiref/models/api_requester.rst ================================================ deeppavlov.models.api_requester =============================== .. automodule:: deeppavlov.models.api_requester :members: .. autoclass:: deeppavlov.models.api_requester.api_requester.ApiRequester .. automethod:: __call__ .. automethod:: get_async_response .. autoclass:: deeppavlov.models.api_requester.api_router.ApiRouter .. automethod:: __call__ ================================================ FILE: docs/apiref/models/classifiers.rst ================================================ deeppavlov.models.classifiers ============================= .. automodule:: deeppavlov.models.classifiers :members: .. autoclass:: deeppavlov.models.classifiers.torch_classification_model.TorchTextClassificationModel :members: .. automethod:: __call__ .. 
autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier :members: .. automethod:: __call__ .. autoclass:: deeppavlov.models.classifiers.proba2labels.Proba2Labels :members: .. automethod:: __call__ ================================================ FILE: docs/apiref/models/doc_retrieval.rst ================================================ deeppavlov.models.doc_retrieval =============================== Document retrieval classes. .. automodule:: deeppavlov.models.doc_retrieval .. autoclass:: deeppavlov.models.doc_retrieval.tfidf_ranker.TfidfRanker :members: .. automethod:: __call__ .. autoclass:: deeppavlov.models.doc_retrieval.logit_ranker.LogitRanker :members: .. automethod:: __call__ .. autoclass:: deeppavlov.models.doc_retrieval.pop_ranker.PopRanker :members: .. automethod:: __call__ ================================================ FILE: docs/apiref/models/embedders.rst ================================================ deeppavlov.models.embedders ============================ .. autoclass:: deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder .. automethod:: __call__ .. automethod:: __iter__ .. autoclass:: deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder .. automethod:: __call__ .. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder .. automethod:: __call__ ================================================ FILE: docs/apiref/models/entity_extraction.rst ================================================ deeppavlov.models.entity_extraction =================================== .. autoclass:: deeppavlov.models.entity_extraction.ner_chunker.NerChunker .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.entity_extraction.entity_linking.EntityLinker .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.EntityDetectionParser .. automethod:: __init__ .. automethod:: __call__ .. 
autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.QuestionSignChecker ================================================ FILE: docs/apiref/models/kbqa.rst ================================================ deeppavlov.models.kbqa ====================== .. automodule:: deeppavlov.models.kbqa .. autoclass:: deeppavlov.models.kbqa.type_define.AnswerTypesExtractor .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.query_generator.QueryGenerator .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.query_generator_base.QueryGeneratorBase .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.rel_ranking_infer.RelRankerInfer .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.template_matcher.TemplateMatcher .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.ru_adj_to_noun.RuAdjToNoun .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.tree_to_sparql.TreeToSparql .. automethod:: __init__ .. automethod:: __call__ .. autoclass:: deeppavlov.models.kbqa.wiki_parser.WikiParser .. automethod:: __init__ .. automethod:: __call__ ================================================ FILE: docs/apiref/models/preprocessors.rst ================================================ deeppavlov.models.preprocessors =============================== .. autoclass:: deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor .. automethod:: __call__ .. autoclass:: deeppavlov.models.preprocessors.mask.Mask .. autoclass:: deeppavlov.models.preprocessors.one_hotter.OneHotter .. autoclass:: deeppavlov.models.preprocessors.sanitizer.Sanitizer .. autofunction:: deeppavlov.models.preprocessors.str_lower.str_lower .. autoclass:: deeppavlov.models.preprocessors.str_token_reverser.StrTokenReverser .. automethod:: __call__ .. 
autoclass:: deeppavlov.models.preprocessors.str_utf8_encoder.StrUTF8Encoder .. automethod:: __call__ .. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.DocumentChunker .. automethod:: __call__ .. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.StringMultiplier .. automethod:: __call__ ================================================ FILE: docs/apiref/models/relation_extraction.rst ================================================ deeppavlov.models.relation_extraction ===================================== .. autoclass:: deeppavlov.models.relation_extraction.relation_extraction_bert.REBertModel .. automethod:: __init__ .. automethod:: __call__ .. automethod:: train_on_batch ================================================ FILE: docs/apiref/models/sklearn.rst ================================================ deeppavlov.models.sklearn ============================= .. automodule:: deeppavlov.models.sklearn :members: .. autoclass:: deeppavlov.models.sklearn.sklearn_component.SklearnComponent .. automethod:: __call__ .. automethod:: fit .. automethod:: init_from_scratch .. automethod:: load .. automethod:: save .. automethod:: compose_input_data .. automethod:: get_class_attributes .. automethod:: get_function_params ================================================ FILE: docs/apiref/models/spelling_correction.rst ================================================ deeppavlov.models.spelling_correction ===================================== .. autoclass:: deeppavlov.models.spelling_correction.brillmoore.ErrorModel .. automethod:: __call__ .. automethod:: fit .. automethod:: save .. automethod:: load .. autoclass:: deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent .. automethod:: __call__ .. autoclass:: deeppavlov.models.spelling_correction.electors.top1_elector.TopOneElector .. automethod:: __call__ .. autoclass:: deeppavlov.models.spelling_correction.electors.kenlm_elector.KenlmElector .. 
automethod:: __call__ ================================================ FILE: docs/apiref/models/tokenizers.rst ================================================ deeppavlov.models.tokenizers ============================ .. autoclass:: deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer .. automethod:: __call__ .. autoclass:: deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer .. automethod:: __call__ .. autoclass:: deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer .. autoclass:: deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer .. automethod:: __call__ ================================================ FILE: docs/apiref/models/torch_bert.rst ================================================ deeppavlov.models.torch_bert ============================ .. automodule:: deeppavlov.models.torch_bert :members: .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor .. automethod:: __call__ .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor .. automethod:: __call__ .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertRankerPreprocessor .. automethod:: __call__ .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel .. automethod:: __call__ .. automethod:: train_on_batch .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger .. automethod:: __call__ .. automethod:: train_on_batch .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad .. automethod:: __call__ .. automethod:: train_on_batch .. autoclass:: deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel .. automethod:: __call__ .. 
automethod:: train_on_batch ================================================ FILE: docs/apiref/models/vectorizers.rst ================================================ deeppavlov.models.vectorizers ============================= .. autoclass:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer :members: .. automethod:: __call__ ================================================ FILE: docs/apiref/models.rst ================================================ models ====== Concrete Model classes. .. automodule:: deeppavlov.models :members: .. toctree:: :glob: :caption: Models models/* ================================================ FILE: docs/apiref/vocabs.rst ================================================ vocabs ====== Concrete Vocab classes. .. automodule:: deeppavlov.vocabs :members: .. autoclass:: deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab :members: .. automethod:: __call__ .. automodule:: deeppavlov.vocabs.typos :members: ================================================ FILE: docs/conf.py ================================================ # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
import sphinx_rtd_theme import deeppavlov # -- Project information ----------------------------------------------------- project = 'DeepPavlov' copyright = '2018, ' + deeppavlov.__author__ author = deeppavlov.__author__ # The short X.Y version version = deeppavlov.__version__ # The full version, including alpha/beta/rc tags release = version # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx.ext.mathjax', 'sphinx.ext.extlinks', 'nbsphinx', 'IPython.sphinxext.ipython_console_highlighting', 'sphinx_copybutton' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path . exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints' ] # The name of the Pygments (syntax highlighting) style to use. 
pygments_style = 'sphinx' # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = { 'collapse_navigation': False, 'display_version': True, 'logo_only': True, } html_logo = '_static/deeppavlov.png' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] html_css_files = ['my_blocks.css', 'deeppavlov.css'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} nbsphinx_prolog = """ .. raw:: html """ nbsphinx_execute = 'never' # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = f'{project}-Docs' # -- Options for LaTeX output ------------------------------------------------ latex_engine = 'xelatex' latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. 
# # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', 'extraclassoptions': 'openany,oneside', 'fncychap': r'\usepackage[Sonny]{fncychap}' } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, f'{project}.tex', f'{project} Documentation', author, 'manual'), ] # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, project.lower(), f'{project} Documentation', [author], 1) ] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, project, f'{project} Documentation', author, project, deeppavlov.__description__, str(deeppavlov.__keywords__)), ] # -- Extension configuration ------------------------------------------------- autodoc_mock_imports = ['bs4', 'faiss', 'fasttext', 'hdt', 'kenlm', 'lxml', 'navec', 'nltk', 'opt_einsum', 'rapidfuzz', 'razdel', 'sacremoses', 'slovnet', 'sortedcontainers', 'spacy', 'torch', 'torchcrf', 'transformers', 'udapi', 'ufal', 'whapi'] extlinks = { 'config': (f'https://github.com/deeppavlov/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None), 'dp_file': (f'https://github.com/deeppavlov/DeepPavlov/blob/{release}/%s', None) } # -- Options for intersphinx extension --------------------------------------- # Configuration for intersphinx intersphinx_mapping = { 'python': ('https://docs.python.org/3.6', None), 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None) } # -- Options for todo extension ---------------------------------------------- # If true, `todo` and `todoList` produce output, else they 
produce nothing. todo_include_todos = False ================================================ FILE: docs/devguides/contribution_guide.rst ================================================ Contribution Guide ===================== We are happy that you share your research with us and want to improve our code! Please follow the steps below to contribute to our project. If you have any questions or suggestions about the contributing process, please share them with us on the `forum `_. Please note that we do not answer general questions in the github issues interface. If you are a regular contributor in the DeepPavlov open source project, you can receive an invitation to one of our events or an opportunity to become a part of our team. How to contribute: #. Don't start the coding first. You should do a quick search over `existing issues `_ for the project to see if your suggestion was already discussed or even resolved. If nothing relevant was found, please create a new one and state what exactly you would like to implement or fix. You may proceed with coding once someone on our team accepts your offer. #. `Fork `_ the `DeepPavlov repository `_ #. Checkout the ``dev`` branch from `the upstream `_ as a base for your code: .. code:: bash git clone https://github.com//.git cd git remote add upstream https://github.com/deeppavlov/DeepPavlov.git git fetch upstream git checkout -b dev --track upstream/dev afterwards to sync the ``dev`` branch with external updates you can run: .. code:: bash git checkout dev git fetch upstream git pull #. **Create a new branch and switch** to it. Give it a meaningful name: .. code:: bash git checkout -b what_my_code_does_branch #. **Install DeepPavlov** in editable mode: .. code:: bash pip install -e . or .. code:: bash pip install -e .[docs,tests] In editable mode changes of the files in the repository directory will automatically reflect in your python environment. 
The last command with ``[docs,tests]`` will install additional requirements to build documentation and run tests. #. **Write readable code** and keep it `PEP8 `_-ed, **add docstrings** and keep them consistent with the `Google Style `_. Pay attention that we support typing annotations in every function declaration. Accompany your code with **clear comments** to let other people understand the flow of your mind. If you create new models, refer to the :doc:`Register your model ` section to add it to the DeepPavlov registry of models. #. We ask you to **add some tests**. This will help us maintain the framework, and this will help users to understand the feature you introduce. Examples of implemented tests are available in `tests/ `_ directory. #. Please, **update the documentation**, if you committed significant changes to our code. Make sure that documentation could be built after your changes and check how it looks using: .. code:: bash cd docs make html The built documentation will be added to ``docs/_build`` directory. Open it with your browser. #. **Commit your changes and push** your feature branch to your GitHub fork: .. code:: bash git add my_files git commit -m "fix: resolve issue #271" git push origin what_my_code_does_branch Follow the `semantic commit notation `_ for the name of the commit. #. Create a new `pull request `_ to get your feature branch merged into dev for others to use. Don't forget to `reference `_ the GitHub issue associated with your task in the description. #. **Relax and wait** : ) Some time after that your commit will be assigned to somebody from our team to check your code. After a code review and a successful completion of all tests, your pull request will be approved and pushed into the framework. If you still have any questions, either on the contribution process or about the framework itself, please share them with us on our `forum `_. Join our official `Telegram channel `_ to get notified about our updates & news. 
================================================ FILE: docs/devguides/registry.rst ================================================ Register your model =================== In order to extend the library, you need to register your classes and functions; it is done in two steps. 1. Decorate your :class:`~deeppavlov.core.models.component.Component` (or :class:`~deeppavlov.core.data.dataset_reader.DatasetReader`, or :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`, or :class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator`) using :func:`~deeppavlov.core.common.registry.register` and/or metrics function using :func:`~deeppavlov.core.common.metrics_registry.register_metric`. 2. Rebuild the registry running from DeepPavlov root directory: :: python -m utils.prepare.registry This script imports all the modules in deeppavlov package, builds the registry from them and writes it to a file. However, it is possible to use some classes and functions inside configuration files without registering them explicitly. There are two options available here: - instead of ``{"class_name": "registered_component_name"}`` in config file use key-value pair similar to ``{"class_name": "my_package.my_module:MyClass"}`` - if your classes/functions are properly decorated but not included in the registry, use ``"metadata"`` section of your config file specifying imports as ``"metadata": {"imports": ["my_local_package.my_module", "global_package.module"]}``; then the second step described above will be unnecessary (local packages are imported from the current working directory). ================================================ FILE: docs/features/hypersearch.rst ================================================ Hyperparameters optimization ============================ You can search for best hyperparameters of your model in DeepPavlov by means of cross-validation. 
Cross-validation ~~~~~~~~~~~~~~~~ You can run cross-validation in DeepPavlov to select the best parameters of your model. For this purpose you have to run the special command 'paramsearch'. For example: .. code:: bash python -m deeppavlov.paramsearch path_to_json_config.json --folds 5 Parameters ---------- The cross-validation command has several parameters: - ``config_path``: Specify config path, where your model is located. - ``--folds``: This parameter shows how many folds you need in cross validation. Do you want to use leave one out cross validation instead of folds? Just specify this: ``--folds loo``. If you do not want to cross-validate, just omit this parameter. - ``--search_type``: This parameter is optional - default value is "grid" (grid search). .. note:: Folds will be created automatically from union of train and validation datasets. Special parameters in config ---------------------------- The config file of the model should contain parameter ranges for the search. For example, if you try to optimize the regularization coefficient in the model, you should add an additional parameter in the config with the suffix '_range'. Let's see an example for a logistic regression model: .. code:: python { "class_name": "faq_logreg_model", "in": "q_vect", "fit_on": ["q_vect", "y"], "c": {"search_choice": [1, 10, 100, 1000]}, "out": ["answer", "score"] } In this example the parameter "c" is described as search_choice, with the values for grid search: .. code:: python {"search_choice": [value_0, ..., value_n]} Results ------- As a result you'll have a new json config with the best model parameters. It'll be stored in the same directory as the config file and will have the suffix '_cvbest.json'. Also you'll see final log messages about the best model: ..
code:: bash INFO in '__main__'['paramsearch'] at line 169: Best model params: {'C': 10000, 'penalty': 'l1', 'accuracy': 0.81466} INFO in '__main__'['paramsearch'] at line 184: Best model saved in json-file: path_to_model_config_cvbest.json ================================================ FILE: docs/features/models/KBQA.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Knowledge Base Question Answering (KBQA)\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/KBQA.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", " 4.3. [Using entity linking and Wiki parser as standalone services for KBQA](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA)\n", " \n", "5. [Customize the model](#5.-Customize-the-model)\n", " \n", " 5.1. [Description of config parameters](#5.1-Description-of-config-parameters)\n", " \n", " 5.2. [Train KBQA components](#5.2-Train-KBQA-components)\n", "\n", "# 1. Introduction to the task\n", "\n", "The knowledge base:\n", "\n", "* is a comprehensive repository of information about given domain or a number of domains;\n", "\n", "* reflects the ways we model knowledge about given subject or subjects, in terms of concepts, entities, properties, and relationships;\n", "\n", "* enables us to use this structured knowledge where appropriate, e.g. 
answering factoid questions.\n", "\n", "Currently, we support Wikidata as a Knowledge Base (Knowledge Graph). In the future, we will expand support for custom knowledge bases.\n", "\n", "The question answerer:\n", "\n", "* validates questions against the preconfigured list of question templates, disambiguates entities using entity linking and answers questions asked in natural language;\n", "\n", "* can be used with Wikidata (English, Russian) and (in the future versions) with custom knowledge graphs.\n", "\n", "Here are some of the most popular types of questions supported by the model:\n", "\n", "* **Complex questions with numerical values:** “What position did Angela Merkel hold on November 10, 1994?”\n", "* **Complex question where the answer is a number or a date:** “When did Jean-Paul Sartre move to Le Havre?”\n", "* **Questions with counting of answer entities:** “How many sponsors are for Juventus F.C.?”\n", "* **Questions with ordering of answer entities by ascending or descending of some parameter:** “Which country has highest individual tax rate?”\n", "* **Simple questions:** “What is crew member Yuri Gagarin’s Vostok?”\n", "\n", "The following models are used to find the answer (the links are for the English language model):\n", "\n", "* [BERT model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/classifiers/query_pr.json) for prediction of query template type. Model performs classification of questions into 8 classes corresponding to 8 query template types;\n", "* [BERT entity detection model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_detection_en.json) for extraction of entity substrings from the questions;\n", "* Substring extracted by the entity detection model is used for [entity linking](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_linking_en.json).
Entity linking performs matching the substring with one of the Wikidata entities. Matching is based on the Levenshtein distance between the substring and an entity title. The result of the matching procedure is a set of candidate entities. There is also the search for the entity among this set with one of the top-k relations predicted by classification model;\n", "* [BERT model](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/ranking/rel_ranking_bert_en.json) for ranking candidate relation paths;\n", "* Query generator model is used to fill query template with candidate entities and relations to find valid combinations of entities and relations for query template. Query generation model uses Wikidata HDT file.\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation](https://deeppavlov-test.readthedocs.io/en/latest/notebooks/Get%20Started%20with%20DeepPavlov.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install --q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov install kbqa_cq_en\n", "! python -m deeppavlov install kbqa_cq_ru" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`kbqa_cq_en` and `kbqa_cq_ru` here are the names of the model's *config_files*. [What is a Config File?](https://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n", "The full list of KBQA models with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3.
Models list\n", "\n", "The table presents a list of all of the KBQA-models available in DeepPavlov Library.\n", "\n", "| Config name | Database | Language | RAM | GPU |\n", "| :--- | --- | --- | --- | --- |\n", "| [kbqa_cq_en](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/kbqa_cq_en.json) | Wikidata | En | 3.1 Gb | 3.4 Gb |\n", "| [kbqa_cq_ru](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/kbqa_cq_ru.json) | Wikidata | Ru | 4.3 Gb | 8.0 Gb |\n", "\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import configs, build_model\n", "\n", "kbqa = build_model('kbqa_cq_en', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Input**: List[sentences]\n", "\n", "**Output**: List[answers]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['Robert Zemeckis'],\n", " [['Q187364']],\n", " [['SELECT ?answer WHERE { wd:Q134773 wdt:P57 ?answer. }']]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kbqa(['Who directed Forrest Gump?'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['United States senator'],\n", " [['Q4416090']],\n", " [['SELECT ?answer WHERE { wd:Q11613 p:P39 ?ent . ?ent ps:P39 ?answer . ?ent ?p ?x filter(contains(?x, n)). }']]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kbqa(['What position was held by Harry S.
Truman on 1/3/1935?'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['FC Barcelona B, Argentina national under-20 football team'],\n", " [['Q10467', 'Q1187790']],\n", " [['SELECT ?answer WHERE { wd:Q615 p:P54 ?ent . ?ent ps:P54 ?answer . ?ent ?p ?x filter(contains(?x, n)). }']]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kbqa(['What teams did Lionel Messi play for in 2004?'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "KBQA model for complex question answering in Russian can be used from Python using the following code:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import configs, build_model\n", "\n", "kbqa = build_model('kbqa_cq_ru', download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['26 мая 1799, 06 июня 1799'],\n", " [['+1799-05-26^^T', '+1799-06-06^^T']],\n", " [['SELECT ?answer WHERE { wd:Q7200 wdt:P569 ?answer. }']]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kbqa(['Когда родился Пушкин?'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact kbqa_cq_en [-d]\n", "! python -m deeppavlov interact kbqa_cq_ru [-d]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "Or make predictions for samples from *stdin*."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov predict kbqa_cq_en -f \n", "! python -m deeppavlov predict kbqa_cq_ru -f " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.3 Using entity linking and Wiki parser as standalone tools for KBQA\n", "\n", "Default configuration for KBQA was designed to use all of the supporting models together as a part of the KBQA pipeline. However, there might be a case when you want to work with some of these models in addition to KBQA.\n", "\n", "For example, you might want to use entity linking model as an annotator in your [multiskill AI Assistant](https://github.com/deeppavlov/dream). Or, you might want to use Wiki Parser component to directly run SPARQL queries against your copy of Wikidata. To support these usages, you can also deploy supporting models as standalone components.\n", "\n", "Don’t forget to replace the `url` parameter values in the examples below with correct URLs.\n", "\n", "Config [entity_linking_en](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/entity_extraction/entity_linking_en.json) can be used with the following commands:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov install entity_linking_en -d" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!
python -m deeppavlov riseapi entity_linking_en [-d] [-p ]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "\n", "payload = {\"entity_substr\": [[\"Forrest Gump\"]], \"tags\": [[\"PERSON\"]], \"probas\": [[0.9]],\n", " \"sentences\": [[\"Who directed Forrest Gump?\"]]}\n", "response = requests.post(entity_linking_url, json=payload).json()\n", "print(response)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Config [wiki_parser](https://github.com/deeppavlov/DeepPavlov/blob/1.0.0rc1/deeppavlov/configs/kbqa/wiki_parser.json) can be used with the following command:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov riseapi wiki_parser [-d] [-p ]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Arguments of the annotator are `parser_info` (what we want to extract from Wikidata) and `query`.\n", "\n", "**Examples of queries:**\n", "\n", "To extract triplets for entities, the `query` argument should be the list of entities ids. `parser_info` should be the list of “find_triplets” strings." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "requests.post(wiki_parser_url, json = {\"parser_info\": [\"find_triplets\"], \"query\": [\"Q159\"]}).json()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To extract all relations of the entities, the `query` argument should be the list of entities ids, and `parser_info` should be the list of “find_rels” strings." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "requests.post(wiki_parser_url, json = {\"parser_info\": [\"find_rels\"], \"query\": [\"Q159\"]}).json()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To find labels for entities ids, the `query` argument should be the list of entities ids, and `parser_info` should be the list of “find_label” strings." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "requests.post(wiki_parser_url, json = {\"parser_info\": [\"find_label\"], \"query\": [[\"Q159\", \"\"]]}).json()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this example, the second element of the list (an empty string) can be replaced with a sentence.\n", "\n", "To execute SPARQL queries, the `query` argument should be the list of tuples with the info about SPARQL queries, and `parser_info` should be the list of “query_execute” strings.\n", "\n", "Let us consider an example of the question “What is the deepest lake in Russia?” with the corresponding SPARQL query `SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5`\n", "\n", "Arguments:\n", "\n", "* *what_return*: ```[“?obj”]```,\n", "* *query_seq*: ```[[“?ent”, “P17”, “Q159”], [“?ent”, “P31”, “Q23397”], [“?ent”, “P4511”, “?obj”]]```,\n", "* *filter_info*: ```[]```,\n", "* *order_info*: ```order_info(variable=’?obj’, sorting_order=’asc’)```." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "requests.post(\"wiki_parser_url\", json = {\"parser_info\": [\"query_execute\"], \"query\": [[[\"?obj\"], [[\"Q159\", \"P36\", \"?obj\"]], [], [], True]]}).json()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To use entity linking model in KBQA, you should add following API Requester component to the `pipe` in the *config_file*:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "{\n", " \"class_name\": \"api_requester\",\n", " \"id\": \"linker_entities\",\n", " \"url\": \"entity_linking_url\",\n", " \"out\": [\"entity_substr\", \"entity_ids\", \"entity_conf\", \"entity_pages\", \"entity_labels\"],\n", " \"param_names\": [\"entity_substr\", \"tags\", \"probas\", \"sentences\"]\n", " }\n", " ```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To use Wiki parser service in KBQA, you should add following API Requester component to the `pipe` in the *config_file*:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "{\n", " \"class_name\": \"api_requester\",\n", " \"id\": \"wiki_p\",\n", " \"url\": \"wiki_parser_url\",\n", " \"out\": [\"wiki_parser_output\"],\n", " \"param_names\": [\"parser_info\", \"query\"]\n", " }\n", " ```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. 
Customize the model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1 Description of config parameters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Parameters of ``entity_linker`` component:\n", "\n", "- ``num_entities_to_return: int`` - the number of entity IDs, returned for each entity mention in text;\n", "- ``lemmatize: bool`` - whether to lemmatize entity mentions before searching candidate entity IDs in the inverted index;\n", "- ``use_descriptions: bool`` - whether to perform ranking of candidate entities by similarity of their descriptions to the context;\n", "- ``use_connections: bool`` - whether to use connections between candidate entities for different mentions for ranking;\n", "- ``use_tags: bool`` - whether to search only those entity IDs in the inverted index, which have the same tag as the entity mention;\n", "- ``prefixes: Dict[str, Any]`` - prefixes in the knowledge base for entities and relations;\n", "- ``alias_coef: float`` - the coefficient which is multiplied by the substring matching score of the entity if the entity mention in the text matches with the entity title."
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Parameters of ``rel_ranking_infer`` component:\n", "\n", "- ``return_elements: List[str]`` - what elements should be returned by the component in the output tuple (answers are returned by default, optional elements are `\"confidences\"`, `\"answer_ids\"`, `\"entities_and_rels\"` (entities and relations from SPARQL queries), `\"queries\"` (SPARQL queries), `\"triplets\"` (triplets from SPARQL queries));\n", "- ``batch_size: int`` - candidate relations list will be split into N batches of the size `batch_size` for further ranking;\n", "- ``softmax: bool`` - whether to apply softmax function to the confidences list of candidate relations for a question;\n", "- ``use_api_requester: bool`` - true if wiki_parser [is called through api_requester](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA);\n", "- ``rank: bool`` - whether to perform ranking of candidate relation paths;\n", "- ``nll_rel_ranking: bool`` - in DeepPavlov we have two types of relation ranking models: 1) the model which takes a question and a relation and is trained to classify question-relation by two classes (relevant / irrelevant relation) 2) the model which takes a question and a list of relations (one relevant relation and others - irrelevant) and is trained to define the relevant relation in the list with NLL loss; the output format in two cases is different;\n", "- ``nll_path_ranking: bool`` - the same case as `nll_rel_ranking` for ranking of relation paths;\n", "- ``top_possible_answers: int`` - SPARQL query execution can result in several valid answers, so `top_possible_answers` is the number of these answers which we leave in the output;\n", "- ``top_n: int`` - number of candidate SPARQL queries (and corresponding answers) in the output for a question;\n", "- ``pos_class_num: int`` - if we use the model which classifies question-relation into two classes (relevant / irrelevant), we should set the number of 
positive class (0 or 1);\n", "- ``rel_thres: float`` - we leave only relations with the confidence upper threshold;\n", "- ``type_rels: List[str]`` - relations which connect entity and its type in the knowledge graph." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Parameters of ``query_generator`` component:\n", "\n", "- ``entities_to_leave: int`` - how many entity IDs to use to make a combination of entities and relations for filling in the slots of the SPARQL query template;\n", "- ``rels_to_leave: int`` - how many relations to use to make a combination of entities and relations for filling in the slots of the SPARQL query template;\n", "- ``max_comb_num: int`` - maximal number of combinations of entities and relations for filling in the slots of SPARQL query template;\n", "- ``map_query_str_to_kb: List[Tuple[str, str]]`` - a list of elements like [\"wd:\", \"http://we/\"], where the first element is a prefix of an entity (\"wd:\") or relation in the SPARQL query template, the second - the corresponding prefix in the knowledge base (\"http://we/\");\n", "- ``kb_prefixes: Dict[str, str]`` - a dictionary {\"entity\": \"wd:E\", \"rel\": \"wdt:R\", ...} - prefixes of entities, relations and types in the knowledge base;\n", "- ``gold_query_info: Dict[str, str]`` - names of unknown variables in SPARQL queries in the dataset (LC-QuAD2.0 or RuBQ2.0);\n", "- ``syntax_structure_known: bool`` - whether the syntax structure of the question is known (is True in kbqa_cq_ru.json, because this config performs syntax parsing with slovnet_syntax_parser)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.2 Train KBQA components" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train Query Prediction Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset for training query prediction model consists of three *.csv* files: *train.csv*, *valid.csv* and *test.csv*.
Each line in this file contains question and corresponding query template type, for example:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "\"What is the longest river in the UK?\", 6\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train Entity Detection Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset is a pickle file. The dataset must be split into three parts: train, test, and validation. Each part is a list of tuples of question tokens and tags for each token. An example of training sample:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "(['What', 'is', 'the', 'complete', 'list', 'of', 'records', 'released', 'by', 'Jerry', 'Lee', 'Lewis', '?'],\n", " ['O', 'O', 'O', 'O', 'B-T', 'I-T', 'I-T', 'O', 'O', 'B-E', 'I-E', 'I-E', 'O'])\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`B-T` corresponds to tokens of entity types substrings beginning, `I-T` - to tokens of inner part of entity types substrings, `B-E` and `I-E` - for entities, `O` - for other tokens." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train Path Ranking Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset (in pickle format) is a dict of three keys: \"train\", \"valid\" and \"test\". The value by each key is the list of samples, an example of a sample:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "(['What is the Main St. Exile label, which Nik Powell co-founded?', ['record label', 'founded by']], '1')\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The sample contains the question, relations in the question and label (1 - if the relations correspond to the question, 0 - otherwise)." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding Templates For New SPARQL Queries" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Templates can be added to sparql_queries.json file, which is a dictionary, where keys are template types and values are templates with additional information. An example of a template:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "{\n", " \"query_template\": \"SELECT ?obj WHERE { wd:E1 p:R1 ?s . ?s ps:R1 ?obj . ?s ?p ?x filter(contains(?x, N)) }\",\n", " \"rank_rels\": [\"wiki\", \"do_not_rank\", \"do_not_rank\"],\n", " \"rel_types\": [\"no_type\", \"statement\", \"qualifier\"],\n", " \"query_sequence\": [1, 2, 3],\n", " \"return_if_found\": true,\n", " \"template_num\": \"0\",\n", " \"alternative_templates\": []\n", " }\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* `query_template` is the template of the SPARQL query;\n", "* `rank_rels` is a list which defines whether to rank relations, in this example **p:R1** relations we extract from Wikidata for **wd:E1** entities and rank with RelRanker, **ps:R1** and **?p** relations we do not extract or rank;\n", "* `rel_types` - direct, statement or qualifier relations;\n", "* `query_sequence` - the sequence in which the triplets will be extracted from the Wikidata hdt file;\n", "* `return_if_found` - the parameter which iterates over all possible combinations of entities, relations and types, if true - return the first valid combination found, if false - consider all combinations;\n", "* `template_num` - the type of a template;\n", "* `alternative_templates` - type numbers of alternative templates to use if the answer was not found using the current template." 
] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: docs/features/models/NER.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Named Entity Recognition (NER)\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/NER.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", " \n", "5. [Evaluate](#5.-Evaluate)\n", " \n", " 5.1. [Evaluate from Python](#5.1-Evaluate-from-Python)\n", " \n", " 5.2. [Evaluate from CLI](#5.2-Evaluate-from-CLI)\n", "\n", "6. [Customize the model](#6.-Customize-the-model)\n", " \n", " 6.1. [Train your model from Python](#6.1-Train-your-model-from-Python)\n", " \n", " 6.2. [Train your model from CLI](#6.2-Train-your-model-from-CLI)\n", "\n", "7. [NER-tags list](#7.-NER-tags-list)\n", "\n", "# 1. Introduction to the task\n", "\n", "**Named Entity Recognition (NER)** is a task of assigning a tag (from a predefined set of tags) to each token in a given sequence. In other words, NER-task consists of identifying named entities in the text and classifying them into types (e.g. person name, organization, location etc). \n", "\n", "**BIO encoding schema** is usually used in NER task. It uses 3 tags: B for the beginning of the entity, I for the inside of the entity, and O for non-entity tokens. 
The second part of the tag stands for the entity type.\n", "\n", "Here is an example of a tagged sequence:\n", "\n", "| Elon | Musk | founded | Tesla| in | 2003 | . |\n", "| --- | --- | --- | --- | --- | --- | --- |\n", "| B-PER | I-PER | O | B-ORG | O | B-DATE | O |\n", "\n", "Here we can see three extracted named entities: *Elon Musk* (which is a person's name), *Tesla* (which is a name of an organization) and *2003* (which is a date). To see more examples try out our [Demo](https://demo.deeppavlov.ai/#/en/ner).\n", "\n", "The list of possible types of NER entities may vary depending on your dataset domain. The list of tags used in DeepPavlov's models can be found in the [table](#7.-NER-tags-list).\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install ner_ontonotes_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`ner_ontonotes_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes its hyperparameters. 
To use another model, change the name of the *config_file* here and further.\n", "The full list of NER models with their config names can be found in the [table](#3.-Models-list).\n", "\n", "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n", "\n", "# 3. Models list\n", "\n", "The table presents a list of all of the NER-models available in the DeepPavlov Library.\n", "\n", "| Config name | Dataset | Language | Model Size | F1 score (ner_f1) | F1 score (ner_f1_token) |\n", "| :--- | --- | --- | --- | --- | ---: |\n", "| ner_case_agnostic_mdistilbert| [CoNLL-2003](https://paperswithcode.com/dataset/conll-2003) | En | 1.6 GB | 89.9 | 91.6 |\n", "| ner_conll2003_bert | [CoNLL-2003](https://paperswithcode.com/dataset/conll-2003) | En | 1.3 GB | **91.9** | **93.4** |\n", "| ner_ontonotes_bert | [OntoNotes](https://paperswithcode.com/dataset/ontonotes-5-0) | En | 1.3 GB | 89.2 | 92.7 |\n", "| ner_collection3_bert | [Collection3](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 2.1 GB | **98.5** | **98.9** |\n", "| ner_rus_bert | [Collection3](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 2.1 GB | 97.6 | 98.5 |\n", "| ner_rus_convers_distilrubert_2L | [Collection-rus](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 1.3 GB | 92.9 | 96.6 |\n", "| ner_rus_convers_distilrubert_6L | [Collection-rus](https://www.researchgate.net/publication/313808701_Combining_Knowledge_and_CRF-Based_Approach_to_Named_Entity_Recognition_in_Russian) | Ru | 1.6 GB | 96.7 | 98.5 |\n", "| ner_rus_bert_probas | [Wiki-NER-rus](https://aclanthology.org/I17-1042/) | Ru | 2.1 GB | 72.6 | 79.5 |\n", "| 
ner_ontonotes_bert_mult | [OntoNotes](https://paperswithcode.com/dataset/ontonotes-5-0) | Multi | 2.1 GB | 88.9 | 92.0 |\n", "\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "ner_model = build_model('ner_ontonotes_bert', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `download` argument defines whether it is necessary to download the files defined in the `download` section of the config: usually it provides the links to the train and test data, to the pretrained models, or to the embeddings.\n", "\n", "Setting the `install` argument to `True` is equivalent to executing the command line `install` command. If set to `True`, it will first install all the required packages.\n", "\n", "**Input**: List[sentences]\n", "\n", "**Output**: List[tokenized sentences, corresponding NER-tags]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[['Bob', 'Ross', 'lived', 'in', 'Florida'],\n", " ['Elon', 'Musk', 'founded', 'Tesla']],\n", " [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'],\n", " ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_model(['Bob Ross lived in Florida', 'Elon Musk founded Tesla'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!
python -m deeppavlov interact ner_ontonotes_bert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "Or make predictions for samples from *stdin*." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov predict ner_ontonotes_bert -f " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Evaluate\n", "\n", "There are two metrics that are used to evaluate a NER model in DeepPavlov:\n", "\n", "`ner_f1` is measured on the entity-level (actual text spans should match exactly)\n", "\n", "`ner_token_f1` is measured on a token level (correct tokens from not fully extracted entities will still be counted as TPs (true positives))\n", "\n", "## 5.1 Evaluate from Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import evaluate_model\n", "\n", "model = evaluate_model('ner_ontonotes_bert', download=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.2 Evaluate from CLI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov evaluate ner_ontonotes_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Customize the model\n", "\n", "## 6.1 Train your model from Python\n", "\n", "### Provide your data path\n", "\n", "To train the model on your data, you need to change the path to the training data in the *config_file*.\n", " \n", "Parse the *config_file* and change the path to your data from Python." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "~/.deeppavlov/downloads/ontonotes/\n" ] } ], "source": [ "from deeppavlov import train_model\n", "from deeppavlov.core.commands.utils import parse_config\n", "\n", "model_config = parse_config('ner_ontonotes_bert')\n", "\n", "# dataset that the model was trained on\n", "print(model_config['dataset_reader']['data_path'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Provide a *data_path* to your own dataset. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download and unzip a new example dataset\n", "!wget http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz\n", "!tar -xzvf \"conll2003_v2.tar.gz\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# provide a path to the train file\n", "model_config['dataset_reader']['data_path'] = 'contents/train.txt'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Train dataset format\n", "\n", "To train the model, you need to have a txt-file with a dataset in the following format:\n", "\n", "```\n", "EU B-ORG\n", "rejects O\n", "the O\n", "call O\n", "of O\n", "Germany B-LOC\n", "to O\n", "boycott O\n", "lamb O\n", "from O\n", "Great B-LOC\n", "Britain I-LOC\n", ". O\n", "\n", "China B-LOC\n", "says O\n", "time O\n", "right O\n", "for O\n", "Taiwan B-LOC\n", "talks O\n", ". O\n", "```\n", "\n", "The source text is **tokenized** and **tagged**. For each token, there is a tag with **BIO** markup. Tags are separated from tokens with **whitespaces**. 
Sentences are separated with **empty lines**.\n", "\n", "\n", "### Train the model using new config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ner_model = train_model(model_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use your model for prediction." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[['Bob', 'Ross', 'lived', 'in', 'Florida'],\n", " ['Elon', 'Musk', 'founded', 'Tesla']],\n", " [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'],\n", " ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_model(['Bob Ross lived in Florida', 'Elon Musk founded Tesla'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.2 Train your model from CLI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov train ner_ontonotes_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. NER-tags list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The table presents a list of all of the NER entity tags used in DeepPavlov's NER-models." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "| | |\n", "| ------------ | ------------------------------------------------------ |\n", "| **PERSON** | People including fictional |\n", "| **NORP** | Nationalities or religious or political groups |\n", "| **FACILITY** | Buildings, airports, highways, bridges, etc. |\n", "| **ORGANIZATION** | Companies, agencies, institutions, etc. |\n", "| **GPE** | Countries, cities, states |\n", "| **LOCATION** | Non-GPE locations, mountain ranges, bodies of water |\n", "| **PRODUCT** | Vehicles, weapons, foods, etc. (Not services) |\n", "| **EVENT** | Named hurricanes, battles, wars, sports events, etc. |\n", "| **WORK OF ART** | Titles of books, songs, etc. 
|\n", "| **LAW** | Named documents made into laws |\n", "| **LANGUAGE** | Any named language |\n", "| **DATE** | Absolute or relative dates or periods |\n", "| **TIME** | Times smaller than a day |\n", "| **PERCENT** | Percentage (including “%”) |\n", "| **MONEY** | Monetary values, including unit |\n", "| **QUANTITY** | Measurements such as weight or distance |\n", "| **ORDINAL** | “first”, “second”, etc. |\n", "| **CARDINAL** | Numerals that do not fall under another type |" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/ODQA.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Open Domain Question Answering (ODQA)\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/ODQA.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1 [Predict using Python](#4.1-Predict-using-Python)\n", "\n", " 4.2 [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", "\n", " 5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\n", " \n", " 5.2 [Building the index and training the reader model](#5.2-Building-the-index-and-training-the-reader-model)\n", "\n", "# 1. Introduction to the task\n", "\n", "**Open Domain Question Answering (ODQA)** is a task to find an exact answer\n", "to any question in **Wikipedia** articles. 
Thus, given only a question, the system outputs\n", "the best answer it can find.\n", "The default ODQA implementation takes a batch of queries as input and returns the best answer.\n", "\n", "English ODQA version consists of the following components:\n", "\n", "- TF-IDF ranker, which defines top-N most relevant paragraphs in TF-IDF index;\n", "- Binary Passage Retrieval (BPR) ranker, which defines top-K most relevant paragraphs in binary index;\n", "- a database of paragraphs (by default, from Wikipedia) which finds N + K most relevant paragraph text by IDs, defined by TF-IDF and BPR ranker;\n", "- Reading Comprehension component, which finds answers in paragraphs and defines answer confidences.\n", "\n", "Russian ODQA version performs retrieval only with TF-IDF index.\n", "\n", "Binary Passage Retrieval is a resource-efficient method of building a dense passage index. The dual encoder (with BERT or other Transformer as backbone) is trained on question answering dataset (Natural Questions in our case) to maximize dot product of question and passage with answer embeddings and minimize otherwise. The question or passage embeddings are obtained the following way: vector of BERT CLS-token is fed into a dense layer followed by a hash function which turns dense vector into binary one.\n", "\n", "# 2.
Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The example below is given for basic ODQA config [en_odqa_infer_wiki](https://github.com/deeppavlov/DeepPavlov/blob/1.1.1/deeppavlov/configs/odqa/en_odqa_infer_wiki.json).\n", "Check what [other ODQA configs](#3.-Models-list) are available and simply replace `en_odqa_infer_wiki`\n", "with the config name of your preference. [What is a Config File?](https://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "Before using the model make sure that all required packages are installed running the command:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install en_odqa_infer_wiki" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n", "\n", "# 3. Models list\n", "\n", "The table presents a list of all of the ODQA models available in the DeepPavlov Library.\n", "\n", "| Config | Description |\n", "| :--- | :--- |\n", "| odqa/en_odqa_infer_wiki.json | Basic config for **English** language. Consists of Binary Passage Retrieval, TF-IDF retrieval and reader. |\n", "| odqa/en_odqa_pop_infer_wiki.json | Extended config for **English** language. Consists of Binary Passage Retrieval, TF-IDF retrieval, popularity ranker and reader. |\n", "| odqa/ru_odqa_infer_wiki.json | Basic config for **Russian** language. Consists of TF-IDF ranker and reader.
|\n", "\n", "The table presents the scores on Natural Questions and SberQuAD dataset and memory consumption.\n", "\n", "| Config | Number of
paragraphs | Dataset | F1 | EM | RAM | GPU | Time for
1 query |\n", "| :--- | :---: | :--- | :---: | :---: | :---: | :---: | :---: |\n", "| odqa/en_odqa_infer_wiki.json | 200 | Natural Questions | 45.2 | 37.0 | 10.4 | 2.4 | 4.9 s |\n", "| odqa/ru_odqa_infer_wiki.json | 100 | SberQuAD | 59.2 | 49.0 | 13.1 | 5.3 | 2.0 s |\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "### English" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "odqa_en = build_model('en_odqa_infer_wiki', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Input**: List[questions]\n", "\n", "**Output**: Tuple[List[answers], List[answer scores], List[answer places in paragraph]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['Luke Skywalker'], [4.196979999542236]]\n" ] } ], "source": [ "odqa_en([\"What is the name of Darth Vader's son?\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Russian" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "odqa_ru = build_model('ru_odqa_infer_wiki', download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['на востоке и юге Австралии'], [0.9999760985374451]]\n" ] } ], "source": [ "odqa_ru([\"Где живут кенгуру?\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!
python -m deeppavlov interact en_odqa_infer_wiki -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "# 5. Customize the model\n", "\n", "## 5.1 Description of config parameters\n", "\n", "Parameters of ``bpr`` component:\n", " \n", "- ``load_path`` - path with checkpoint of query encoder and bpr index;\n", "- ``query_encoder_file`` - filename of query encoder (Transformer-based model which takes a question as input and obtains its binary embedding);\n", "- ``bpr_index`` - filename with BPR index (matrix of paragraph binary vectors);\n", "- ``pretrained_model`` - Transformer model, used in query encoder;\n", "- ``max_query_length`` - maximal length (in sub-tokens) of the input to the query encoder;\n", "- ``top_n`` - how many paragraph IDs to return per a question.\n", "\n", "Parameters of ``tfidf_ranker`` component:\n", "\n", "- ``top_n`` - how many paragraph IDs to return per a question.\n", "\n", "Parameters of ``logit_ranker`` component:\n", "\n", "- ``batch_size`` - the paragraphs from the database (some of which contain the answer to the question, others - do not contain) will be split into batches with the size ``batch_size`` for extraction of candidate answer in each paragraph;\n", "- ``squad_model`` - the model which finds spans of an answer in a paragraph;\n", "- ``sort_noans`` - whether to put paragraphs with no answer in the end of paragraph list, sorted by confidences;\n", "- ``top_n`` - the number of possible answers for a question;\n", "- ``return_answer_sentence`` - whether to return the sentence from the paragraph with the answer.\n", "\n", "## 5.2 Building the index and training the reader model\n", "\n", "There are two customizable components in ODQA configs:\n", "\n", "- TF-IDF ranker;\n", "- Reading comprehension 
model.\n", "\n", "If you would like to build the TF-IDF index for your own text database, read [here](https://docs.deeppavlov.ai/en/master/features/models/tfidf_ranking.html#ranker-training). \n", "\n", "In addition, to train the Reader on your data, read [here](https://docs.deeppavlov.ai/en/master/features/models/SQuAD.html#4.1-Train-your-model-from-Python)." ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/SQuAD.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Context Question Answering\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/SQuAD.ipynb)\n", "\n", "[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/deeppavlov/developing-qa-systems-for-any-language-with-deeppavlov-a9033d5231a8)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", " \n", "5. [Train the model on your data](#5.-Train-the-model-on-your-data)\n", " \n", " 5.1. [from Python](#5.1-Train-your-model-from-Python)\n", " \n", " 5.2. [from CLI](#5.2-Train-your-model-from-CLI)\n", " \n", "6. [Evaluate](#6.-Evaluate)\n", " \n", " 6.1. [from Python](#6.1-Evaluate-from-Python)\n", " \n", " 6.2. [from CLI](#6.2-Evaluate-from-CLI)\n", "\n", "# 1. 
Introduction to the task\n", "\n", "Context Question Answering is a task of finding a fragment with an answer to a question in a given segment of context.\n", "\n", "**Context**:\n", "\n", "```\n", "In meteorology, precipitation is any product of the condensation \n", "of atmospheric water vapor that falls under gravity. The main forms \n", "of precipitation include drizzle, rain, sleet, snow, graupel and hail… \n", "Precipitation forms as smaller droplets coalesce via collision with \n", "other rain drops or ice crystals within a cloud. Short, intense periods \n", "of rain in scattered locations are called “showers”.\n", "```\n", "\n", "**Question**:\n", "```\n", "Where do water droplets collide with ice crystals to form precipitation?\n", "```\n", "\n", "**Answer**: \n", "```\n", "within a cloud\n", "```\n", "\n", "Datasets that follow this task format:\n", "\n", "- [Stanford Question Answering Dataset (SQuAD) (EN)](https://rajpurkar.github.io/SQuAD-explorer/)\n", "\n", "- [SberQuAD (RU)](https://paperswithcode.com/dataset/sberquad)\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install squad_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`squad_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes its hyperparameters. 
To use another model, change the name of the *config_file* here and further.\n", "The full list of the models with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. Models list\n", "\n", "The table presents a list of all of the Context Question Answering models available in DeepPavlov Library.\n", "\n", "| Config name | Dataset | Language | Model Size | F1 score | EM |\n", "| :--- | --- | --- | --- | --- | ---: |\n", "| squad_bert | SQuAD v1.1 | En | 1.3 GB | 88.86 | 81.49 |\n", "| qa_squad2_bert | SQuAD v2.0 | En | 1.3 GB | 83.56 | 75.54 |\n", "| qa_multisberquad_bert | MultiSQuAD | Multi | 2 GB | 80.76 | 63.81 |\n", "| squad_ru_bert | SberQuAD | Ru | 2.0 GB | 84.71 | 66.21 |\n", "| squad_ru_convers_distilrubert_2L | SberQuAD | Ru | 1.2 GB | 65.20 | 44.52 |\n", "| squad_ru_convers_distilrubert_6L | SberQuAD | Ru | 1.6 GB | 80.57 | 61.54 |\n", "\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "model = build_model('squad_bert', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Input**: List[context], List[question]\n", "\n", "**Output**: List[answer, start_character, logit]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['a library for NLP and dialog systems'], [14], [200928.390625]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model(['DeepPavlov is a library for NLP and dialog systems.'], ['What is DeepPavlov?'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov interact squad_bert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "Or make predictions for samples from *stdin*." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov predict squad_bert -f " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Train the model on your data\n", "\n", "\n", "## 5.1 Train your model from Python\n", "\n", "### Provide your data path\n", "\n", "To train the model on your data, you need to change the path to the training data in the *config_file*.\n", "\n", "Parse the *config_file* and change the path to your data from Python." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "~/.deeppavlov/downloads/squad/\n" ] } ], "source": [ "from deeppavlov import train_model\n", "from deeppavlov.core.commands.utils import parse_config\n", "\n", "model_config = parse_config('squad_bert')\n", "\n", "# dataset that the model was trained on\n", "print(model_config['dataset_reader']['data_path'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Provide a *data_path* to your own dataset. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download and unzip a new example dataset\n", "!wget http://files.deeppavlov.ai/datasets/squad-v1.1.tar.gz\n", "!tar -xzvf \"squad-v1.1.tar.gz\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that if you want to provide your own dataset, it should have the same format as the SQuAD dataset downloaded in this cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# provide a path to the train file\n", "model_config['dataset_reader']['data_path'] = '/contents/train-v1.1.json'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SQuAD dataset info\n", "\n", "There are *two* versions of the SQuAD dataset available for training at the moment: \n", "\n", "- [SQuAD 1.1](https://arxiv.org/abs/1606.05250) contains 107,785 question-answer pairs on 536 articles. Dataset size: `33.52 MiB`.\n", "\n", "- [SQuAD 2.0](https://arxiv.org/abs/1806.03822) combines all of the questions from SQuAD 1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers. 
Dataset size: `44.34 MiB`.\n", "\n", "### Train the model using new config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = train_model(model_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use your model for prediction." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['a library for NLP and dialog systems'], [14], [200928.390625]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model(['DeepPavlov is a library for NLP and dialog systems.'], ['What is DeepPavlov?'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.2 Train your model from CLI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov train squad_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Evaluate\n", "\n", "## 6.1 Evaluate from Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import evaluate_model\n", "\n", "model = evaluate_model('squad_bert', download=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.2 Evaluate from CLI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov evaluate squad_bert -d" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/bert.rst ================================================ BERT in DeepPavlov ================== BERT (Bidirectional Encoder Representations from Transformers) is a Transformer pre-trained on masked language model and next sentence prediction tasks. This approach showed state-of-the-art results on a wide range of NLP tasks in English. 
| BERT paper: https://arxiv.org/abs/1810.04805 | Google Research BERT repository: https://github.com/google-research/bert There are several pre-trained BERT models released by Google Research, more details about these pre-trained models could be found here: https://github.com/google-research/bert#pre-trained-models - BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ - BERT-base, English, uncased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ - BERT-large, English, cased, 24-layer, 1024-hidden, 16-heads, 340M parameters: download from `[google] `__ - BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: download from `[google] `__, `[deeppavlov] `__, `[deeppavlov_pytorch] `__ - BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__, `[deeppavlov_pytorch] `__ We have trained BERT-base model for other languages and domains: - RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, `[deeppavlov_pytorch] `__ - SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, `[deeppavlov_pytorch] `__ - Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__, `[deeppavlov_pytorch] `__ - Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, `[deeppavlov_pytorch] `__ - Conversational DistilRuBERT, Russian, cased, 6-layer, 768-hidden, 12-heads, 135.4M parameters: `[deeppavlov_pytorch] `__ - Conversational DistilRuBERT-tiny, Russian, cased, 2-layer, 768-hidden, 12-heads, 107M parameters: `[deeppavlov_pytorch] `__ - Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, `[deeppavlov_pytorch] `__ - Sentence RuBERT, 
Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, `[deeppavlov_pytorch] `__ The ``deeppavlov_pytorch`` models are designed to be run with the `HuggingFace's Transformers `__ library. RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian. Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT. The model is described in our ACL paper [2]_. Conversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [4]_, OpenSubtitles [5]_, Debates [6]_, Blogs [7]_, Facebook News Comments. We used this training data to build the vocabulary of English subtokens and took English cased version of BERT-base as initialization for English Conversational BERT. Conversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_. We assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT. Conversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and highly inspired by DistilBERT [3]_. Namely, Distil* models (students) used pretrained Conversational RuBERT as teacher and linear combination of the following losses: 1. Masked language modeling loss (between student output logits for tokens and its true labels) 2. Kullback-Leibler divergence (between student and teacher output logits) 3. Cosine embedding loss (between averaged hidden states of the teacher and hidden states of the student) 4. 
Mean squared error loss (between averaged attention maps of the teacher and attention maps of the student) Sentence Multilingual BERT is a representation-based sentence encoder for 101 languages of Multilingual BERT. It is initialized with Multilingual BERT and then fine-tuned on English MultiNLI [9]_ and on the dev set of multilingual XNLI [10]_. Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_. Sentence RuBERT is a representation-based sentence encoder for Russian. It is initialized with RuBERT and fine-tuned on SNLI [11]_ Google-translated to Russian and on the Russian part of XNLI dev set [10]_. Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_. Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and ranking. We also provide pre-trained models and examples on how to use BERT with DeepPavlov. BERT as Embedder ---------------- :class:`~deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder` allows for using BERT model outputs as token, subtoken and sentence level embeddings. Additionally, the embeddings can be easily used in DeepPavlov. To get text level, token level and subtoken level representations, you can use or modify a :config:`BERT embedder configuration `: ..
code:: python from deeppavlov.core.common.file import read_json from deeppavlov import build_model, configs bert_config = read_json(configs.embedder.bert_embedder) bert_config['metadata']['variables']['BERT_PATH'] = 'path/to/bert/directory' m = build_model(bert_config) texts = ['Hi, i want my embedding.', 'And mine too, please!'] tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = m(texts) BERT for Classification ----------------------- :class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel` provides solution for classification problem using pre-trained BERT on PyTorch. One can use several pre-trained English, multi-lingual and Russian BERT models that are listed above. :class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel` also supports any Transformer-based model of `Transformers `. Two main components of BERT classifier pipeline in DeepPavlov are :class:`~deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor` and :class:`~deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel`. Non-processed texts should be given to ``torch_transformers_preprocessor`` for tokenization on subtokens, encoding subtokens with their indices and creating tokens and segment masks. ``torch_transformers_classifier`` has a dense layer of number of classes size upon pooled outputs of Transformer encoder, it is followed by ``softmax`` activation (``sigmoid`` if ``multilabel`` parameter is set to ``true`` in config). BERT for Named Entity Recognition (Sequence Tagging) ---------------------------------------------------- Pre-trained BERT model can be used for sequence tagging. Examples of BERT application to sequence tagging can be found :doc:`here `. 
The module used for tagging is :class:`~deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger`. The tags are obtained by applying a dense layer to the representation of the first subtoken of each word. There is also an optional CRF layer on the top. You can choose among different Transformers architectures by modifying the TRANSFORMER variable in the corresponding configuration files. The possible choices are DistilBert, Albert, Camembert, XLMRoberta, Bart, Roberta, Bert, XLNet, Flaubert, XLM. .. TODO: fix Zero-Shot NER reference Multilingual BERT model allows performing zero-shot transfer across languages. To use our 19 tags NER for over a hundred languages see ner_multi_bert. BERT for Context Question Answering (SQuAD) ------------------------------------------- Context Question Answering on `SQuAD `__ dataset is a task of looking for an answer to a question in a given context. This task could be formalized as predicting answer start and end position in a given context. :class:`~deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad` on PyTorch uses two linear transformations to predict probability that current subtoken is start/end position of an answer. For details check :doc:`Context Question Answering documentation page `. Using custom BERT in DeepPavlov ------------------------------- The previous sections describe the BERT based models implemented in DeepPavlov. To change the BERT model used for initialization in any downstream task mentioned above the following parameters of the :doc:`config ` file must be changed to match new BERT path: * download URL in the ``metadata.download.url`` part of the config * ``bert_config_file``, ``pretrained_bert`` in the BERT based Component. In case of PyTorch BERT, ``pretrained_bert`` can be assigned to string name of any Transformer-based model (e.g. ``"bert-base-uncased"``, ``"distilbert-base-uncased"``) and then ``bert_config_file`` is set to ``None``.
* ``vocab_file`` in the ``torch_transformers_preprocessor``. ``vocab_file`` can be assigned to string name of used pre-trained BERT (e.g. ``"bert-base-uncased"``). .. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213. .. [2] Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. (2019). `Tuning Multilingual Transformers for Language-Specific Named Entity Recognition `__ . ACL anthology W19-3712. .. [3] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108. .. [4] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. .. [5] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016) .. [6] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. .. [7] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. .. [8] Shavrina T., Shapovalova O. (2017) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017. .. [9] Williams A., Nangia N. & Bowman S. (2017) A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. arXiv preprint arXiv:1704.05426 .. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053 .. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. 
arXiv preprint arXiv:1508.05326 .. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084 ================================================ FILE: docs/features/models/classification.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Classification\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/classification.ipynb)\n", "\n", "[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/deeppavlov/text-classification-using-deeppavlov-library-with-pytorch-and-transformers-f14db5528821)\n", "\n", "# Table of contents\n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", "\n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Evaluation](#5.-Evaluation)\n", "\n", " 5.1. [from Python](#5.1-Evaluate-from-Python)\n", "\n", " 5.2. [from CLI](#5.2-Evaluate-from-CLI)\n", "\n", "6. [Train the model on your data](#6.-Train-the-model-on-your-data)\n", "\n", " 6.1. [from Python](#6.1-Train-your-model-from-Python)\n", "\n", " 6.2. [from CLI](#6.2-Train-your-model-from-CLI)\n", "\n", "7. [Simple few-shot classifiers](#7.-Simple-few-shot-classifiers)\n", "\n", " 7.1. [Few-shot setting](#7.1-Few-shot-setting)\n", "\n", " 7.2. [Multiple languages support](#7.2-Multiple-languages-support)\n", "\n", " 7.3. [Dataset and Scores](#7.3-Dataset-and-Scores)\n", "\n", "# 1. 
Introduction to the task\n", "This section describes a family of BERT-based models that solve a variety of different classification tasks.\n", "\n", "**Insults detection** is a binary classification task of identifying whether a given sequence is an insult to another participant in the conversation.\n", "\n", "**Sentiment analysis** is a task of classifying the polarity of the given sequence. The number of classes may vary depending on the data: positive/negative binary classification, multiclass classification with a neutral class added or with a number of different emotions.\n", "\n", "The models trained for the **paraphrase detection** task identify whether two sentences expressed with different words convey the same meaning.\n", "\n", "**Topic classification** refers to the task of classifying an utterance by the topic which belongs to the conversational domain.\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install insults_kaggle_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`insults_kaggle_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "Configuration file defines the model and describes its hyperparameters.
To use another model, change the name of the *config_file* here and further.\n", "The full list of classification models with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. Models list\n", "\n", "The table presents a list of all of the classification models available in DeepPavlov Library.\n", "\n", "| Config name | Language | Task | Dataset | Model Size | Metric | Score |\n", "| :--- | --- | --- | --- | --- | --- | ---: |\n", "| insults_kaggle_bert | En | Insults | [Insults](https://www.kaggle.com/c/detecting-insults-in-social-commentary) | 1.1 GB | ROC-AUC | 0.8770 |\n", "| paraphraser_rubert | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 2.0 GB | F1 | 0.8738 |\n", "| paraphraser_convers_distilrubert_2L | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 1.2 GB | F1 | 0.7396 |\n", "| paraphraser_convers_distilrubert_6L | Ru | Paraphrase | [Paraphrase Corpus](http://paraphraser.ru/download/) | 1.6 GB | F1 | 0.8354 |\n", "| sentiment_sst_conv_bert | En | Sentiment | [SST](https://paperswithcode.com/dataset/sst) | 1.1 GB | Accuracy | 0.6626 |\n", "| sentiment_twitter | Ru | Sentiment | [Twitter Mokoron](https://github.com/mokoron/sentirueval) | 6.2 GB | F1-macro | 0.9961 |\n", "| rusentiment_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.3 GB | F1-weighted | 0.7005 |\n", "| rusentiment_convers_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.5 GB | F1-weighted | 0.7724 |\n", "| topics_distilbert_base_uncased | En | Topics | [DeepPavlov Topics](https://deeppavlov.ai/datasets/topics) | 6.2 GB | F1-macro | 0.9961 |\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "model = build_model('insults_kaggle_bert', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Input format**: List[sentences]\n", "\n", "**Output format**: List[labels]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Insult', 'Not Insult']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model(['You are kind of stupid', 'You are a wonderful person!'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov interact insults_kaggle_bert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "Or make predictions for samples from *stdin*." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov predict insults_kaggle_bert -f " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.
Evaluation\n", "\n", "## 5.1 Evaluate from Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import evaluate_model\n", "\n", "model = evaluate_model('insults_kaggle_bert', download=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.2 Evaluate from CLI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov evaluate insults_kaggle_bert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Train the model on your data\n", "\n", "## 6.1 Train your model from Python\n", "\n", "### Provide your data path\n", "\n", "To train the model on your data, you need to change the path to the training data in the *config_file*.\n", "\n", "Parse the *config_file* and change the path to your data from Python." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "~/.deeppavlov/downloads/insults_data\n" ] } ], "source": [ "from deeppavlov import train_model\n", "from deeppavlov.core.commands.utils import parse_config\n", "\n", "model_config = parse_config('insults_kaggle_bert')\n", "\n", "# dataset that the model was trained on\n", "print(model_config['dataset_reader']['data_path'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Provide a *data_path* to your own dataset. You can also change any of the hyperparameters of the model." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download and unzip a new example dataset\n", "!wget http://files.deeppavlov.ai/datasets/insults_data.tar.gz\n", "!tar -xzvf \"insults_data.tar.gz\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# provide a path to the directory with your train, valid and test files\n", "model_config['dataset_reader']['data_path'] = \"./contents/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Train dataset format\n", "\n", "### Train the model using new config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = train_model(model_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use your model for prediction." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Insult', 'Not Insult']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model(['You are kind of stupid', 'You are a wonderful person!'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.2 Train your model from CLI\n", "\n", "To train the model on your data, create a copy of a config file and change the *data_path* variable in it. After that, train the model using your new *config_file*. You can also change any of the hyperparameters of the model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov train model_config.json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. 
Simple few-shot classifiers\n", "\n", "Additionally, in the [faq](https://github.com/deeppavlov/DeepPavlov/tree/master/deeppavlov/configs/faq) section you can find a config for a fast and simple pre-BERT model, which consists of a fasttext vectorizer and a simple logistic regression classifier.\n", "\n", "## 7.1 Few-shot setting\n", "\n", "In the current setting the config can be used for few-shot classification - a task, in which only a few training examples are available for each class (usually from 5 to 10). Note that the config takes the full version of the dataset as the input and samples N examples for each class of the train data in the iterator.\n", "\n", "The sampling is done within the `basic_classification_iterator` component of the pipeline and the `shot` parameter defines the number of examples to be sampled. By default the `shot` parameter is set to `None` (no sampling applied).\n", "\n", "## 7.2 Multiple languages support\n", "\n", "By default `fasttext_logreg` supports classification in English, but can be modified for classification in Russian.\n", "\n", "In order to change `fasttext_logreg` language to Russian, change `LANGUAGE` variable in the `metadata.variables` section from `en` to `ru` and change the Spacy model by changing `SPACY_MODEL` variable from `en_core_web_sm` to `ru_core_news_sm`.\n", "\n", "You can do that by directly editing the config file through an editor or change it through Python (example below). N.B. `read_json` and `find_config` combination is intentionally used instead of `parse_config` to read config in the example, because `parse_config` will replace all `LANGUAGE` and `SPACY_MODEL` usages in the config with the default values from `metadata.variables`." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "from deeppavlov.core.common.file import read_json, find_config\n", "\n", "model_config = read_json(find_config('fasttext_logreg'))\n", "model_config['metadata']['variables']['LANGUAGE'] = 'ru'\n", "model_config['metadata']['variables']['SPACY_MODEL'] = 'ru_core_news_sm'\n", "model = build_model(model_config, install=True, download=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7.3 Dataset and Scores\n", "\n", "To demonstrate the performance of the model in two languages, we use the English and Russian subsets of [the MASSIVE dataset](https://github.com/alexa/massive).\n", "\n", "MASSIVE is a parallel dataset of utterances in 52 languages with annotations for the Natural Language Understanding tasks of intent prediction and slot annotation. We only employ the intent classification data. You can see the results of the given configs in 5-shot classification setting in the table below.\n", "\n", "| Config name | Language | Train accuracy | Validation accuracy | Test accuracy |\n", "| :--- | --- | --- | --- | ---: |\n", "| fasttext_logreg | en | 0.9632 | 0.5239 | 0.5155 |\n", "| fasttext_logreg | ru | 0.9231 | 0.4565 | 0.4304 |" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/entity_extraction.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Entity Extraction\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/entity_extraction.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. 
[Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1 [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2 [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", " \n", " 5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\n", " \n", " 5.2 [Training entity detection model](#5.2-Training-entity-detection-model)\n", " \n", " 5.3 [Using custom knowledge base](#5.3-Using-custom-knowledge-base)\n", "\n", "# 1. Introduction to the task\n", "\n", "**Entity Detection** is the task of identifying entity mentions in text with corresponding entity types. Entity Detection models in DeepPavlov split the input text into fragments shorter than 512 tokens and find entities with BERT-based models.\n", "\n", "**Entity Linking** is the task of finding knowledge base entity ids for entity mentions in text. Entity Linking in DeepPavlov supports Wikidata and Wikipedia. Entity Linking component performs the following steps:\n", "\n", "* extraction of candidate entities from SQLite database;\n", "* candidate entities sorting by entity tags (if entity tags are provided);\n", "* ranking of candidate entities by connections in Wikidata knowledge graph of candidate entities for different mentions;\n", "* candidate entities ranking by context and descriptions using Transformer model [bert-small](https://huggingface.co/prajjwal1/bert-small) in English config and [distilrubert-tiny](https://huggingface.co/DeepPavlov/distilrubert-tiny-cased-conversational-v1) in Russian config.\n", "\n", "**Entity Extraction** configs perform subsequent Entity Detection and Entity Linking of extracted entity mentions.\n", "\n", "# 2. 
Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install entity_extraction_en" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`entity_extraction_en` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n", "The full list of models for entity detection, linking and extraction with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. Models list\n", "\n", "The table presents a list of all of the models for entity detection, linking and extraction available in the DeepPavlov Library.\n", "\n", "| Config name | Language | RAM | GPU |\n", "| :--- | --- | --- | --- |\n", "| entity_detection_en | En | 2.5 Gb | 3.7 Gb |\n", "| entity_detection_ru | Ru | 2.5 Gb | 5.3 Gb |\n", "| entity_linking_en | En | 2.4 Gb | 1.2 Gb |\n", "| entity_linking_ru | Ru | 2.2 Gb | 1.1 Gb |\n", "| entity_extraction_en | En | 2.5 Gb | 3.7 Gb |\n", "| entity_extraction_ru | Ru | 2.5 Gb | 5.3 Gb |\n", "\n", "# 4. 
Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict.\n", "\n", "### Entity Detection\n", "\n", "**For English:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "from deeppavlov import build_model\n", "\n", "ed_en = build_model('entity_detection_en', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The output elements:**\n", "\n", "* entity substrings\n", "* entity offsets (indices of start and end symbols of entities in text)\n", "* entity positions (indices of entity tokens in text)\n", "* entity tags\n", "* sentences offsets\n", "* list of sentences in text\n", "* confidences of detected entities" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ed_en(['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**For Russian:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ed_ru = build_model('entity_detection_ru', download=True, install=True)\n", "ed_ru(['Москва — столица России, центр Центрального федерального округа и центр Московской области.'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Entity Linking\n", "\n", "**For English:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "el_en = build_model('entity_linking_en', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The input elements:**\n", "\n", "* entity substrings\n", "* entity tags (optional argument)\n", "* confidences of entity substrings (optional argument)\n", "* sentences (context) of the entities (optional 
argument)\n", "* entity offsets (optional argument)\n", "* sentences offsets (optional argument)\n", "\n", "**The output elements:**\n", "\n", "* entity ids\n", "* entity confidences (for each entity - the list with three confidences: substring matching confidence, popularity ranking confidence and context ranking confidence)\n", "* entity pages in Wikipedia\n", "* entity labels in Wikidata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "el_en([['forrest gump', 'robert zemeckis', 'eric roth']],\n", " [['WORK_OF_ART', 'PERSON', 'PERSON']],\n", " [[1.0, 1.0, 1.0]],\n", " [['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.']],\n", " [[(0, 12), (48, 63), (79, 88)]],\n", " [[(0, 89)]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**For Russian:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "el_ru = build_model('entity_linking_ru', download=True, install=True)\n", "\n", "el_ru([['москва', 'россии', 'центрального федерального округа', 'московской области']],\n", " [['CITY', 'COUNTRY', 'LOC', 'LOC']],\n", " [[1.0, 1.0, 1.0, 1.0]],\n", " [['Москва — столица России, центр Центрального федерального округа и центр Московской области.']],\n", " [[(0, 6), (17, 23), (31, 63), (72, 90)]],\n", " [[(0, 91)]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Entity Extraction\n", "\n", "**For English:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ex_en = build_model('entity_extraction_en', download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The output elements:**\n", "\n", "* entity substrings\n", "* entity tags\n", "* entity offsets\n", "* entity ids in the knowledge base\n", "* entity linking confidences\n", "* entity pages\n", "* entity labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "ex_en(['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**For Russian:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ex_ru = build_model('entity_extraction_ru', download=True, install=True)\n", "\n", "ex_ru(['Москва — столица России, центр Центрального федерального округа и центр Московской области.'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact entity_extraction_en -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Customize the model\n", "\n", "## 5.1 Description of config parameters\n", "\n", "Parameters of ``ner_chunker`` component:\n", "\n", "- ``batch_size: int`` - each text from the input text batch is split into chunks with the length lower than the threshold (because Transformer-based models for entity detection work with limited lengths of the input sequences), then all chunks are concatenated into one list and the list is split into batches of the size ``batch_size``;\n", "- ``max_seq_len: int`` - maximum length of chunk (in wordpiece tokens);\n", "- ``vocab_file: str`` - vocab file of Transformer tokenizer, which is used to tokenize the text for further splitting into chunks.\n", "\n", "Parameters of ``entity_detection_parser`` component:\n", " \n", "- ``thres_proba: float`` - the NER models return tag confidences for each token; if the probability of \"O\" tag (which is used for tokens not related to entities) for the token is lower than the ``thres_proba``, the tag with the maximum probability from entity tags list is chosen;\n", "- ``o_tag: str`` - tag for non-entity tokens (by 
default is \"O\" tag);\n", "- ``tags_file: str`` - the filename with the list of tags used in the NER model.\n", "\n", "Parameters of ``ner_chunk_model`` component:\n", "\n", "- ``ner: deeppavlov.core.common.chainer:Chainer`` - the config for entity recognition, which defines entity tags (or \"O\" tag) and tag probabilities for each token in the input text;\n", "- ``ner_parser: deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser`` - the component which processes the tags and tag probabilities returned by the entity recognition model and defines entity substrings;\n", "- ``ner2: deeppavlov.core.common.chainer:Chainer`` - (optional) an additional entity recognition config, which can improve the quality of entity recognition in the case of joint usage with ``ner`` config;\n", "- ``ner_parser2: deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser`` - (optional) an additional config for processing entity recognition output.\n", "\n", "Parameters of ``entity_linker`` component:\n", "\n", "- ``load_path: str`` - the path to the folder with the inverted index;\n", "- ``entity_ranker`` - the component for ranking of candidate entities by descriptions;\n", "- ``entities_database_filename: str`` - file with the inverted index (the mapping between entity titles and entity IDs);\n", "- ``words_dict_filename: str`` - file with mapping of entity titles to the tags of entity detection model;\n", "- ``ngrams_matrix_filename: str`` - matrix of char ngrams of words from entity titles from the knowledge base;\n", "- ``num_entities_for_bert_ranking: int`` - number of candidate entities which are re-ranked by context and description using Transformer-based model;\n", "- ``num_entities_for_conn_ranking: int`` - number of candidate entities which are re-ranked by connections in the knowledge graph between entities for different mentions in the text;\n", "- ``num_entities_to_return: int`` - the number of entity IDs, returned for 
each entity mention in text; \n", "- ``max_paragraph_len: int`` - maximum length of context used for ranking of entities by description;\n", "- ``lang: str`` - language of the entity linking model (Russian or English);\n", "- ``use_descriptions: bool`` - whether to perform ranking of candidate entities by similarity of their descriptions to the context;\n", "- ``alias_coef: float`` - the coefficient which is multiplied by the substring matching score of the entity if the entity mention in the text matches with the entity title;\n", "- ``use_tags: bool`` - whether to search only those entity IDs in the inverted index, which have the same tag as the entity mention;\n", "- ``lemmatize: bool`` - whether to lemmatize entity mentions before searching candidate entity IDs in the inverted index;\n", "- ``full_paragraph: bool`` - whether to use full context for ranking of entities by descriptions or cut the paragraph to one sentence with entity mention;\n", "- ``use_connections: bool`` - whether to use connections between candidate entities for different mentions for ranking;\n", "- ``kb_filename: str`` - file with the knowledge base in .hdt format;\n", "- ``prefixes: Dict[str, Any]`` - prefixes in the knowledge base for entities and relations.\n", "\n", "## 5.2 Training entity detection model\n", "\n", "The configs `entity_detection_en` and `entity_extraction_en` use `ner_ontonotes_bert` model for detection of entity mentions, the configs `entity_detection_ru` and `entity_extraction_ru` use `ner_rus_bert_probas` model. [How to train a NER model](http://docs.deeppavlov.ai/en/master/features/models/NER.html#6.-Customize-the-model).\n", "\n", "## 5.3 Using custom knowledge base\n", "\n", "The database filename is defined with the **entities_database_filename** in entity linking configs. The file is in SQLite format with FTS5 extensions for full-text search of entities by entity mention. 
The database file should contain the **inverted_index** table with the following columns:\n", "\n", "* ``title`` - entity title (name or alias) in the knowledge base;\n", "* ``entity_id`` - entity ID in the knowledge base;\n", "* ``num_rels`` - number of relations of the entity with other entities in the knowledge graph;\n", "* ``ent_tag`` - entity tag of the entity detection model (for example, CITY, PERSON, WORK_OF_ART, etc.);\n", "* ``page`` - page title of the entity (for Wikidata entities - the Wikipedia page);\n", "* ``label`` - entity label in the knowledge base;\n", "* ``descr`` - entity description in the knowledge base.\n", "\n", "Tags of entities in the knowledge base should correspond with the tags of the custom NER model or default `ner_ontonotes_bert` or `ner_rus_bert_probas` models. The list of `ner_ontonotes_bert` tags can be found in the tags.dict file in the ~/.deeppavlov/models/ner_ontonotes_bert_torch_crf directory, and the list of `ner_rus_bert_probas` tags - in the tags.dict file in the ~/.deeppavlov/models/wiki_ner_rus_bert directory." ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/few_shot_classification.ipynb ================================================ { "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "#### Few-shot Text Classification\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1 [Dataset format](#4.1-Dataset-format)\n", "\n", " 4.2. [Predict using Python](#4.2-Predict-using-Python)\n", " \n", " 4.3. [Predict using CLI](#4.3-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", "\n", "# 1. 
Introduction to the task\n", "\n", "__Text classification__ is the task of assigning one of the pre-defined labels to an utterance, where the label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider the few-shot setting, where only a few examples (5 or 10) per intent class are given as a training set.\n", "\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install few_shot_roberta" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "`few_shot_roberta` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n", "Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. 
Models list\n", "\n", "At the moment, only the `few_shot_roberta` config supports out-of-scope detection.\n", "\n", "| Config name | Dataset | Shot | Model Size | In-domain accuracy | Out-of-scope recall | Out-of-scope precision |\n", "| :--- | --- | --- | --- | --- | --- | ---: |\n", "| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 84.1±1.9 | 93.2±0.8 | 97.8±0.3 |\n", "| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 59.4±1.4 | 87.9±1.2 | 40.3±0.7 |\n", "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 51.4±2.1 | 93.7±0.7 | 82.7±1.4 |\n", "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB |24.8±2.2 | 98.2±0.4 | 74.8±0.6 |\n", "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 13.4±0.5 | 98.6±0.2 | 20.5±0.1 |\n", "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 37 KB |10.7±0.8 | 99.0±0.3 | 36.4±0.2 |\n", "\n", "\n", "With a zero threshold we get the classification accuracy without OOS detection:\n", "\n", "| Config name | Dataset | Shot | Model Size | Accuracy |\n", "| :--- | --- | --- | --- | ---: |\n", "| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 89.6 |\n", "| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 79.6 |\n", "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 55.1 |\n", "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 86.3 
|\n", "| 
fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 73.6 |\n", "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 37 KB | 51.6 |\n", "\n", "\\* \\- config file was modified to predict OOS examples\n", "\n", "\n", "# 4. Use the model for prediction\n", "\n", "The base model `few_shot_roberta` was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model for prediction and evaluation. No additional training is needed.\n", "\n", "## 4.1 Dataset format\n", "\n", "The DNNC model compares the input text to every example in the dataset to determine which class the input example belongs to. The dataset based on which classification is performed has the following format:\n", "\n", "```\n", "[\n", " [\"text_1\", \"label_1\"],\n", " [\"text_2\", \"label_2\"],\n", " ...\n", " [\"text_n\", \"label_n\"]\n", "]\n", "```\n", "\n", "## 4.2 Predict using Python\n", "\n", "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "model = build_model(\"few_shot_roberta\", download=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "If you set `download` flag to `True`, then existing model weights will be overwritten.\n", "\n", "Setting the `install` argument to `True` is equivalent to executing the command line `install` command. 
If set to `True`, it will first install all the required packages.\n", "\n", "**Input**: List[texts, dataset]\n", "\n", "**Output**: List[labels]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['translate', 'exchange_rate', 'car_rental']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "texts = [\n", " \"what expression would i use to say i love you if i were an italian\",\n", " \"what's the currency conversion between krones and yen\",\n", " \"i'd like to reserve a high-end car\"\n", "]\n", "\n", "dataset = [\n", " [\"please help me book a rental car for nashville\", \"car_rental\"],\n", " [\"how can i rent a car in boston\", \"car_rental\"],\n", " [\"help me get a rental car for march 2 to 6th\", \"car_rental\"],\n", " \n", " [\"how many pesos can i get for one dollar\", \"exchange_rate\"],\n", " [\"tell me the exchange rate between rubles and dollars\", \"exchange_rate\"],\n", " [\"what is the exchange rate in pesos for 100 dollars\", \"exchange_rate\"],\n", " \n", " [\"can you tell me how to say 'i do not speak much spanish', in spanish\", \"translate\"],\n", " [\"please tell me how to ask for a taxi in french\", \"translate\"],\n", " [\"how would i say thank you if i were russian\", \"translate\"]\n", "]\n", "\n", "model(texts, dataset)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## 4.3 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov interact few_shot_roberta -d" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). 
The key `-d` is used to download the pre-trained model along with all other files needed to run the model.\n", "\n", "Or make predictions for samples from *stdin*." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov predict few_shot_roberta -f " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Customize the model\n", "\n", "Out-of-scope (OOS) examples are identified by comparing the model's confidence with the *confidence_threshold* parameter. For each input text, if the confidence of the model is lower than the *confidence_threshold*, then the input example is considered out-of-scope. The higher the threshold, the more often the model predicts the \"oos\" class. By default it is set to 0, but you can change it to suit your needs in the configuration file." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0\n" ] } ], "source": [ "from deeppavlov import build_model\n", "from deeppavlov.core.commands.utils import parse_config\n", "\n", "model_config = parse_config('few_shot_roberta')\n", "model_config['chainer']['pipe'][-1]['confidence_threshold'] = 0.1\n", "model = build_model(model_config)" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/features/models/morpho_tagger.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Morphotagger\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/morpho_tagger.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. 
[Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", "\n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", "\n", "# 1. Introduction to the task\n", "\n", "Morphological tagging is the task of assigning morphological tags, such as case, number, gender, aspect, etc., to text tokens.\n", "\n", "An example:\n", "```\n", "Я шёл домой по незнакомой улице.\n", "```\n", "```\n", "1\tЯ\tя\tPRON\t_\tCase=Nom|Number=Sing|Person=1\t_\t_\t_\t_\n", "2\tшёл\tидти\tVERB\t_\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t_\t_\t_\t_\n", "3\tдомой\tдомой\tADV\t_\tDegree=Pos\t_\t_\t_\t_\n", "4\tпо\tпо\tADP\t_\t_\t_\t_\t_\t_\n", "5\tнезнакомой\tнезнакомый\tADJ\t_\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\t_\t_\t_\t_\n", "6\tулице\tулица\tNOUN\t_\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\t_\t_\t_\t_\n", "7\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n", "```\n", "\n", "The model is based on [BERT for token classification](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForTokenClassification).\n", "The model is trained on [Universal Dependencies corpora](https://universaldependencies.org/) (version 2.3).\n", "\n", "# 2. 
Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before using the model make sure that all required packages are installed running the command:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install morpho_ru_syntagrus_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Models list\n", "\n", "The table presents comparison of ``morpho_ru_syntagrus_bert`` config with other models on UD2.3 dataset.\n", "\n", "| Model | Accuracy |\n", "| :--- | :---: |\n", "| UDPipe | 93.5 |\n", "| morpho_ru_syntagrus_bert | 97.6 |\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "model = build_model(\"morpho_ru_syntagrus_bert\", download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tЯ\tя\tPRON\t_\tCase=Nom|Number=Sing|Person=1\t_\t_\t_\t_\n", "2\tшёл\tшёл\tVERB\t_\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t_\t_\t_\t_\n", "3\tдомой\tдомой\tADV\t_\tDegree=Pos\t_\t_\t_\t_\n", "4\tпо\tпо\tADP\t_\t_\t_\t_\t_\t_\n", "5\tнезнакомой\tнезнакомый\tADJ\t_\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\t_\t_\t_\t_\n", "6\tулице\tулица\tNOUN\t_\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\t_\t_\t_\t_\n", "7\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n", "\n", "1\tДевушка\tдевушка\tNOUN\t_\tAnimacy=Anim|Case=Nom|Gender=Fem|Number=Sing\t_\t_\t_\t_\n", 
"2\tпела\tпеть\tVERB\t_\tAspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t_\t_\t_\t_\n", "3\tв\tв\tADP\t_\t_\t_\t_\t_\t_\n", "4\tцерковном\tцерковном\tADJ\t_\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\t_\t_\t_\t_\n", "5\tхоре\tхор\tNOUN\t_\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\t_\t_\t_\t_\n", "6\tо\tо\tADP\t_\t_\t_\t_\t_\t_\n", "7\tвсех\tвесь\tDET\t_\tCase=Loc|Number=Plur\t_\t_\t_\t_\n", "8\tуставших\tустать\tVERB\t_\tAspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act\t_\t_\t_\t_\n", "9\tв\tв\tADP\t_\t_\t_\t_\t_\t_\n", "10\tчужом\tчужом\tADJ\t_\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\t_\t_\t_\t_\n", "11\tкраю\tкрай\tNOUN\t_\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\t_\t_\t_\t_\n", "12\t.\t.\tPUNCT\t_\t_\t_\t_\t_\t_\n" ] } ], "source": [ "sentences = [\"Я шёл домой по незнакомой улице.\", \"Девушка пела в церковном хоре о всех уставших в чужом краю.\"]\n", "for parse in model(sentences):\n", " print(parse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Сommand Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact morpho_ru_syntagrus_bert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "# 5. Customize the model\n", "\n", "To train **morphotagger** on your own data, you should prepare a dataset in **CoNLL-U format**. 
The description of **CoNLL-U format** can be found [here](https://universaldependencies.org/format.html#conll-u-format).\n", "\n", "Then you should place files for training, validation and testing into the ``\"data_path\"`` directory of ``morphotagger_dataset_reader``, change file names in ``morphotagger_dataset_reader`` to your filenames and launch the training:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import train_model\n", "\n", "train_model(\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "or **using CLI**:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov train " ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/multitask_bert.rst ================================================ Multi-task BERT in DeepPavlov ============================= Multi-task BERT in DeepPavlov is an implementation of the BERT training algorithm published in the paper `Knowledge Transfer Between Tasks and Languages in the Multi-task Encoder-agnostic Transformer-based Models `_. The idea is to share BERT body between several tasks. This is necessary if a model pipe has several components using BERT and the amount of GPU memory is limited. Each task has its own 'head' part attached to the output of the BERT encoder. If multi-task BERT has :math:`T` heads, one training iteration consists of - composing :math:`T` lists of examples, one for each task, - :math:`T` gradient steps, one gradient step for each task. By default, on every training step the lists of examples for all but one task are empty, as in the original MT-DNN repository. When one of the BERT heads is being trained, other heads' parameters do not change. On each training step both BERT head and body parameters are modified. 
Currently multitask bert heads support classification, regression, NER and multiple choice tasks. At this page, multi-task BERT usage is explained on a toy configuration file of a model that is trained for the single-sentence classification, sentence pair classification, regression, multiple choice and NER. The config for this model is :config:`multitask_example `. Other examples of using multitask models can be found in :config:`mt_glue `. Train config ------------ When using ``multitask_transformer`` component, you can use the same inference file as the train file. Data reading and iteration is performed by :class:`~deeppavlov.dataset_readers.multitask_reader.MultiTaskReader` and :class:`~deeppavlov.dataset_iterators.multitask_iterator.MultiTaskIterator`. These classes are composed of task readers and iterators and generate batches that contain data from heterogeneous datasets. Example below demonstrates the usage of multitask dataset reader: .. code:: json "dataset_reader": { "class_name": "multitask_reader", "task_defaults": { "class_name": "huggingface_dataset_reader", "path": "glue", "train": "train", "valid": "validation", "test": "test" }, "tasks": { "cola": {"name": "cola"}, "copa": { "path": "super_glue", "name": "copa" }, "conll": { "class_name": "conll2003_reader", "use_task_defaults": false, "data_path": "{DOWNLOADS_PATH}/conll2003/", "dataset_name": "conll2003", "provide_pos": false } } } Nested dataset readers are listed in the ``tasks`` section. By default, default nested readers parameters are taken from ``task_defaults`` section. Values from the ``tasks`` could complement parameters, like ``name`` parameter in the ``dataset_reader.tasks.cola``, and could overwrite default parameter values, like ``path`` parameter from ``dataset_reader.tasks.copa``. In the ``dataset_reader.tasks.conll`` ``use_task_defaults`` is ``False``. 
This is a special parameter that forces ``multitask_reader`` to ignore ``task_defaults`` while creating the nested reader, which means that the dataset reader for the ``conll`` task will use only parameters from ``dataset_reader.tasks.conll``. The same principle with default values applies to ``multitask_iterator``. Batches generated by ``multitask_iterator`` are tuples of two elements: inputs of the model and labels. Both inputs and labels are lists of tuples. The inputs have the following format: ``[(first_task_inputs[0], second_task_inputs[0],...), (first_task_inputs[1], second_task_inputs[1], ...), ...]`` where ``first_task_inputs``, ``second_task_inputs``, and so on are x values of batches from task dataset iterators. The labels in the second element have a similar format. If task datasets have different sizes, then for smaller datasets the lists are padded with ``None`` values. For example, if the first task dataset inputs are ``[0, 1, 2, 3, 4, 5, 6]``, the second task dataset inputs are ``[7, 8, 9]``, and the batch size is ``2``, then multi-task input mini-batches will be ``[(0, 7), (1, 8)]``, ``[(2, 9), (3, None)]``, ``[(4, None), (5, None)]``, ``[(6, None)]``. In this tutorial, there are 5 datasets. Considering the batch structure, ``chainer`` inputs in :config:`multitask_example ` are: .. code:: json "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"], "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_conll"] Sometimes a task dataset iterator returns inputs or labels consisting of more than one element. For example, a model input element could consist of two strings. If it is necessary to split such a variable, the ``InputSplitter`` component can be used. Data preparation in the multitask setting can be similar to the preparation in singletask setting except for the names of the variables. For streamlining the code, however, ``input_splitter`` and ``tokenizer`` can be unified into the ``multitask_pipeline_preprocessor``.
This preprocessor gets as a parameter ``preprocessor`` the one preprocessor class name for all tasks, or gets the preprocessor name list as a parameter ``preprocessors``. After splitting input by ``possible_keys_to_extract``, every preprocessor (being initialized by the input beforehand) processes the input. Note that if the ``strict`` parameter (default: ``False``) is set to ``True``, we always try to split data. Here is the definition of ``multitask_pipeline_preprocessor`` from the :config:`multitask_example `: .. code:: json "class_name": "multitask_pipeline_preprocessor", "possible_keys_to_extract": [0, 1], "preprocessors": [ "TorchTransformersPreprocessor", "TorchTransformersPreprocessor", "TorchTransformersPreprocessor", "TorchTransformersMultiplechoicePreprocessor", "TorchTransformersNerPreprocessor" ], "do_lower_case": true, "n_task": 5, "vocab_file": "{BACKBONE}", "max_seq_length": 200, "max_subword_length": 15, "token_masking_prob": 0.0, "return_features": true, "in": ["x_cola", "x_rte", "x_stsb", "x_copa", "x_conll"], "out": [ "bert_features_cola", "bert_features_rte", "bert_features_stsb", "bert_features_copa", "bert_features_conll" ] The ``multitask_transformer`` component has common and task-specific parameters. Task-specific parameters are provided inside the ``tasks`` parameter. ``tasks`` is a dictionary whose keys are task names and whose values are task-specific parameters (type, options). Common parameters are ``backbone_model`` (the same parameter as in the tokenizer) and all parameters from ``torch_bert``. **The order of tasks MATTERS.** Here is the definition of ``multitask_transformer`` from the :config:`multitask_example `: ..
code:: json "id": "multitask_transformer", "class_name": "multitask_transformer", "optimizer_parameters": {"lr": 2e-5}, "gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}", "learning_rate_drop_patience": 2, "learning_rate_drop_div": 2.0, "return_probas": true, "backbone_model": "{BACKBONE}", "save_path": "{MODEL_PATH}", "load_path": "{MODEL_PATH}", "tasks": { "cola": { "type": "classification", "options": 2 }, "rte": { "type": "classification", "options": 2 }, "stsb": { "type": "regression", "options": 1 }, "copa": { "type": "multiple_choice", "options": 2 }, "conll": { "type": "sequence_labeling", "options": "#vocab_conll.len" } }, "in": [ "bert_features_cola", "bert_features_rte", "bert_features_stsb", "bert_features_copa", "bert_features_conll" ], "in_y": ["y_cola", "y_rte", "y_stsb", "y_copa", "y_ids_conll"], "out": [ "y_cola_pred_probas", "y_rte_pred_probas", "y_stsb_pred", "y_copa_pred_probas", "y_conll_pred_ids" ] Note that ``proba2labels`` can now take several arguments. .. code:: json { "in":["y_cola_pred_probas", "y_rte_pred_probas", "y_copa_pred_probas"], "out":["y_cola_pred_ids", "y_rte_pred_ids", "y_copa_pred_ids"], "class_name":"proba2labels", "max_proba":true } You may need to create your own metric for early stopping. In this example, the target metric is an average of AUC ROC for insults and sentiment tasks and F1 for NER task: .. code:: python from deeppavlov.metrics.roc_auc_score import roc_auc_score def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_probas2, ner_true3, ner_pred3): roc_auc1 = roc_auc_score(true_onehot1, pred_probas1) roc_auc2 = roc_auc_score(true_onehot2, pred_probas2) ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100 return (roc_auc1 + roc_auc2 + ner_f1_3) / 3 If the code above is saved as ``custom_metric.py``, the metric can be used in the config as ``custom_metric:roc_auc__roc_auc__ner_f1`` (``module.submodules:function_name`` reference format). You can make an inference-only config.
In this config, there is no need in dataset reader and dataset iterator. A ``train`` field and components preparing ``in_y`` are removed. In ``multitask_transformer`` component configuration all training parameters (learning rate, optimizer, etc.) are omitted. Here are the results of ``deeppavlov/configs/multitask/mt_glue.json`` compared to the analogous single-task configs, according to the test server. +-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+ | Task | Score | CoLA | SST-2 | MRPC | STS-B | QQP | MNLI(m/mm) | QNLI | RTE | AX | +-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+ | Metric | from server | Matthew's Corr | Accuracy | F1 / Accuracy | Pearson/Spearman Corr | F1 / Accuracy | Accuracy | Accuracy | Accuracy | Matthew's Corr | +===================+=============+================+==========+===============+=======================+===============+============+==========+==========+================+ | Multitask config | 77.8 | 43.6 | 93.2 | 88.6/84.2 | 84.3/84.0 | 70.1/87.9 | 83.0/82.6 | 90.6 | 75.4 | 35.4 | +-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+ | Singletask config | 77.6 | 53.6 | 92.7 | 87.7/83.6 | 84.4/83.1 | 70.5/88.9 | 84.4/83.2 | 90.3 | 63.4 | 36.3 | +-------------------+-------------+----------------+----------+---------------+-----------------------+---------------+------------+----------+----------+----------------+ ================================================ FILE: docs/features/models/neural_ranking.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Neural Ranking\n", "\n", "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/neural_ranking.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", "\n", "# 1. Introduction to the task\n", "\n", "This model solves the tasks of ranking and paraphrase identification based on semantic similarity which is trained with siamese neural networks. The trained network can retrieve the response closest semantically to a given context from some database or answer whether two sentences are paraphrases or not. It is possible to build automatic semantic FAQ systems with such neural architectures.\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install ranking_ubuntu_v2_torch_bert_uncased" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`ranking_ubuntu_v2_torch_bert_uncased` is the name of the model's *config_file*. 
[What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n", "The full list of models for neural ranking with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. Models list\n", "\n", "| Config | Language | Dataset | Transformer model |\n", "| :--- | :---: | :--- | :--- |\n", "| ranking/ranking_ubuntu_v2_torch_bert_uncased.json | En | [Ubuntu v2](https://github.com/rkadlec/ubuntu-ranking-dataset-creator) | bert-base-uncased |\n", "| classifiers/paraphraser_rubert.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/rubert-base-cased |\n", "| classifiers/paraphraser_convers_distilrubert_2L.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/distilrubert-tiny-cased-conversational |\n", "| classifiers/paraphraser_convers_distilrubert_6L.json | Ru | [paraphraser.ru](https://paraphraser.ru) | DeepPavlov/distilrubert-base-cased-conversational |\n", "\n", "# 4. 
Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "### English" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import configs, build_model\n", "\n", "\n", "ranking = build_model(\"ranking_ubuntu_v2_torch_bert_uncased\", download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ranking([[\"Forrest Gump is a 1994 American epic comedy-drama film directed by Robert Zemeckis.\",\n", " \"Robert Zemeckis directed Forrest Gump.\",\n", " \"Robert Lee Zemeckis was born on May 14, 1952, in Chicago.\"]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Input:** List[List[sentence1, sentence2, ...]], where the sentences from the second to the last will be ranked by similarity with the first sentence.\n", "\n", "**Output:** List[List[scores]] - similarity scores to the first sentence of the sentences from the second to the last.\n", "\n", "### Russian" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import configs, build_model\n", "\n", "\n", "ranking = build_model(\"paraphraser_rubert\", download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ranking([\"Форрест Гамп - комедийная драма, девятый полнометражный фильм режиссёра Роберта Земекиса.\"],\n", " [\"Роберт Земекис был режиссером фильма «Форрест Гамп».\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Input:** Tuple[List[sentences1], List[sentence2]], where each element of the list of sentences1 will be compared with the corresponding element of the sentence2 list.\n", "\n", "**Output:** List[labels] - each label is 1 or 0, 1 - if the sentence from the first list is a paraphrase to the corresponding sentence from the second list, 0 - otherwise.\n", "\n", "## 4.2 Predict using CLI\n", "\n", "### 
English\n", "\n", "It is not intended to use the class ``deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel`` in the interact mode, so it is better to launch the config ranking/ranking_ubuntu_v2_torch_bert_uncased.json [using Python](#4.1-Predict-using-Python).\n", "\n", "### Russian\n", "\n", "You can also get predictions in an interactive mode through CLI (Сommand Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact paraphraser_rubert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Customize the model\n", "\n", "## English\n", "\n", "To train the ranking model on your own data, you should make a dataset in the following format:\n", "\n", "- the dataset should have **train.csv**, **valid.csv** and **test.csv** files.\n", "\n", "- **train.csv** file should contain the following columns: Context, Utterance, Label. Context and utterance are two texts and label (0 or 1) shows the relevance of the utterance to the context.\n", "\n", "- **valid.csv** and **test.csv** files should contain the following columns: Context, Ground Truth Utterance, Distractor_0, Distractor_1, ..., Distractor_N. 
Distractor utterances are negative samples (utterances, irrelevant to the context).\n", "\n", "Then you should put train.csv, valid.csv and test.csv files into the directory ``\"data_path\"`` in the dataset reader from the config and launch training of the model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "python -m deeppavlov train ranking_ubuntu_v2_torch_bert_uncased" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Russian\n", "\n", "To train the ranking model on your own data, you should make a dataset with two files: **paraphrases.xml** (for training) and **paraphrases_gold.xml** (for testing).\n", "\n", "The xml files should have the following format:\n", "\n", " \n", " \n", " \n", " Russian Paraphrase Corpus\n", " This file contains a collection of sentence pairs with crowdsourced annotation. Paraphrase classes: -1: non-paraphrases, 0: loose paraphrases, 1: strict paraphrases.\n", " http://paraphraser.ru\n", " 1.0 beta\n", " 2015-11-28\n", " \n", " \n", " \n", " 1\n", " 201\n", " 8159\n", " text 1\n", " text 2\n", " 0.65\n", " 0\n", " \n", " \n", " ...\n", " \n", " \n", " \n", "\n", "Place **paraphrases.xml** and **paraphrases_gold.xml** files into the directory ``\"data_path\"`` in the dataset reader from the config and launch training of the model:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "python -m deeppavlov train paraphraser_rubert" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/popularity_ranking.rst ================================================ ================= Popularity Ranker ================= Popularity Ranker re-ranks results obtained via :doc:`TF-IDF Ranker ` using information about the number of article views. The number of Wikipedia articles views is an open piece of information which can be obtained via `Wikimedia REST API `_. 
We assigned a mean number of views for the period from 2017/11/05 to 2018/11/05 to each article in our English Wikipedia database `enwiki20180211 `_. The inner algorithm of Popularity Ranker is a Logistic Regression classifier based on 3 features: - tfidf score of the article - popularity of the article - product of the two features above The classifier is trained on `SQuAD-v1.1`_ train set. Quick Start =========== Before using the model make sure that all required packages are installed by running the command: .. code:: bash python -m deeppavlov install en_ranker_pop_wiki Building the model .. code:: python from deeppavlov import build_model ranker = build_model('en_ranker_pop_wiki', download=True) Inference .. code:: python result = ranker(['Who is Ivan Pavlov?']) print(result[:5]) Output :: >> ['Ivan Pavlov', 'Vladimir Bekhterev', 'Classical conditioning', 'Valentin Pavlov', 'Psychology'] Text for the output titles can be further extracted with :class:`~deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab` class. Configuration ============= Default ranker config is :config:`doc_retrieval/en_ranker_pop_wiki.json ` Running the Ranker ================== .. note:: About **17 GB of RAM** required. Interacting ----------- When interacting, the ranker returns document titles of the relevant documents. Run the following to interact with the ranker: .. code:: bash python -m deeppavlov interact en_ranker_pop_wiki -d Available Data and Pretrained Models ==================================== Available information about Wikipedia articles popularity is downloaded to ``~/.deeppavlov/downloads/odqa/popularities.json`` and pre-trained logistic regression classifier is downloaded to ``~/.deeppavlov/models/odqa/logreg_3features.joblib`` by default. References ========== .. target-notes:: ..
_`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250 ================================================ FILE: docs/features/models/relation_extraction.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Relation Extraction\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/relation_extraction.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1 [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2 [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", " \n", " 5.1 [Description of config parameters](#5.1-Description-of-config-parameters)\n", " \n", " 5.2 [Train Relation Extraction on custom data](#5.2-Train-Relation-Extraction-on-custom-data)\n", "\n", "6. [Relations list](#6.-Relations-list)\n", "\n", " 6.1 [Relations used in English model](#6.1-Relations-used-in-English-model)\n", " \n", " 6.2 [Relations used in Russian model](#6.2-Relations-used-in-Russian-model)\n", "\n", "# 1. Introduction to the task\n", "\n", "Relation extraction is the task of detecting and classifying the relationship between two entities in text.\n", "DeepPavlov provides the document-level relation extraction meaning that the relation can be detected between the entities that are not in one sentence.\n", "\n", "**RE Model Architecture**\n", "\n", "We based our model on the [Adaptive Thresholding and Localized Context Pooling](https://arxiv.org/pdf/2010.11304.pdf) model and used NER entity tags as additional input. 
Two core ideas of this model are:\n", "\n", "- Adaptive Threshold\n", "\n", "The usual global threshold for converting the RE classifier output probability to relation label is replaced with a learnable one. A new threshold class that learns an entities-dependent threshold value is introduced and learnt as all other classes. During prediction the positive classes (= relations that actually hold in the sample) are claimed to be the classes with higher logits than the TH class, while all others are negative ones.\n", "\n", "- Localised Context Pooling\n", "\n", "The embedding of each entity pair is enhanced with an additional local context embedding related to both entities. Such representation, which is attended to the relevant context in the document, is useful to decide the relation for exactly this entity pair. For incorporating the context information the attention heads are directly used.\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before using the model make sure that all required packages are installed running the command:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install re_docred" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.
Models list\n", "\n", "The table presents a list of all of the relation extraction models available in the DeepPavlov Library.\n", "\n", "| Config | Language | Dataset |\n", "| :--- | :---: | :--- |\n", "| relation_extraction/re_docred.json | En | [DocRED](https://www.aclweb.org/anthology/P19-1074/) |\n", "| relation_extraction/re_rured.json | Ru | [RuRED](http://www.dialog-21.ru/media/5093/gordeevdiplusetal-031.pdf) |\n", "\n", "## Some details on DocRED corpus English RE model was trained on\n", "\n", "The English RE model was trained on DocRED English corpus. It was constructed from Wikipedia and Wikidata and is now the largest human-annotated dataset for document-level RE from plain text.\n", "\n", "As the original DocRED test dataset contains only unlabeled data, while we want labeled data in order to perform evaluation, we decided to:\n", "1. merge train and dev data (= labeled data)\n", "2. split them into new train, dev and test datasets\n", "\n", "Currently, there are two types of possible splittings provided:\n", "\n", "- user can set the relative size of dev and test data (e.g. 1/7)\n", "- user can set the absolute size of dev and test data (e.g. 2000 samples)\n", "\n", "In our experiment, we set the absolute size of dev and test data == 150 initial documents.
It resulted in approximately 3500 samples.\n", "\n", "We additionally generate negative samples if it was necessary to have the following proportions:\n", "- for train set: negative samples are twice as many as positive ones\n", "- for dev & test set: negative samples are the same amount as positive ones\n", "\n", "| Train | Dev | Test |\n", "| :---: | :---: | :---: |\n", "| 130650 | 3406 | 3545 |\n", "\n", "| Train Positive | Train Negative | Dev Positive | Dev Negative | Test Positive | Test Negative |\n", "| :---: | :---: | :---: | :---: | :---: | :---: |\n", "| 44823 | 89214 | 1239 | 1229 | 1043 | 1036 |\n", "\n", "## Some details on RuRED corpus Russian RE model was trained on\n", "\n", "In case of RuRED we used the train, dev and test sets from the original RuRED setting. We additionally generate negative samples if it was necessary to have the following proportions:\n", "\n", "- for train set: negative samples are twice as many as positive ones\n", "- for dev & test set: negative samples are the same amount as positive ones\n", "\n", "| Train | Dev | Test |\n", "| :---: | :---: | :---: |\n", "| 12855 | 1076 |1072 |\n", "\n", "| Train Positive | Train Negative | Dev Positive | Dev Negative | Test Positive | Test Negative |\n", "| :---: | :---: | :---: | :---: | :---: | :---: |\n", "| 4285 | 8570 | 538 | 538 | 536 | 536 |\n", "\n", "# 4. 
Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "### English" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import configs, build_model\n", "\n", "re_model = build_model(configs.relation_extraction.re_docred, download=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['P26'], ['spouse']]\n" ] } ], "source": [ "sentence_tokens = [[\"Barack\", \"Obama\", \"is\", \"married\", \"to\", \"Michelle\", \"Obama\", \",\", \"born\", \"Michelle\", \"Robinson\", \".\"]]\n", "entity_pos = [[[(0, 2)], [(5, 7), (9, 11)]]]\n", "entity_tags = [[\"PER\", \"PER\"]]\n", "pred = re_model(sentence_tokens, entity_pos, entity_tags)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model Input**:\n", "\n", "- list of tokens of a text document\n", "- list of entities positions (i.e. all start and end positions of both entities' mentions)\n", "- list of NER tags of both entities.\n", "\n", "As NER tags, we adapted the used in the DocRED corpus, which are, in turn, inherited from [Tjong Kim Sang and De Meulder(2003)](https://aclanthology.org/W03-0419/)\n", "\n", "**The whole list of 6 English NER tags**\n", "\n", "| Tag | Description |\n", "| :--- | :--- |\n", "|PER | People, including fictional |\n", "|ORG | Companies, universities, institutions, political or religious groups, etc. |\n", "|LOC | Geographically defined locations, including mountains, waters, etc.
Politically defined locations, including countries, cities, states, streets, etc.
Facilities, including buildings, museums, stadiums, hospitals, factories, airports, etc. |\n", "|TIME | Absolute or relative dates or periods. |\n", "|NUM | Percents, money, quantities |\n", "|MISC | Products, including vehicles, weapons, etc.
Events, including elections, battles, sporting MISC events, etc. Laws, cases, languages, etc. |\n", "\n", "**Model Output**: one or several of the [97 relations](#6.1-Relations-used-in-English-model) found between the given entities; relation id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P26') and relation name ('spouse').\n", "\n", "### Russian" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import configs, build_model\n", "\n", "re_model = build_model(configs.relation_extraction.re_rured)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['P495'], ['страна происхождения']]\n" ] } ], "source": [ "sentence_tokens = [[\"Илон\", \"Маск\", \"живет\", \"в\", \"Сиэттле\", \".\"]]\n", "entity_pos = [[[(0, 2)], [(4, 5)]]]\n", "entity_tags = [[\"PERSON\", \"CITY\"]]\n", "pred = re_model(sentence_tokens, entity_pos, entity_tags)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model Input**:\n", "\n", "- list of tokens of a text document\n", "- list of entities positions (i.e. all start and end positions of both entities' mentions)\n", "- list of NER tags of both entities.\n", "\n", "**Model Output**: one or several of the [30 relations](#6.2-Relations-used-in-Russian-model) found between the given entities; a Russian relation name (e.g. \"участник\") or an English one, if Russian one is unavailable, and, if applicable, its id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P710').\n", "\n", "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact re_docred [-d]\n", "! 
python -m deeppavlov interact re_rured [-d]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Customize the model\n", "\n", "## 5.1 Description of config parameters\n", "\n", "Parameters of ``re_preprocessor`` component:\n", "\n", "- ``ner_tags: List[str]`` - ner tags of the entities, which are one-hot encoded and concatenated to entity embeddings in the output of the Transformer;\n", "- ``special_token: str`` - the token which is added before and after the entities (subject and object in the triplet) mentions;\n", "- ``default_tag: str`` - default ner tags, if no tags are provided;\n", "- ``do_lower_case: bool`` - set True if lowercasing is needed.\n", "\n", "Parameters of ``re_classifier`` component:\n", "\n", "- ``n_classes: int`` - number of relations which the model supports;\n", "- ``num_ner_tags: int`` - number of ner tags;\n", "- ``return_probas: bool`` - whether to return confidences of predicted relations.\n", "\n", "Parameters of ``re_postprocessor`` component:\n", " \n", "- ``rel2id_path: str`` - the file with mapping of relation IDs in the knowledge base to relation number (for example, \"P19\": 24);\n", "- ``rel2label_path: str`` - the file with mapping of relation IDs to relation labels.\n", "\n", "## 5.2 Train Relation Extraction on custom data\n", "\n", "There are two kinds of dataset readers for relation extraction in DeepPavlov library:\n", "\n", "- ``docred_reader``, which takes into account partition of the text into sentences and several mentions in the text for one entity;\n", "- ``rured_reader``, a simplified dataset reader.\n", "\n", "### Train with ``docred_reader``\n", "\n", "You should prepare **train_annotated.json**, **dev.json**, **test.json** in the following 
format:\n", "\n", " {\n", " \"vertexSet\": [\n", " [\n", " {\n", " \"name\": entity1_mention1,\n", " \"pos\": [mention1 start token index, mention1 end token index],\n", " \"sent_id\": ID of the sentence with the entity1 mention1,\n", " \"type\": ner tag\n", " },\n", " {\n", " \"name\": entity1_mention2,\n", " ...\n", " },\n", " ...\n", " ],\n", " [ ... ]\n", " ],\n", " \"labels\": [\n", " {\n", " \"r\": relation ID,\n", " \"h\": index of head entity of the triplet in the vertexSet list,\n", " \"t\": index of tail entity of the triplet in the vertexSet list,\n", " \"evidence\": [\n", " indices of the sentences with the triplet\n", " ]\n", " },\n", " ...\n", " ],\n", " \"title\": doc title,\n", " \"sentences\": [\n", " list of tokens of sentence 1,\n", " list of tokens of sentence 2,\n", " ...\n", " ],\n", " ...\n", " }\n", "\n", "For example,\n", "\n", " {\n", " \"vertexSet\": [\n", " [\n", " {\n", " \"name\": \"Elon Musk\",\n", " \"pos\": [0, 2],\n", " \"sent_id\": 0,\n", " \"type\": \"PER\"\n", " }\n", " ],\n", " [\n", " {\n", " \"name\": \"Seattle\",\n", " \"pos\": [4, 5],\n", " \"sent_id\": 0,\n", " \"type\": \"CITY\"\n", " }\n", " ]\n", " ],\n", " \"labels\": [\n", " {\n", " \"r\": \"P551\",\n", " \"h\": 0,\n", " \"t\": 1,\n", " \"evidence\": [0]\n", " }\n", " ],\n", " \"title\": \"title1\",\n", " \"sentences\": [\n", " [\"Elon\", \"Musk\", \"lives\", \"in\", \"Seattle\", \".\"]\n", " ]\n", " }\n", "\n", "### Train with ``rured_reader``\n", "\n", "You should prepare **train.json**, **dev.json**, **test.json** in the following format:\n", "\n", " {\n", " \"token\": list of text tokens,\n", " \"relation\": relation ID,\n", " \"subj_start\": index of the token of the subject start in the list,\n", " \"subj_end\": index of the token of the subject end in the list,\n", " \"obj_start\": index of the token of the object start in the list,\n", " \"obj_end\": index of the token of the object end in the list,\n", " \"subj_type\": ner tag of the subject entity,\n", " 
\"obj_type\": ner tag of the object entity,\n", " },\n", "\n", "for example:\n", "\n", " {\n", " \"token\": [\"Илон\", \"Маск\", \"живет\", \"в\", \"Сиэттле\", \".\"],\n", " \"relation\": \"P551\",\n", " \"subj_start\": 0,\n", " \"subj_end\": 2,\n", " \"obj_start\": 4,\n", " \"obj_end\": 5,\n", " \"subj_type\": \"PERSON\",\n", " \"obj_type\": \"CITY\"\n", " }\n", "\n", "#### Train the model using Python:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import train_model\n", "\n", "train_model(\"re_docred\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**or using CLI:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov train re_docred" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Relations list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.1 Relations used in English model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "|Relation id | Relation |\n", "| :--- | :--- |\n", "|P6 | head of government |\n", "|P17 | country |\n", "|P19 | place of birth |\n", "|P20 | place of death |\n", "|P22 | father |\n", "|P25 | mother |\n", "|P26 | spouse |\n", "|P27 | country of citizenship |\n", "|P30 | continent |\n", "|P31 | instance of |\n", "|P35 | head of state |\n", "|P36 | capital |\n", "|P37 | official language |\n", "|P39 | position held |\n", "|P40 | child |\n", "|P50 | author |\n", "|P54 | member of sports team |\n", "|P57 | director |\n", "|P58 | screenwriter |\n", "|P69 | educated at |\n", "|P86 | composer |\n", "|P102 | member of political party |\n", "|P108 | employer |\n", "|P112 | founded by |\n", "|P118 | league |\n", "|P123 | publisher |\n", "|P127 | owned by |\n", "|P131 | located in the administrative territorial entity |\n", "|P136 | genre |\n", "|P137 | operator |\n", "|P140 | religion |\n", "|P150 | contains administrative territorial entity |\n", "|P155 
| follows |\n", "|P156 | followed by |\n", "|P159 | headquarters location |\n", "|P161 | cast member |\n", "|P162 | producer |\n", "|P166 | award received |\n", "|P170 | creator |\n", "|P171 | parent taxon |\n", "|P172 | ethnic group |\n", "|P175 | performer |\n", "|P176 | manufacturer |\n", "|P178 | developer |\n", "|P179 | series |\n", "|P190 | sister city |\n", "|P194 | legislative body |\n", "|P205 | basin country |\n", "|P206 | located in or next to body of water |\n", "|P241 | military branch |\n", "|P264 | record label |\n", "|P272 | production company |\n", "|P276 | location |\n", "|P279 | subclass of |\n", "|P355 | subsidiary |\n", "|P361 | part of |\n", "|P364 | original language of work |\n", "|P400 | platform |\n", "|P403 | mouth of the watercourse |\n", "|P449 | original network |\n", "|P463 | member of |\n", "|P488 | chairperson |\n", "|P495 | country of origin |\n", "|P527 | has part |\n", "|P551 | residence |\n", "|P569 | date of birth |\n", "|P570 | date of death |\n", "|P571 | inception |\n", "|P576 | dissolved, abolished or demolished |\n", "|P577 | publication date |\n", "|P580 | start time |\n", "|P582 | end time |\n", "|P585 | point in time |\n", "|P607 | conflict |\n", "|P674 | characters |\n", "|P676 | lyrics by |\n", "|P706 | located on terrain feature |\n", "|P710 | participant |\n", "|P737 | influenced by |\n", "|P740 | location of formation |\n", "|P749 | parent organization |\n", "|P800 | notable work |\n", "|P807 | separated from |\n", "|P840 | narrative location |\n", "|P937 | work location |\n", "|P1001 | applies to jurisdiction |\n", "|P1056 | product or material produced |\n", "|P1198 | unemployment rate |\n", "|P1336 | territory claimed by |\n", "|P1344 | participant of |\n", "|P1365 | replaces |\n", "|P1366 | replaced by |\n", "|P1376 | capital of |\n", "|P1412 | languages spoken, written or signed |\n", "|P1441 | present in work |\n", "|P3373 | sibling |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.2 
Relations used in Russian model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "| Relation | Relation id | Russian relation |\n", "| :--- | :--- | :--- |\n", "| MEMBER | P710 | участник |\n", "| WORKS_AS | P106 | род занятий |\n", "| WORKPLACE | -- | -- |\n", "| OWNERSHIP | P1830 | владеет |\n", "| SUBORDINATE_OF | -- | -- |\n", "| TAKES_PLACE_IN | P276 | местонахождение |\n", "| EVENT_TAKES_PART_IN | P1344 | участвовал в |\n", "| SELLS_TO | -- | -- |\n", "| ALTERNATIVE_NAME | -- | -- |\n", "| HEADQUARTERED_IN | P159 | расположение штаб-квартиры |\n", "| PRODUCES | P1056 | продукция |\n", "| ABBREVIATION | -- | -- |\n", "| DATE_DEFUNCT_IN | P576 | дата прекращения существования |\n", "| SUBEVENT_OF | P361 | часть от |\n", "| DATE_FOUNDED_IN | P571 | дата основания/создания/возн-я |\n", "| DATE_TAKES_PLACE_ON | P585 | момент времени |\n", "| NUMBER_OF_EMPLOYEES_FIRED | -- | -- |\n", "| ORIGINS_FROM | P495 | страна происхождения |\n", "| ACQUINTANCE_OF | -- | -- |\n", "| PARENT_OF | P40 | дети |\n", "| ORGANIZES | P664 | организатор |\n", "| FOUNDED_BY | P112 | основатель |\n", "| PLACE_RESIDES_IN | P551 | место жительства |\n", "| BORN_IN | P19 | место рождения |\n", "| AGE_IS | -- | -- |\n", "| RELATIVE | -- | -- |\n", "| NUMBER_OF_EMPLOYEES | P1128 | число сотрудников |\n", "| SIBLING | P3373 | брат/сестра |\n", "| DATE_OF_BIRTH | P569 | дата рождения |" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/spelling_correction.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Spelling correction\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/spelling_correction.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. 
[Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", "\n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", "\n", " 5.1. [Training configuration](#5.1-Training-configuration)\n", "\n", " 5.2. [Language model](#5.2-Language-model)\n", "\n", "6. [Comparison](#6.-Comparison)\n", "\n", "# 1. Introduction to the task\n", "\n", "Spelling correction is the detection of misspelled words in a text and their replacement with correct ones.\n", "\n", "For example, the sentence\n", "\n", "```\n", "The platypus lives in eastern Astralia, inkluding Tasmania.\n", "```\n", "\n", "with spelling mistakes ('Astralia', 'inkluding') will be corrected as\n", "\n", "```\n", "The platypus lives in eastern Australia, including Tasmania.\n", "```\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install brillmoore_wikitypos_en" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`brillmoore_wikitypos_en` is the name of the model's *config_file*. 
[What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n", "The full list of models for spelling correction with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. Models list\n", "\n", "The table presents a list of all of the models for spelling correction available in the DeepPavlov Library.\n", "\n", "| Config name | Language | RAM |\n", "| :--- | --- | --- |\n", "| brillmoore_wikitypos_en | En | 6.7 Gb |\n", "| levenshtein_corrector_ru | Ru | 8.7 Gb |\n", "\n", "We provide two types of pipelines for spelling correction:\n", "\n", "* [levenshtein_corrector](#4.1.1-Levenshtein-corrector) uses a simple Damerau-Levenshtein distance to find correction candidates\n", "\n", "* [brillmoore](#4.1.2-Brillmoore) uses a statistics-based error model for it.\n", "\n", "In both cases correction candidates are chosen based on context with the help of a [kenlm language model](https://docs.deeppavlov.ai/en/master/features/models/spelling_correction.html#language-model).\n", "\n", "You can find [the comparison](#6.-Comparison) of these and other approaches near the end of this page.\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "### 4.1.1 Levenshtein corrector\n", "\n", "[This component](https://docs.deeppavlov.ai/en/master/apiref/models/spelling_correction.html#deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent) finds all the candidates in a static dictionary within a set Damerau-Levenshtein distance. 
It can separate one token into two but it will not work the other way around.\n", "\n", "**Component config parameters**:\n", "\n", "- ``in`` — list with one element: name of this component's input in\n", " chainer's shared memory\n", "- ``out`` — list with one element: name for this component's output in\n", " chainer's shared memory\n", "- ``class_name`` always equals to ``\"spelling_levenshtein\"`` or ``deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent``.\n", "- ``words`` — list of all correct words (should be a reference)\n", "- ``max_distance`` — maximum allowed Damerau-Levenshtein distance\n", " between source words and candidates\n", "- ``error_probability`` — assigned probability for every edit" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model, configs\n", "\n", "model = build_model('levenshtein_corrector_ru', download=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['утконос живет в восточной австралии на обширном ареале от холодных плато тасмании и австралийских альп до дождевых лесов прибрежного квинсленда.']\n" ] } ], "source": [ "model(['Утканос живет в Васточной Австралии на обширном ареале от холодных плато Тасмании и Австралийских Альп до дождевых лесов прибрежного Квинсленда.'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.1.2 Brillmoore\n", "\n", "[This component](https://docs.deeppavlov.ai/en/master/apiref/models/spelling_correction.html#deeppavlov.models.spelling_correction.brillmoore.ErrorModel) is based on [An Improved Error Model for Noisy Channel Spelling Correction](http://www.aclweb.org/anthology/P00-1037) by Eric Brill and Robert C. 
Moore and uses statistics based error model to find best candidates in a static dictionary.\n", "\n", "**Component config parameters:**\n", "\n", "- ``in`` — list with one element: name of this component's input in\n", " chainer's shared memory\n", "- ``out`` — list with one element: name for this component's output in\n", " chainer's shared memory\n", "- ``class_name`` always equals to ``\"spelling_error_model\"`` or ``deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel``.\n", "- ``save_path`` — path where the model will be saved at after a\n", " training session\n", "- ``load_path`` — path to the pretrained model\n", "- ``window`` — window size for the error model from ``0`` to ``4``,\n", " defaults to ``1``\n", "- ``candidates_count`` — maximum allowed count of candidates for every\n", " source token\n", "- ``dictionary`` — description of a static dictionary model, instance\n", " of (or inherited from)\n", " ``deeppavlov.vocabs.static_dictionary.StaticDictionary``\n", "\n", " - ``class_name`` — ``\"static_dictionary\"`` for a custom dictionary or one\n", " of two provided:\n", "\n", " - ``\"russian_words_vocab\"`` to automatically download and use a\n", " list of russian words from\n", " `https://github.com/danakt/russian-words/ `__\n", " - ``\"wikitionary_100K_vocab\"`` to automatically download a list\n", " of most common words from Project Gutenberg from\n", " `Wiktionary `__\n", "\n", " - ``dictionary_name`` — name of a directory where a dictionary will\n", " be built to and loaded from, defaults to ``\"dictionary\"`` for\n", " static\\_dictionary\n", " - ``raw_dictionary_path`` — path to a file with a line-separated\n", " list of dictionary words, required for static\\_dictionary" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model, configs\n", "\n", "model = build_model('brillmoore_wikitypos_en', download=True)" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['the platypus lives in australia.']\n" ] } ], "source": [ "model(['The platypus lives in Astralia.'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact brillmoore_wikitypos_en -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Customize the model\n", "\n", "## 5.1 Training configuration\n", "\n", "For the training phase, the config file also needs to include these\n", "parameters:\n", "\n", "- ``dataset_iterator`` — it should always be set like\n", " ``\"dataset_iterator\": {\"class_name\": \"typos_iterator\"}``\n", "\n", " - ``class_name`` always equals to ``typos_iterator``\n", " - ``test_ratio`` — ratio of test data to train, from ``0.`` to\n", " ``1.``, defaults to ``0.``\n", "\n", "- ``dataset_reader``\n", "\n", " - ``class_name`` — ``typos_custom_reader`` for a custom dataset or one of\n", " two provided:\n", "\n", " - ``typos_kartaslov_reader`` to automatically download and\n", " process misspellings dataset for russian language from\n", " https://github.com/dkulagin/kartaslov/tree/master/dataset/orfo_and_typos\n", " - ``typos_wikipedia_reader`` to automatically download and\n", " process a list of common misspellings from english\n", " Wikipedia - https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines\n", "\n", " - ``data_path`` — required for typos\\_custom\\_reader as a path to\n", " a dataset file,\n", " where each line contains a misspelling and a correct spelling\n", " of a word separated by a tab symbol\n", "\n", "Component's configuration for ``spelling_error_model`` also has to\n", "have a ``fit_on`` parameter — list of two elements:\n", "names of 
component's input and true output in chainer's shared\n", "memory.\n", "\n", "## 5.2 Language model\n", "\n", "Provided pipelines use [KenLM](http://kheafield.com/code/kenlm/) to process language models, so if you want to build your own, we suggest you consult its website. We do also provide our own language models for\n", "[english](http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz) (5.5GB) and\n", "[russian](http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz) (3.1GB) languages.\n", "\n", "# 6. Comparison\n", "\n", "We compared our pipelines with\n", "[Yandex.Speller](http://api.yandex.ru/speller/),\n", "[JamSpell](https://github.com/bakwc/JamSpell) and\n", "[PyHunSpell](https://github.com/blatinier/pyhunspell)\n", "on the [test set](http://www.dialog-21.ru/media/3838/test_sample_testset.txt) for the [SpellRuEval\n", "competition](http://www.dialog-21.ru/en/evaluation/2016/spelling_correction/)\n", "on Automatic Spelling Correction for Russian:\n", "\n", "| Correction method | Precision | Recall | F-measure | Speed (sentences/s) |\n", "| :---------------- | --------- | ------ | --------- | ------------------- |\n", "| Yandex.Speller | 83.09 | 59.86 | 69.59 | 5. |\n", "| DeepPavlov levenshtein_corrector_ru | 59.38 | 53.44 | 56.25 | 39.3 |\n", "| Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 |\n", "| JamSpell | 44.57 | 35.69 | 39.64 | 136.2 |\n", "| Hunspell | 30.30 | 34.02 | 32.06 | 20.3 |" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/superglue.rst ================================================ Russian SuperGLUE Submission ========================================== The DeepPavlov library provides a way to train your Russian SuperGLUE models and submit the results to the leaderboard in a couple of easy steps. 
Task definition --------------- `Russian SuperGLUE `__ is a benchmark that contains a set of tasks in Russian developed for evaluating general language understanding. There are 9 tasks in the Russian SuperGLUE set: **DaNetQA (Yes/no Question Answering Dataset for Russian)** is a binary classification task of question answering, in which the model is asked to answer a yes/no question based on a given context fragment. **PARus (Choice of Plausible Alternatives for Russian language)** is a causal reasoning task. The model is asked to choose the most plausible alternative that has causal relation with the given premise. **RCB (Russian Commitment Bank)** is a classification task in which the model is asked to define the type of textual entailment (Entailment, Contradiction, Neutral) between two sentences. In the **MuSeRC (Russian Multi-Sentence Reading Comprehension)** task the model needs to process information from multiple sentences at once and identify the correct answers for the question from the given list. In the **RuCoS (Russian reading comprehension with Commonsense reasoning)** task the model has to choose the answer to each query from a list of text spans from a fragment. **RUSSE (Russian Word-in-Context)** is a reading comprehension task in which the model has to identify whether a given word is used in the same meaning in two different sentences. In **RWSD (The Russian Winograd Schema Challenge)** the data is a set of sentences that differ by one or two words in which syntactic ambiguity is resolved differently. The model is trained to predict whether it is resolved correctly. **LiDiRus** is a diagnostic task in which the model has to identify whether there is entailment between two sentences. **TERRa (Textual Entailment Recognition for Russian)** is a binary classification task of identifying whether there is entailment between two sentences. For more detailed description of each task see `this `__. 
Train your model ---------------- Modify the configuration file you need and train your own model for the task (see :doc:`here ` for more detailed instructions). The full list of models designed for each task can be found in the table below. Create your submission files ---------------------------- To do that, use the ``submit`` command with the name of the configuration file that defines the path to your model. Note that the name of the Russian SuperGLUE task should be defined in the ``["metadata"]["variables"]["TASK"]`` variable in the config file. .. code:: bash python -m deeppavlov.utils.benchmarks.superglue <config_name> [-d] [-o <output_file_name>] * ``-d``: downloads model specific data before starting submission generation. * ``-o <output_file_name>``: set output file name. By default, output filenames for Russian SuperGLUE models comply with benchmark requirements. For example, ``russian_superglue_danetqa_rubert`` solves **Yes/no Question Answering Dataset for the Russian** task. The following command will generate ``DaNetQA.jsonl`` ready for submission: .. code:: bash python -m deeppavlov.utils.benchmarks.superglue russian_superglue_danetqa_rubert -d The prediction results will be saved in the correct format and the file will be automatically named with the name required by the system and saved to the current directory. All you have to do next is to zip the files you want into one archive and `submit them to leaderboard `__. Scores ------ The scores for DeepPavlov's pretrained models on the tasks are presented in the table. 
+--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | Model | Metric | Score | +========================================================================================================+================+=================+ | :config:`russian_superglue_danetqa_rubert ` | Accuracy | 0.647 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_parus_rubert ` | Accuracy | 0.588 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_russe_rubert ` | Accuracy | 0.641 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_lidirus_rubert ` | Matthew's Corr | 0.251 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_rcb_rubert ` | F1/Acc | 0.336 / 0.486 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_rwsd_rubert ` | Accuracy | 0.669 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_muserc_rubert ` | F1a/Em | 0.689 / 0.298 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | :config:`russian_superglue_rucos_rubert ` | F1/EM | 0.77 / 0.768 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ | 
:config:`russian_superglue_terra_rubert ` | Accuracy | 0.65 | +--------------------------------------------------------------------------------------------------------+----------------+-----------------+ ================================================ FILE: docs/features/models/syntax_parser.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Syntax Parser\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/syntax_parser.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", "\n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", "\n", "# 1. Introduction to the task\n", "\n", "Syntactic parsing is the task of prediction of the syntactic tree given the tokenized (or raw) sentence.\n", "\n", "To define a tree, for each word one should know its syntactic head and the dependency label for the edge between them.\n", "For example, the tree above can be restored from the data\n", "\n", "```\n", " 1\tJohn 2\tnsubj\t\n", " 2\tbought 0\troot\t\n", " 3\ta 6\tdet\t\n", " 4\tvery 5\tadvmod\t\n", " 5\ttasty 6\tamod\t\n", " 6\tcake 2\tobj\n", " 7\t. 
2\tpunct\n", "```\n", "Here the third column contains the positions of syntactic heads and the last one -- the dependency labels.\n", "The words are enumerated from 1 since 0 is the index of the artificial root of the tree, whose only\n", "dependent is the actual syntactic head of the sentence (usually a verb).\n", "\n", "Syntactic trees can be used in many information extraction tasks. For example, to detect who is the winner\n", "and who is the loser in the sentence *Manchester defeated Liverpool* one relies on the word order. However,\n", "many languages, such as Russian, Spanish and German, have relatively free word order, which means we need\n", "other cues. Note also that syntactic relations (`nsubj`, `obj` and so on) have clear semantic counterparts,\n", "which makes syntactic parsing an appealing preprocessing step for the semantic-oriented tasks.\n", "\n", "We use BERT as the lowest layer of our model (the embedder). To extract syntactic information we apply\n", "the biaffine network of [Dozat, Manning, 2017](https://arxiv.org/pdf/1611.01734.pdf).\n", "For each sentence of length `K` this network produces two outputs: the first is an array of shape ``K*(K+1)``,\n", "where the `i`-th row is the probability distribution of the head of the `i`-th word over the sentence elements.\n", "The 0-th element of this distribution is the probability of the word to be a root of the sentence.\n", "The second output of the network is of shape `K*D`, where `D` is the number of possible dependency labels.\n", "\n", "The easiest way to obtain a tree is simply to return the head with the highest probability\n", "for each word in the sentence. 
However, the graph obtained in such a way may fail to be a valid tree:\n", "it may either contain a cycle or have multiple nodes with head at position 0.\n", "Therefore we apply the well-known Chu-Liu-Edmonds algorithm for minimal spanning tree\n", "to return the optimal tree, using the open-source modification from [dependency_decoding package](https://pypi.org/project/ufal.chu-liu-edmonds/).\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before using the model make sure that all required packages are installed running the command:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install syntax_ru_syntagrus_bert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Models list\n", "\n", "The table presents a list of all of the syntax parsing models available in the DeepPavlov Library.\n", "\n", "| Config | Description |\n", "| :--- | :--- |\n", "| morpho_syntax_parser/syntax_ru_syntagrus_bert.json | Config with the model which defines for each token in the sentence
its head and dependency type in the syntactic tree. |\n", "| morpho_syntax_parser/ru_syntagrus_joint_parsing | Config which unifies syntax parsing and morphological tagging. |\n", "\n", "The table presents comparison of syntax_ru_syntagrus_bert config with other models on UD2.3 dataset.\n", "\n", "| Model | UAS | LAS |\n", "| :--- | :---: | :---: |\n", "| [UD Pipe 2.3](http://ufal.mff.cuni.cz/udpipe) (Straka et al., 2017) | 90.3 | 89.0 |\n", "| [UD Pipe Future](https://github.com/CoNLL-UD-2018/UDPipe-Future) (Straka, 2018) | 93.0 | 91.5 |\n", "| [UDify (multilingual BERT)](https://github.com/hyperparticle/udify) (Kondratyuk, 2018) | 94.8 | 93.1 |\n", "| Our BERT model (morpho_syntax_parser/syntax_ru_syntagrus_bert.json) | 94.9 | 93.4 |\n", "\n", "So our model is the state-of-the-art system for Russian syntactic parsing.\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "### Syntax Parser\n", "\n", "Our model produces the output in [CONLL-U format](http://universaldependencies.org/format.html)\n", "and is trained on Universal Dependency corpora, available on http://universaldependencies.org/format.html .\n", "The example usage for inference is" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "model = build_model(\"syntax_ru_syntagrus_bert\", download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tЯ\t_\t_\t_\t_\t2\tnsubj\t_\t_\n", "2\tшёл\t_\t_\t_\t_\t0\troot\t_\t_\n", "3\tдомой\t_\t_\t_\t_\t2\tadvmod\t_\t_\n", "4\tпо\t_\t_\t_\t_\t6\tcase\t_\t_\n", "5\tнезнакомой\t_\t_\t_\t_\t6\tamod\t_\t_\n", "6\tулице\t_\t_\t_\t_\t2\tobl\t_\t_\n", "7\t.\t_\t_\t_\t_\t2\tpunct\t_\t_\n", "\n", "1\tДевушка\t_\t_\t_\t_\t2\tnsubj\t_\t_\n", "2\tпела\t_\t_\t_\t_\t0\troot\t_\t_\n", "3\tв\t_\t_\t_\t_\t5\tcase\t_\t_\n", 
"4\tцерковном\t_\t_\t_\t_\t5\tamod\t_\t_\n", "5\tхоре\t_\t_\t_\t_\t2\tobl\t_\t_\n", "6\t.\t_\t_\t_\t_\t2\tpunct\t_\t_\n" ] } ], "source": [ "sentences = [\"Я шёл домой по незнакомой улице.\", \"Девушка пела в церковном хоре.\"]\n", "for parse in model(sentences):\n", " print(parse, end=\"\\n\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As prescribed by UD standards, our model writes the head information to the 7th column and the dependency\n", "information -- to the 8th. Our parser does not return morphological tags and even does not use them in\n", "training.\n", "\n", "### Joint Syntax Parser and Morphological tagger\n", "\n", "Our model in principle supports joint prediction of morphological tags and syntactic information, however, the quality of the joint model is slightly inferior to the separate ones. Therefore we release a special component that can combine the outputs of tagger and parser: `deeppavlov.models.syntax_parser.joint.JointTaggerParser`. Its sample output for the Russian language with default settings (see the configuration file `morpho_syntax_parser/ru_syntagrus_joint_parsing.json` for exact options) looks like" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model\n", "\n", "model = build_model(\"ru_syntagrus_joint_parsing\", download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tЯ\tя\tPRON\t_\tCase=Nom|Number=Sing|Person=1\t2\tnsubj\t_\t_\n", "2\tшёл\tшёл\tVERB\t_\tAspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\n", "3\tдомой\tдомой\tADV\t_\tDegree=Pos\t2\tadvmod\t_\t_\n", "4\tпо\tпо\tADP\t_\t_\t6\tcase\t_\t_\n", "5\tнезнакомой\tнезнакомый\tADJ\t_\tCase=Dat|Degree=Pos|Gender=Fem|Number=Sing\t6\tamod\t_\t_\n", 
"6\tулице\tулица\tNOUN\t_\tAnimacy=Inan|Case=Dat|Gender=Fem|Number=Sing\t2\tobl\t_\t_\n", "7\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\t_\n", "1\tДевушка\tдевушка\tNOUN\t_\tAnimacy=Anim|Case=Nom|Gender=Fem|Number=Sing\t2\tnsubj\t_\t_\n", "2\tпела\tпеть\tVERB\t_\tAspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\n", "3\tв\tв\tADP\t_\t_\t5\tcase\t_\t_\n", "4\tцерковном\tцерковном\tADJ\t_\tCase=Loc|Degree=Pos|Gender=Masc|Number=Sing\t5\tamod\t_\t_\n", "5\tхоре\tхор\tNOUN\t_\tAnimacy=Inan|Case=Loc|Gender=Masc|Number=Sing\t2\tobl\t_\t_\n", "6\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\t_\n" ] } ], "source": [ "sentences = [\"Я шёл домой по незнакомой улице.\", \"Девушка пела в церковном хоре.\"]\n", "for parse in model(sentences):\n", " print(parse, end=\"\\n\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the basic case the model outputs a human-readable string with parse data for each information. If you need\n", "to use the output in Python, consult the `deeppavlov.models.syntax_parser.joint.JointTaggerParser` and source code.\n", "\n", "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Сommand Line Interface)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact syntax_ru_syntagrus_bert -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with embeddings and all other files needed to run the model.\n", "\n", "# 5. Customize the model\n", "\n", "To train **syntax parser** on your own data, you should prepare a dataset in **CoNLL-U format**. 
The description of **CoNLL-U format** can be found [here](https://universaldependencies.org/format.html#conll-u-format).\n", "\n", "Then you should place files for training, validation and testing into the ``\"data_path\"`` directory of ``morphotagger_dataset_reader``, change file names in ``morphotagger_dataset_reader`` to your filenames and launch the training:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import train_model\n", "\n", "train_model(\"<your_syntax_parsing_config_name>\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "or **using CLI**:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov train <your_syntax_parsing_config_name>" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/models/tfidf_ranking.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Tfidf Ranking\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/features/models/tfidf_ranking.ipynb)\n", "\n", "# Table of contents \n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", "\n", "3. [Models list](#3.-Models-list)\n", "\n", "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", " 4.1. [Predict using Python](#4.1-Predict-using-Python)\n", " \n", " 4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n", "\n", "5. [Customize the model](#5.-Customize-the-model)\n", " \n", " 5.1. [Fit on Wikipedia](#5.1-Fit-on-Wikipedia)\n", " \n", " 5.2. [Download, parse new Wikipedia dump, build database and index](#5.2-Download,-parse-new-Wikipedia-dump,-build-database-and-index)\n", "\n", "# 1. 
Introduction to the task\n", "\n", "This is an implementation of a passage ranker based on tf-idf vectorization.\n", "The ranker implementation is based on the [DrQA](https://github.com/facebookresearch/DrQA/) project.\n", "The default ranker implementation takes a batch of queries as input and returns 100 passage titles sorted by relevance.\n", "\n", "# 2. Get started with the model\n", "\n", "First make sure you have the DeepPavlov Library installed.\n", "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q deeppavlov" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then make sure that all the required packages for the model are installed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!python -m deeppavlov install en_ranker_tfidf_wiki" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`en_ranker_tfidf_wiki` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page.\n", "The full list of models for tfidf ranking with their config names can be found in the [table](#3.-Models-list).\n", "\n", "# 3. Models list\n", "\n", "| Config | Language | Description | RAM |\n", "| :--- | :---: | :--- | :---: |\n", "| doc_retrieval/en_ranker_tfidf_wiki.json | En | Config for TF-IDF ranking over Wikipedia | 2.9 Gb |\n", "| doc_retrieval/en_ranker_pop_wiki.json | En | Config for TF-IDF ranking, followed by
popularity ranking, over Wikipedia | 8.1 Gb |\n", "| doc_retrieval/ru_ranker_tfidf_wiki.json | Ru | TF-IDF ranking config over Wikipedia | 8.4 Gb |\n", "\n", "# 4. Use the model for prediction\n", "\n", "## 4.1 Predict using Python\n", "\n", "### English\n", "\n", "Building (if you don't have your own data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model, configs\n", "\n", "ranker = build_model(configs.doc_retrieval.en_ranker_tfidf_wiki, download=True, install=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[18155097, 628663, 17123727, 628662, 19097375]\n" ] } ], "source": [ "result = ranker(['Who is Ivan Pavlov?'])\n", "print(result[0][:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Russian" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from deeppavlov import build_model, configs\n", "\n", "ranker = build_model(configs.doc_retrieval.ru_ranker_tfidf_wiki, download=True, install=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[4902620, 1900377, 11129584, 1720563, 1720658]\n" ] } ], "source": [ "result = ranker(['Когда произошла Куликовская битва?'])\n", "print(result[0][:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Text for the output titles can be further extracted with [deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab](https://docs.deeppavlov.ai/en/master/apiref/vocabs.html#deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab) class.\n", "\n", "## 4.2 Predict using CLI\n", "\n", "You can also get predictions in an interactive mode through CLI (Command Line Interface)." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov interact en_ranker_tfidf_wiki -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Customize the model\n", "\n", "## 5.1 Fit on Wikipedia\n", "\n", "Run the following to fit the ranker on **English** Wikipedia:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov train en_ranker_tfidf_wiki" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run the following to fit the ranker on **Russian** Wikipedia:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! python -m deeppavlov train ru_ranker_tfidf_wiki" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As a result of ranker training, a SQLite database and tf-idf matrix are created.\n", "\n", "## 5.2 Download, parse new Wikipedia dump, build database and index\n", "\n", "**enwiki.db** SQLite database consists of ~21 M Wikipedia articles and is built by the following steps:\n", "\n", "- Download a Wikipedia dump file. We took the latest\n", " [enwiki dump](https://dumps.wikimedia.org/enwiki/20230501/)\n", "\n", "- Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)\n", " (with ``--json``, ``--no-templates``, ``--filter_disambig_pages``\n", " options)\n", "\n", "- [Build a database](#5.1-Fit-on-Wikipedia).\n", "\n", "**enwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size **hash_size x number of documents** which is\n", "$2^{24}$ x 21 M. 
This matrix is built with [deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer](https://docs.deeppavlov.ai/en/master/apiref/models/vectorizers.html#deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer) class.\n", "\n", "**ruwiki.db** SQLite database consists of ~12 M Wikipedia articles and is built by the following steps:\n", "\n", "- Download a Wikipedia dump file. We took the latest [ruwiki dump](https://dumps.wikimedia.org/ruwiki/20230501/)\n", "\n", "- Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)\n", " (with ``--json``, ``--no-templates``, ``--filter_disambig_pages``\n", " options)\n", "\n", "- [Build a database](#5.1-Fit-on-Wikipedia).\n", "\n", "**ruwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size **hash_size x number of documents** which is\n", "$2^{24}$ x 12 M. This matrix is built with\n", "[deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer](https://docs.deeppavlov.ai/en/master/apiref/models/vectorizers.html#deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer) class." ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/features/overview.rst ================================================ Features ======== .. contents:: :local: Models ------ NER model :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Named Entity Recognition task in DeepPavlov is solved with BERT-based model. The models predict tags (in BIO format) for tokens in input. BERT-based model is described in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__. 
+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+ | Dataset | Lang | Model | Test F1 | +=========================================================+=======+============================================================================================+=============+ | Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 97.9 | + + +--------------------------------------------------------------------------------------------+-------------+ | (Collection 3) | | :config:`ner_rus_convers_distilrubert_2L.json ` | 88.4 ± 0.5 | + + +--------------------------------------------------------------------------------------------+-------------+ | | | :config:`ner_rus_convers_distilrubert_6L.json ` | 93.3 ± 0.3 | +---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+ | Ontonotes | Multi | :config:`ner_ontonotes_bert_mult.json ` | 88.9 | + +-------+--------------------------------------------------------------------------------------------+-------------+ | | En | :config:`ner_ontonotes_bert.json ` | 89.2 | +---------------------------------------------------------+ +--------------------------------------------------------------------------------------------+-------------+ | ConLL-2003 | | :config:`ner_conll2003_bert.json ` | 91.7 | +---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+ Classification model :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Model for classification tasks (intents, sentiment, etc) on word-level. Shallow-and-wide CNN, Deep CNN, BiLSTM, BiLSTM with self-attention and other models are presented. 
The model also allows multilabel classification of texts. Several pre-trained models are available and presented in Table below. +------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+ | Task | Dataset | Lang | Model | Metric | Valid | Test | Downloads | +==================+=====================+======+====================================================================================================+=============+==================+=================+===========+ | Insult detection | `Insults`_ | En | :config:`English BERT` | ROC-AUC | 0.9327 | 0.8602 | 1.1 Gb | +------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+ | Sentiment | `SST`_ | | :config:`5-classes SST on conversational BERT ` | Accuracy | 0.6293 | 0.6626 | 1.1 Gb | +------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+ | Sentiment | `Twitter mokoron`_ | Ru | :config:`RuWiki+Lenta emb w/o preprocessing ` | Accuracy | 0.9918 | 0.9923 | 5.8 Gb | + +---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+ | | `RuSentiment`_ | | :config:`Multi-language BERT ` | F1-weighted | 0.6787 | 0.7005 | 1.3 Gb | + + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+ | | | | :config:`Conversational RuBERT ` | | 0.739 | 0.7724 | 1.5 Gb | + + + 
+----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+ | | | | :config:`Conversational DistilRuBERT-tiny ` | | 0.703 ± 0.0031 | 0.7348 ± 0.0028 | 690 Mb | + + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+ | | | | :config:`Conversational DistilRuBERT-base ` | | 0.7376 ± 0.0045 | 0.7645 ± 0.035 | 1.0 Gb | +------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+ .. _`DSTC 2`: http://camdial.org/~mh521/dstc/ .. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines .. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary .. _`AG News`: https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .. _`Twitter mokoron`: http://study.mokoron.com/ .. _`RuSentiment`: http://text-machine.cs.uml.edu/projects/rusentiment/ .. _`Yahoo-L31`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l .. _`Yahoo-L6`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l .. _`SST`: https://nlp.stanford.edu/sentiment/index.html As no one had published intent recognition for DSTC-2 data, the comparison of the presented model is given on **SNIPS** dataset. The evaluation of model scores was conducted in the same way as in [3]_ to compare with the results from the report of the authors of the dataset. The results were achieved with tuning of parameters and embeddings trained on Reddit dataset. 
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | Model | AddToPlaylist | BookRestaurant | GetWheather | PlayMusic | RateBook | SearchCreativeWork | SearchScreeningEvent | +========================+=================+==================+===============+==============+==============+======================+========================+ | api.ai | 0.9931 | 0.9949 | 0.9935 | 0.9811 | 0.9992 | 0.9659 | 0.9801 | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | ibm.watson | 0.9931 | 0.9950 | 0.9950 | 0.9822 | 0.9996 | 0.9643 | 0.9750 | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | microsoft.luis | 0.9943 | 0.9935 | 0.9925 | 0.9815 | 0.9988 | 0.9620 | 0.9749 | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | wit.ai | 0.9877 | 0.9913 | 0.9921 | 0.9766 | 0.9977 | 0.9458 | 0.9673 | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | snips.ai | 0.9873 | 0.9921 | 0.9939 | 0.9729 | 0.9985 | 0.9455 | 0.9613 | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | recast.ai | 0.9894 | 0.9943 | 0.9910 | 0.9660 | 0.9981 | 0.9424 | 0.9539 | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | amazon.lex | 0.9930 | 0.9862 | 0.9825 | 0.9709 | 0.9981 | 0.9427 | 0.9581 | 
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ | Shallow-and-wide CNN | **0.9956** | **0.9973** | **0.9968** | **0.9871** | **0.9998** | **0.9752** | **0.9854** | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ .. [3] https://www.slideshare.net/KonstantinSavenkov/nlu-intent-detection-benchmark-by-intento-august-2017 Automatic spelling correction model :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Pipelines that use candidates search in a static dictionary and an ARPA language model to correct spelling errors. .. note:: About 4.4 GB on disc required for the Russian language model and about 7 GB for the English one. Comparison on the `test set `__ for the `SpellRuEval competition `__ on Automatic Spelling Correction for Russian: +-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ | Correction method | Precision | Recall | F-measure | Speed (sentences/s) | +=========================================================================================+===========+========+===========+=====================+ | Yandex.Speller | 83.09 | 59.86 | 69.59 | 5. 
| +-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ | :config:`Damerau Levenshtein 1 + lm` | 53.26 | 53.74 | 53.50 | 29.3 | +-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ | Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 | +-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ | JamSpell | 44.57 | 35.69 | 39.64 | 136.2 | +-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ | Hunspell | 30.30 | 34.02 | 32.06 | 20.3 | +-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ Ranking model :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Available pre-trained models for paraphrase identification: .. 
table:: :widths: auto +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+ | Dataset | Model config | Val (accuracy) | Test (accuracy) | Val (F1) | Test (F1) | Val (log_loss) | Test (log_loss) | Downloads | +========================+======================================================================================================+================+=================+============+============+================+=================+===========+ | `paraphraser.ru`_ | :config:`paraphrase_rubert ` | 89.8 | 84.2 | 92.2 | 87.4 | -- | -- | 1325M | +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+ | `paraphraser.ru`_ | :config:`paraphraser_convers_distilrubert_2L ` | 76.1 ± 0.2 | 64.5 ± 0.5 | 81.8 ± 0.2 | 73.9 ± 0.8 | -- | -- | 618M | +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+ | `paraphraser.ru`_ | :config:`paraphraser_convers_distilrubert_6L ` | 86.5 ± 0.5 | 78.9 ± 0.4 | 89.6 ± 0.3 | 83.2 ± 0.5 | -- | -- | 930M | +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+ .. _`paraphraser.ru`: https://paraphraser.ru/ References: * Yu Wu, Wei Wu, Ming Zhou, and Zhoujun Li. 2017. Sequential match network: A new architecture for multi-turn response selection in retrieval-based chatbots. In ACL, pages 372–381. 
https://www.aclweb.org/anthology/P17-1046 * Xiangyang Zhou, Lu Li, Daxiang Dong, Yi Liu, Ying Chen, Wayne Xin Zhao, Dianhai Yu and Hua Wu. 2018. Multi-Turn Response Selection for Chatbots with Deep Attention Matching Network. Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1118-1127, ACL. http://aclweb.org/anthology/P18-1103 * Chongyang Tao, Wei Wu, Can Xu, Wenpeng Hu, Dongyan Zhao, and Rui Yan. Multi-Representation Fusion Network for Multi-turn Response Selection in Retrieval-based Chatbots. In WSDM'19. https://dl.acm.org/citation.cfm?id=3290985 * Gu, Jia-Chen & Ling, Zhen-Hua & Liu, Quan. (2019). Interactive Matching Network for Multi-Turn Response Selection in Retrieval-Based Chatbots. https://arxiv.org/abs/1901.01824 TF-IDF Ranker model :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Based on `Reading Wikipedia to Answer Open-Domain Questions `__. The model solves the task of document retrieval for a given query. +---------------+-------------------------------------------------------------------+----------------------+-----------------+-----------+ | Dataset | Model | Wiki dump | Recall@5 | Downloads | +===============+===================================================================+======================+=================+===========+ | `SQuAD-v1.1`_ | :config:`doc_retrieval ` | enwiki (2018-02-11) | 75.6 | 33 GB | +---------------+-------------------------------------------------------------------+----------------------+-----------------+-----------+ Question Answering model :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Models in this section solve the task of looking for an answer to a question in a given context (`SQuAD `__ task format). There are two models for this task in DeepPavlov: BERT-based and R-Net. Both models predict answer start and end position in a given context. 
BERT-based model is described in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__. RuBERT-based model is described in `Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language `__. +----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | Dataset | Model config | lang | EM (dev) | F-1 (dev) | Downloads | +================+===============================================================================================================+=======+================+=================+=================+ | `SQuAD-v1.1`_ | :config:`DeepPavlov BERT ` | en | 81.49 | 88.86 | 1.2 Gb | +----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | `SQuAD-v2.0`_ | :config:`DeepPavlov BERT ` | en | 75.71 | 80.72 | 1.2 Gb | +----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | `SDSJ Task B`_ | :config:`DeepPavlov RuBERT ` | ru | 66.21 | 84.71 | 1.7 Mb | +----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | `SDSJ Task B`_ | :config:`DeepPavlov RuBERT, trained with tfidf-retrieved negative samples ` | ru | 66.24 | 84.71 | 1.6 Gb | +----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny ` | ru | 44.2 ± 0.46 | 65.1 ± 0.36 | 867Mb | 
+----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base ` | ru | 61.23 ± 0.42 | 80.36 ± 0.28 | 1.18Gb | +----------------+---------------------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ For the case when the answer is not necessarily present in the given context, we have the :config:`qa_squad2_bert ` model. This model outputs an empty string if there is no answer in the context. ODQA :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ An open domain question answering model. The model accepts free-form questions about the world and outputs an answer based on its Wikipedia knowledge. +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+ | Dataset | Model config | Wiki dump | F1 | Downloads | +================+====================================================================+=======================+========+===========+ | `SQuAD-v1.1`_ | :config:`ODQA ` | enwiki (2018-02-11) | 46.24 | 9.7Gb | +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+ | `SDSJ Task B`_ | :config:`ODQA with RuBERT ` | ruwiki (2018-04-01) | 37.83 | 4.3Gb | +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+ AutoML -------------------- Hyperparameters optimization :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hyperparameters optimization by cross-validation for DeepPavlov models that requires only some small changes in a config file. 
Embeddings ---------- Pre-trained embeddings :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Word vectors for the Russian language trained on joint `Russian Wikipedia `__ and `Lenta.ru `__ corpora. Examples of some models --------------------------- - Run insults detection model with console interface: .. code-block:: bash python -m deeppavlov interact insults_kaggle_bert -d - Run insults detection model with REST API: .. code-block:: bash python -m deeppavlov riseapi insults_kaggle_bert -d - Predict whether it is an insult on every line in a file: .. code-block:: bash python -m deeppavlov predict insults_kaggle_bert -d --batch-size 15 < /data/in.txt > /data/out.txt .. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250 .. _`SQuAD-v2.0`: https://arxiv.org/abs/1806.03822 .. _`SDSJ Task B`: https://arxiv.org/abs/1912.09723 ================================================ FILE: docs/features/pretrained_vectors.rst ================================================ Pre-trained embeddings ====================== BERT ---- We are publishing several pre-trained BERT models: * RuBERT for Russian language * Slavic BERT for Bulgarian, Czech, Polish, and Russian * Conversational BERT for informal English * Conversational BERT for informal Russian * Sentence Multilingual BERT for encoding sentences in 101 languages * Sentence RuBERT for encoding sentences in Russian Description of these models is available in the :doc:`BERT section ` of the docs. License ~~~~~~~ The pre-trained models are distributed under the `License Apache 2.0 `__. Downloads ~~~~~~~~~ The ``TensorFlow`` models can be run with the original `BERT repo `_ code while the ``PyTorch`` models can be run with the `HuggingFace's Transformers `__ library. 
The download links are: +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ | Description | Model parameters | Download links | +============================+=======================================+======================================================================================================================+ | RuBERT | vocab size = 120K, parameters = 180M, | `[pytorch] `__, | | | size = 632MB | `[tensorflow] `__ | +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ | Slavic BERT | vocab size = 120K, parameters = 180M, | `[pytorch] `__, | | | size = 632MB | `[tensorflow] `__ | +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ | Conversational BERT | vocab size = 30K, parameters = 110M, | `[pytorch] `__, | | | size = 385MB | `[tensorflow] `__ | +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ | Conversational RuBERT | vocab size = 120K, parameters = 180M, | `[pytorch] `__,| | | size = 630MB | `[tensorflow] `__ | +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ | Sentence Multilingual BERT | vocab size = 120K, parameters = 180M, | `[pytorch] `__, | | | size = 630MB | `[tensorflow] `__ | +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ | 
Sentence RuBERT | vocab size = 120K, parameters = 180M, | `[pytorch] `__, | | | size = 630MB | `[tensorflow] `__ | +----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+ ELMo ---- ELMo can be used via Python code as follows: .. code:: python import tensorflow as tf import tensorflow_hub as hub elmo = hub.Module("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz", trainable=True) sess = tf.Session() sess.run(tf.global_variables_initializer()) embeddings = elmo(["это предложение", "word"], signature="default", as_dict=True)["elmo"] sess.run(embeddings) TensorFlow Hub module also supports tokenized sentences in the following format. .. code:: python tokens_input = [["мама", "мыла", "раму"], ["рама", "", ""]] tokens_length = [3, 1] embeddings = elmo(inputs={"tokens": tokens_input,"sequence_len": tokens_length},signature="tokens",as_dict=True)["elmo"] sess.run(embeddings) Downloads ~~~~~~~~~ The models can be downloaded and run with the TensorFlow Hub module from: +--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Description | Dataset parameters | Perplexity | Tensorflow hub module | +====================================================================+=============================================+==================+=======================================================================================================================================================================================================================================+ | ELMo on `Russian Wikipedia `__ | lines = 1M, 
tokens = 386M, size = 5GB | 43.692 | `module_spec `__ | +--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ELMo on `Russian WMT News `__ | lines = 63M, tokens = 946M, size = 12GB | 49.876 | `module_spec `__ | +--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ELMo on `Russian Twitter `__ | lines = 104M, tokens = 810M, size = 8.5GB | 94.145 | `module_spec `__ | +--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ fastText -------- We are publishing pre-trained word vectors for Russian language. Several models were trained on joint `Russian Wikipedia `__ and `Lenta.ru `__ corpora. We also introduce one model for Russian conversational language that was trained on `Russian Twitter `__ corpus. All vectors are 300-dimensional. We used fastText skip-gram (see `Bojanowski et al. (2016) `__) for vectors training as well as various preprocessing options (see below). You can get vectors either in binary or in text (vec) formats for FastText. License ~~~~~~~ The pre-trained word vectors are distributed under the `License Apache 2.0 `__. 
Downloads ~~~~~~~~~ The pre-trained **fastText skipgram** models can be downloaded from: +-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Domain | Preprocessing | Vectors | +=======================+=========================================================+====================================================================================================================================================================================================================================================================================================================================+ | Wiki+Lenta | tokenize (nltk word\_tokenize), lemmatize (pymorphy2) | `bin `__, `vec `__ | + +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | tokenize (nltk word\_tokenize), lowercasing | `bin `__, `vec `__ | + +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | tokenize (nltk wordpunсt\_tokenize) | `bin `__, `vec `__ | + 
+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | tokenize (nltk word\_tokenize) | `bin `__, `vec `__ | + +---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | tokenize (nltk word\_tokenize), remove stopwords | `bin `__, `vec `__ | +-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Twitter | tokenize (nltk word\_tokenize) | `bin `__, `vec `__ | +-----------------------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Word vectors training parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ These word vectors were trained with following parameters ([...] 
is for default value): fastText (skipgram) - lr [0.1] - lrUpdateRate [100] - dim 300 - ws [5] - epoch [5] - neg [5] - loss [softmax] - pretrainedVectors [] - saveOutput [0] ================================================ FILE: docs/index.rst ================================================ Welcome to DeepPavlov's documentation! ====================================== .. toctree:: :glob: :maxdepth: 1 Installation QuickStart General concepts Configuration file Python pipelines Models overview .. toctree:: :glob: :maxdepth: 2 :caption: Features Pre-trained embeddings AutoML .. toctree:: :glob: :maxdepth: 1 :caption: Models Multitask BERT Context Question Answering Classification Few-shot Classification Named Entity Recognition Entity Extraction BERT-based models Morphological Tagging Neural Ranking Spelling Correction Syntactic Parsing TF-IDF Ranking Popularity Ranking Knowledge Base Question answering Relation Extraction SuperGLUE Submission Open-Domain Question Answering .. toctree:: :glob: :maxdepth: 3 :caption: Integrations REST API Socket API Amazon AWS deployment DeepPavlov settings .. toctree:: :glob: :maxdepth: 3 :caption: Developer Guides Contribution guide Register your model .. toctree:: :glob: :maxdepth: 3 :caption: Internships Internships .. toctree:: :glob: :maxdepth: 3 :caption: Package Reference apiref/* Indices and tables ================== * :ref:`genindex` * :ref:`modindex` ================================================ FILE: docs/integrations/aws_ec2.rst ================================================ Amazon AWS deployment ===================== Here is a manual for deployment DeepPavlov (with ODQA as example) in Amazon Web Services using EC2 virtual machine. Deployment process consists of two main stages: 1. AWS EC2 machine launch 2. DeepPavlov ODQA deployment 1. AWS EC2 machine launch ------------------------- 1. Login to your AWS console and proceed to the EC2 services dashboard. .. image:: ../_static/aws_ec2/01_login_to_aws.png :width: 800 2. 
Choose Ubuntu Server 18.04 LTS 64-bit x86 machine. .. image:: ../_static/aws_ec2/02_choose_ubuntu.png :width: 800 3. You should select appropriate instance type because of high memory consumption by ODQA. 32 GiB memory is a minimum. Then press *"Next: ..."* .. image:: ../_static/aws_ec2/03_select_instance_type.png :width: 800 4. Proceed to Step 4. Your instance storage size should be no less than 50 GiB to store ODQA models. .. image:: ../_static/aws_ec2/04_add_storage.png :width: 800 5. Proceed to Step 7. Check your instance parameters and press *"Launch"* button. You will be prompted to create and save security key pair for further access to your instance. .. image:: ../_static/aws_ec2/05_review_instance.png :width: 800 6. Return to your EC2 services dashboard and navigate to your running instances list. .. image:: ../_static/aws_ec2/06_go_to_running_instances.png :width: 800 7. Wait until instance initializing finishes (instance status become *"running"*). .. image:: ../_static/aws_ec2/07_wait_init.png :width: 800 8. To make DeepPavlov ODQA model rest API accessible from Internet you should set corresponding inbound security rules: 8.1 Navigate to your instance security group dashboard (in this example security group has name *"launch-wizard-2"*). .. image:: ../_static/aws_ec2/08_01_set_sec_group.png :width: 800 8.2 Select *"Inbound"* rules tab, click *"Edit"*, then click *"Add Rule"*. For your new rule select *"Custom TCP Rule"* type, *"Anywhere"* source and input port for your ODQA API. Click *"Save"*. .. image:: ../_static/aws_ec2/08_02_set_inbound.png :width: 800 9. Connecting to your instance by SSH: 9.1 Navigate to your instance dashboard, right-click your instance, select *"Connect"*. .. image:: ../_static/aws_ec2/09_01_select_connect.png :width: 800 You will be redirected to connection instructions screen for your dashboard. Follow instructions for standalone SSH client. SSH connection bash command example will already contain valid user and host name. 
To connect to your Amazon instance just run the example with valid path to your saved key pair (instead of *"dp_key_pair.pem"* in this example). .. image:: ../_static/aws_ec2/09_02_connection_info.png :width: 800 2. DeepPavlov ODQA deployment ----------------------------- 1. Login to your AWS EC2 instance. 2. For now DeepPavlov requires Python 3.6 to run. Below are instructions for DeepPavlov ODQA deployment under Ubuntu 18.04 (which has pre-installed Python 3.6) and virtualenv. 3. Install pip3: ``sudo apt update`` ``sudo apt install python3-pip`` 4. Install virtualenv: ``sudo pip3 install virtualenv`` 5. Create and activate Python 3.6 virtual enviroment: ``virtualenv env -p python3.6`` ``source env/bin/activate`` 6. Install DeepPavlov: ``pip install deeppavlov`` 7. Install ODQA dependencies: ``python -m deeppavlov install en_odqa_infer_wiki`` 8. Download ODQA models (it will take quite a time): ``python -m deeppavlov download en_odqa_infer_wiki`` 9. Run ODQA REST API service, where is port you defined in TCP inbound rules for your AWS instance: ``python -m deeppavlov riseapi en_odqa_infer_wiki -p `` 3. Accessing your ODQA API -------------------------- 1. Get your AWS instance public DNS from the instance dashboard. 2. Get full info about your ODQA API from its Swagger by navigating to following URL in your browser: ``http://:`` ================================================ FILE: docs/integrations/rest_api.rst ================================================ REST API ======== Each DeepPavlov model can be easily made available for inference as a REST web service. The general method is: .. code:: bash python -m deeppavlov riseapi [-d] [-p ] [--https] [--key ] \ [--cert ] * ``-d``: downloads model specific data before starting the service. * ``-p ``: sets the port to ````. Overrides default value from ``deeppavlov/utils/settings/server_config.json``. * ``--https``: use https instead of http. 
Overrides default value from ``deeppavlov/utils/settings/server_config.json``. * ``--key ``: path to SSL key file. Overrides default value from ``deeppavlov/utils/settings/server_config.json``. * ``--cert ``: path to SSL certificate file. Overrides default value from ``deeppavlov/utils/settings/server_config.json``. The command will print the used host and port. Default web service properties (host, port, POST request arguments) can be modified via changing ``deeppavlov/utils/settings/server_config.json`` file. .. warning:: Starting from the 1.0.0rc2 model response format in riseapi mode matches :class:`~deeppavlov.core.common.chainer.Chainer` response format. To start model with the old format, give the ``COMPATIBILITY_MODE`` environment variable any non-empty value (e.g. ``COMPATIBILITY_MODE=true python -m deeppavlov riseapi ...``). ``COMPATIBILITY_MODE`` will be removed in DeepPavlov 1.2.0. API routes ---------- /model """""" Send POST request to ``:/model`` to infer model. See details at :ref:`rest_api_docs`. /probe """""" Send POST request to ``:/probe`` to check if API is working. The server will send a response ``["Test passed"]`` if it is working. Requests to ``/probe`` are not logged. /api """" To get model argument and response names send GET request to ``:/api``. Server will return dict with model input and output names. .. _rest_api_docs: /docs """"" To interact with the REST API via graphical interface open ``:/docs`` in a browser (Swagger UI). /metrics """""""" Endpoint to monitor a running service using Prometheus. Metrics: * ``http_requests_count``: Counter, tracks number of processed requests. Labels: ``endpoint``, ``status_code``. * ``http_requests_latency_seconds``: Histogram, tracks responses latency (only with 200 status code). Labels: ``endpoint``. * ``http_requests_in_progress``: Gauge, tracks inprogress requests. Labels: ``endpoint``. 
Advanced configuration ---------------------- By modifying ``deeppavlov/utils/settings/server_config.json`` you can change host, port, POST request arguments and other properties of the API service. Properties from ``common_defaults`` section are used by default unless they are overridden by model-specific properties, provided in ``model_defaults`` section of the ``server_config.json``. Model-specific properties are bound to the model by ``server_utils`` label in ``metadata`` section of the model config. Value of ``server_utils`` label from model config should match with properties key from ``model_defaults`` section of ``server_config.json``. For example, adding ``metadata/server_utils`` key to ``kbqa/kbqa_cq.json`` with value *KBQA* will initiate the search of *KBQA* tag at ``model_defaults`` section of ``server_config.json``. Therefore, if this section is present, all parameters with non empty (i.e. not ``""``, not ``[]`` etc.) values stored by this tag will overwrite the parameter values in ``common_defaults``. If ``model_args_names`` parameter of ``server_config.json`` is empty string, then model argument names are provided as list from ``chainer/in`` section of the model config file, where arguments order corresponds to model API. When inferencing model via REST api, JSON payload keys should match model arguments names from ``chainer/in`` section. If ``model_args_names`` parameter of ``server_config.json`` is list, its values are used as model argument names instead of the list from model config's ``chainer/in`` section. 
Here are POST request payload examples for some of the library models: +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Model | POST request JSON payload example | +=========================================+=====================================================================================================================================================+ | **One argument models** | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | NER model | {"x":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Intent classification model | {"x":["I would like to go to a restaurant with Asian cuisine this evening"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Automatic spelling correction model | {"x":["errror"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Ranking model | {"x":["What is the average cost of life insurance services?"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Goal-oriented bot | {"x":["Hello, can you help me to find and book a restaurant this evening?"]} | 
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | **Multiple arguments models** | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Question Answering model | | {"context_raw":["After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies."], | | | |  "question_raw":["What strained the relationship between Great Britain and its colonies?"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ REST API Usage Example ====================== To start server with ``squad_bert`` model run: .. code:: bash python -m deeppavlov riseapi squad_bert -id To get response from this model on another terminal run: .. code:: bash curl -X POST http://0.0.0.0:5000/model -H 'Content-Type: application/json' -d '{ "context_raw": [ "All work and no play makes Jack a dull boy.", "I used to be an adventurer like you, then I took an arrow in the knee." ], "question_raw": [ "What makes Jack a dull boy?", "Who I used to be?" ] }' ================================================ FILE: docs/integrations/settings.rst ================================================ DeepPavlov settings =================== DeepPavlov provides some tools to facilitate its usage (e.g. dialog logging, settings management). This document is aimed to guide you through them. 1. Settings files access and management --------------------------------------- Most of DeepPavlov settings are located in settings files, which in turn are located in a settings folder. Default settings folder location is ``deeppavlov/utils/settings`` . 
You can override a settings directory path by setting the ``DP_SETTINGS_PATH`` environment variable. Missing files will be added automatically when running any deeppavlov script. You can get current full path to settings directory with ``python -m deeppavlov.settings``. To reset settings in the current settings directory one can use ``python -m deeppavlov.settings -d``. 2. Dialog logging ----------------- DeepPavlov supports logging of infered utterances and DeepPavlov model responses. You can manage dialog logging by editing ``dialog_logger_config.json`` file in a settings directory. Following dialog logging settings are available: 1. **enabled** (default: ``false``): turns on/off dialog logging for DeepPavlov instance; 2. **log_path** (default: ``~/.deeppavlov/dialog_logs``): sets directory where dialog logs are stored; 3. **logger_name** (default: ``default``): sets subdirectory name for storing dialog logs; 4. **logfile_max_size_kb** (default: ``10240``): sets logfile maximum size in kilobytes. If exceeded, new log file is created; 5. **ensure_ascii** (default: ``false``): If ``true``, converts all non-ASCII symbols in logged content to Unicode code points. 3. Environment variables ------------------------ - **DP_SETTINGS_PATH** — custom path to a directory that contains settings files. It's automatically populated with missing files when running any deeppavlov scripts. - **DP_SKIP_NLTK_DOWNLOAD** set to ``TRUE`` to prevent automatic downloading of **nltk** packages (``punkt``, ``stopwords``, ``perluniprops``, ``nonbreaking_prefixes``) ================================================ FILE: docs/integrations/socket_api.rst ================================================ Socket API ========== Each DeepPavlov model can be made available as a socket server. The general method is: .. code:: bash python -m deeppavlov risesocket [-d] [--socket-type ] [-p ] \ [--socket-file ] * ``-d``: downloads model specific data before starting the service. 
* ``--socket-type <address_family>``: sets socket address family to ``AF_INET`` if ``<address_family>`` is ``TCP`` or to ``AF_UNIX`` if ``<address_family>`` is ``UNIX``.
If ``model_args_names`` parameter of ``server_config.json`` is empty string, then model argument names are provided as list from ``chainer/in`` section of the model config file, where arguments order corresponds to model API. When inferencing model via socket API, serialized JSON payload keys should match model arguments names from ``chainer/in`` section. If ``model_args_names`` parameter of ``server_config.json`` is list, its values are used as model argument names instead of the list from model config's ``chainer/in`` section. +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Model | POST request JSON payload example | +=========================================+=====================================================================================================================================================+ | **One argument models** | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | NER model | {"x":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Intent classification model | {"x":["I would like to go to a restaurant with Asian cuisine this evening"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Automatic spelling correction model | {"x":["errror"]} | 
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Ranking model | {"x":["What is the average cost of life insurance services?"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Goal-oriented bot | {"x":["Hello, can you help me to find and book a restaurant this evening?"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | **Multiple arguments models** | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Question Answering model | | {"context_raw":["After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies."], | | | |  "question_raw":["What strained the relationship between Great Britain and its colonies?"]} | +-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ Socket client example (Python) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Socket client for :doc:`SQuAD ` model with a batch of two elements: .. code-block:: python # squad-client.py import json import socket from struct import unpack from deeppavlov.utils.socket import encode socket_payload = { "context_raw": [ "All work and no play makes Jack a dull boy", "I used to be an adventurer like you, then I took an arrow in the knee" ], "question_raw": [ "What makes Jack a dull boy?", "Who I used to be?" 
] } serialized_socket_payload = encode(socket_payload) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.connect(('0.0.0.0', 5000)) s.sendall(serialized_socket_payload) header = s.recv(4) body_len = unpack('`_ that best match your skills and interests. `Apply now at our website `_. ================================================ FILE: docs/intro/configuration.rst ================================================ Configuration file ================== An NLP pipeline config is a JSON file that contains one required element ``chainer``: .. code:: python { "chainer": { "in": ["x"], "in_y": ["y"], "pipe": [ ... ], "out": ["y_predicted"] } } :class:`~deeppavlov.core.common.chainer.Chainer` is a core concept of DeepPavlov library: chainer builds a pipeline from heterogeneous components (Rule-Based/ML/DL) and allows to train or infer from pipeline as a whole. Each component in the pipeline specifies its inputs and outputs as arrays of names, for example: ``"in": ["tokens", "features"]`` and ``"out": ["token_embeddings", "features_embeddings"]`` and you can chain outputs of one components with inputs of other components: .. code:: python { "class_name": "deeppavlov.models.preprocessors.str_lower:str_lower", "in": ["x"], "out": ["x_lower"] }, { "class_name": "nltk_tokenizer", "in": ["x_lower"], "out": ["x_tokens"] }, Pipeline elements could be child classes of :class:`~deeppavlov.core.models.component.Component` or functions. Each :class:`~deeppavlov.core.models.component.Component` in the pipeline must implement method :meth:`__call__` and has ``class_name`` parameter, which is its registered codename, or full name of any python class in the form of ``"module_name:ClassName"``. It can also have any other parameters which repeat its :meth:`__init__` method arguments. Default values of :meth:`__init__` arguments will be overridden with the config values during the initialization of a class instance. 
You can reuse components in the pipeline to process different parts of data with the help of ``id`` and ``ref`` parameters: .. code:: python { "class_name": "nltk_tokenizer", "id": "tokenizer", "in": ["x_lower"], "out": ["x_tokens"] }, { "ref": "tokenizer", "in": ["y"], "out": ["y_tokens"] }, Nested configuration files -------------------------- Any configuration file could be used inside another configuration file as an element of the :class:`~deeppavlov.core.common.chainer.Chainer` or as a field of another component using ``config_path`` key. Any field of the nested configuration file could be overwritten using ``overwrite`` field: .. code:: "chainer": { "pipe": { ... { "class_name": "ner_chunk_model", "ner": { "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json", "overwrite": { "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"] } }, ... } } } In this example ``ner_ontonotes_bert.json`` is used as ``ner`` argument value in ``ner_chunk_model`` component. ``chainer.out`` value is overwritten with new list. Overwritten fields names are defined using dot notation. In this notation numeric fields are treated as indexes of lists. For example, to change ``class_name`` value of the second element of the pipe to ``ner_chunker`` (1 is the index of the second element), use ``"chainer.pipe.1.class_name": "ner_chunker"`` key-value pair. Variables --------- As of *version 0.1.0* every string value in a configuration file is interpreted as a `format string `__ where fields are evaluated from ``metadata.variables`` element: .. 
code:: python { "chainer": { "in": ["x"], "pipe": [ { "class_name": "my_component", "in": ["x"], "out": ["x"], "load_path": "{MY_PATH}/file.obj" }, { "in": ["x"], "out": ["y_predicted"], "config_path": "{CONFIGS_PATH}/classifiers/insults_kaggle_bert.json" } ], "out": ["y_predicted"] }, "metadata": { "variables": { "MY_PATH": "/some/path", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" } } } Variable ``DEEPPAVLOV_PATH`` is always preset to be a path to the ``deeppavlov`` python module. One can override configuration variables using environment variables with prefix ``DP_``. So environment variable ``DP_VARIABLE_NAME`` will override ``VARIABLE_NAME`` inside a configuration file. For example, adding ``DP_ROOT_PATH=/my_path/to/large_hard_drive`` will make most configs use this path for downloading and reading embeddings/models/datasets. Training -------- There are two abstract classes for trainable components: :class:`~deeppavlov.core.models.estimator.Estimator` and :class:`~deeppavlov.core.models.nn_model.NNModel`. :class:`~deeppavlov.core.models.estimator.Estimator` are fit once on any data with no batching or early stopping, so it can be safely done at the time of pipeline initialization. :meth:`fit` method has to be implemented for each :class:`~deeppavlov.core.models.estimator.Estimator`. One example is :class:`~deeppavlov.core.data.vocab.Vocab`. :class:`~deeppavlov.core.models.nn_model.NNModel` requires more complex training. It can only be trained in a supervised mode (as opposed to :class:`~deeppavlov.core.models.estimator.Estimator` which can be trained in both supervised and unsupervised settings). This process takes multiple epochs with periodic validation and logging. :meth:`~deeppavlov.core.models.nn_model.NNModel.train_on_batch` method has to be implemented for each :class:`~deeppavlov.core.models.nn_model.NNModel`. Training is triggered by :func:`~deeppavlov.train_model` function. 
Train config ~~~~~~~~~~~~ :class:`~deeppavlov.core.models.estimator.Estimator` s that are trained should also have ``fit_on`` parameter which contains a list of input parameter names. An :class:`~deeppavlov.core.models.nn_model.NNModel` should have the ``in_y`` parameter which contains a list of ground truth answer names. For example: .. code:: python [ { "id": "classes_vocab", "class_name": "default_vocab", "fit_on": ["y"], "level": "token", "save_path": "vocabs/classes.dict", "load_path": "vocabs/classes.dict" }, { "in": ["x"], "in_y": ["y"], "out": ["y_predicted"], "class_name": "intent_model", "save_path": "classifiers/intent_cnn", "load_path": "classifiers/intent_cnn", "classes_vocab": { "ref": "classes_vocab" } } ] The config for training the pipeline should have three additional elements: ``dataset_reader``, ``dataset_iterator`` and ``train``: .. code:: python { "dataset_reader": { "class_name": ..., ... }, "dataset_iterator": { "class_name": ..., ... }, "chainer": { ... }, "train": { ... } } Simplified version of training pipeline contains two elements: ``dataset`` and ``train``. The ``dataset`` element currently can be used for train from classification data in ``csv`` and ``json`` formats. Train Parameters ~~~~~~~~~~~~~~~~ ``train`` element can contain a ``class_name`` parameter that references a trainer class (default value is :class:`torch_trainer `). All other parameters will be passed as keyword arguments to the trainer class's constructor. Metrics _______ .. code:: python "train": { "class_name": "torch_trainer", "metrics": [ "f1", { "name": "accuracy", "inputs": ["y", "y_labels"] }, { "name": "sklearn.metrics:accuracy_score", "alias": "unnormalized_accuracy", "inputs": ["y", "y_labels"], "normalize": false } ], ... } The first metric in the list is used for early stopping. 
Each metric can be described as a JSON object with ``name``, ``alias`` and ``inputs`` properties, where: - ``name`` is either a registered name of a metric function or ``module.submodules:function_name``. - ``alias`` is a metric name. Default value is ``name`` value. - ``inputs`` is a list of parameter names from chainer's inner memory that will be passed to the metric function. Default value is a concatenation of chainer's ``in_y`` and ``out`` parameters. All other arguments are interpreted as kwargs when the metric is called. If a metric is given as a string, this string is interpreted as a metric name, i.e. ``"f1"`` in the example above is equivalent to ``{"name": "f1"}``. DatasetReader ~~~~~~~~~~~~~ :class:`~deeppavlov.core.data.dataset_reader.DatasetReader` class reads data and returns it in a specified format. A concrete :class:`DatasetReader` class should be inherited from this base class and registered with a codename: .. code:: python from deeppavlov.core.common.registry import register from deeppavlov.core.data.dataset_reader import DatasetReader @register('conll2003_reader') class Conll2003DatasetReader(DatasetReader): DataLearningIterator and DataFittingIterator ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` forms the sets of data ('train', 'valid', 'test') needed for training/inference and divides them into batches. A concrete :class:`DataLearningIterator` class should be registered and can be inherited from :class:`deeppavlov.core.data.data_learning_iterator.DataLearningIterator` class. This is a base class and can be used as a :class:`DataLearningIterator` as well. :class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator` iterates over provided dataset without train/valid/test splitting and is useful for :class:`~deeppavlov.core.models.estimator.Estimator` s that do not require training.
Inference --------- All components inherited from :class:`~deeppavlov.core.models.component.Component` abstract class can be used for inference. The :meth:`__call__` method should return standard output of a component. For example, a `tokenizer` should return `tokens`, a `NER recognizer` should return `recognized entities`, a `bot` should return an `utterance`. A particular format of returned data should be defined in :meth:`__call__`. Inference is triggered by :func:`~deeppavlov.core.commands.infer.interact_model` function. There is no need for a separate JSON for inference. Model Configuration ------------------- Each DeepPavlov model is determined by its configuration file. You can use existing config files or create yours. You can also choose a config file and modify preprocessors/tokenizers/embedders/vectorizers there. The components below have the same interface and are responsible for the same functions, therefore they can be used in the same parts of a config pipeline. Here is a list of useful :class:`~deeppavlov.core.models.component.Component`\ s aimed to preprocess, postprocess and vectorize your data. Preprocessors ~~~~~~~~~~~~~ Preprocessor is a component that processes batch of samples. * Already implemented universal preprocessors of **tokenized texts** (each sample is a list of tokens): - :class:`~deeppavlov.models.preprocessors.mask.Mask` (registered as ``mask``) returns binary mask of corresponding length (padding up to the maximum length per batch). - :class:`~deeppavlov.models.preprocessors.sanitizer.Sanitizer` (registered as ``sanitizer``) removes all combining characters like diacritical marks from tokens.
* Already implemented universal preprocessors of **non-tokenized texts** (each sample is a string): - :class:`~deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor` (registered as ``dirty_comments_preprocessor``) preprocesses samples converting samples to lowercase, paraphrasing English combinations with apostrophe ``'``, transforming more than three the same symbols to two symbols. - :meth:`~deeppavlov.models.preprocessors.str_lower.str_lower` converts samples to lowercase. * Already implemented universal preprocessors of another type of features: - :class:`~deeppavlov.models.preprocessors.one_hotter.OneHotter` (registered as ``one_hotter``) performs one-hotting operation for the batch of samples where each sample is an integer label or a list of integer labels (can be combined in one batch). If ``multi_label`` parameter is set to ``True``, returns one one-dimensional vector per sample with several elements equal to ``1``. Tokenizers ~~~~~~~~~~ Tokenizer is a component that processes batch of samples (each sample is a text string). - :class:`~deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer` (registered as ``nltk_tokenizer``) tokenizes using tokenizers from ``nltk.tokenize``, e.g. ``nltk.tokenize.wordpunct_tokenize``. - :class:`~deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer` (registered as ``nltk_moses_tokenizer``) tokenizes and detokenizes using ``nltk.tokenize.moses.MosesDetokenizer``, ``nltk.tokenize.moses.MosesTokenizer``. - :class:`~deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer` (registered as ``stream_spacy_tokenizer``) tokenizes or lemmatizes texts with spacy ``en_core_web_sm`` models by default. - :class:`~deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer` (registered as ``split_tokenizer``) tokenizes using string method ``split``. 
Embedders ~~~~~~~~~ Embedder is a component that converts every token in a tokenized batch to a vector of a particular dimension (optionally, returns a single vector per sample). - :class:`~deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder` (registered as ``fasttext``) reads embedding file in fastText format. If ``mean`` is set, returns one vector per sample - mean of embedding vectors of tokens. - :class:`~deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder` (registered as ``tfidf_weighted``) accepts embedder, tokenizer (for detokenization, by default, detokenize with joining with space), TFIDF vectorizer or counter vocabulary, optionally accepts tags vocabulary (to assign additional multiplicative weights to particular tags). If ``mean`` is set, returns one vector per sample - mean of embedding vectors of tokens. Vectorizers ~~~~~~~~~~~ Vectorizer is a component that converts batch of text samples to batch of vectors. - :class:`~deeppavlov.models.sklearn.sklearn_component.SklearnComponent` (registered as ``sklearn_component``) is a DeepPavlov wrapper for most of sklearn estimators, vectorizers etc. For example, to get TFIDF-vectorizer one should assign in config ``model_class`` to ``sklearn.feature_extraction.text:TfidfVectorizer``, ``infer_method`` to ``transform``, pass ``load_path``, ``save_path`` and other sklearn model parameters. - :class:`~deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer` (registered as ``hashing_tfidf_vectorizer``) implements hashing version of usual TFIDF-vectorizer. It creates a TFIDF matrix from collection of documents of size ``[n_documents X n_features(hash_size)]``. ================================================ FILE: docs/intro/installation.rst ================================================ Installation ============ DeepPavlov supports **Linux**, **Windows 10+** (through WSL/WSL2), **MacOS** (Big Sur+) platforms, **Python 3.6-3.11**.
Depending on the model used, you may need from 4 to 16 GB RAM. Install with pip ~~~~~~~~~~~~~~~~ You should install DeepPavlov in a `virtual environment `_. If you’re unfamiliar with Python virtual environments, take a look at this `guide `_. A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies. #. Create a virtual environment: .. code:: bash python -m venv env #. Activate the virtual environment on Linux (`source` could be replaced with `.`): .. code:: bash source env/bin/activate #. Install DeepPavlov inside this virtual environment: .. code:: bash pip install deeppavlov Install from source ~~~~~~~~~~~~~~~~~~~ Install DeepPavlov **dev** branch from source with the following command: .. code:: bash pip install git+http://github.com/deeppavlov/DeepPavlov@dev This command installs the bleeding edge dev version rather than the latest release version. The dev version is useful for staying up-to-date with the latest developments. For instance, if a bug has been fixed since the last release but a new release hasn’t been rolled out yet. However, this means the dev version may not always be stable. Editable install ~~~~~~~~~~~~~~~~ You will need an editable install if you want to make changes in the DeepPavlov source code that immediately take place without requiring a new installation. Clone the repository and install DeepPavlov with the following commands: .. code:: bash git clone http://github.com/deeppavlov/DeepPavlov.git pip install -e DeepPavlov Docker Images ~~~~~~~~~~~~~ We have built several DeepPavlov based Docker images, which include: * DeepPavlov based Jupyter notebook Docker image; * Docker images which serve some of our models and allow to access them via REST API (:doc:`riseapi ` mode). Here is our `DockerHub repository `_ with images and deployment instructions. 
================================================ FILE: docs/intro/overview.rst ================================================ Conceptual overview =================== Our goal is to enable AI-application developers and researchers with: - A set of pre-trained NLP models, pre-defined dialog system components (ML/DL/Rule-based), and pipeline templates; - A framework for implementing and testing their own dialog models; - Tools for application integration with adjacent infrastructure (messengers, helpdesk software, etc.); - Benchmarking environments for conversational models and uniform access to relevant datasets. .. image:: ../_static/dp_agnt_diag.png Key Concepts ------------ - A ``Model`` is any NLP model that doesn't necessarily communicate with the user in natural language. - A ``Component`` is a reusable functional part of a ``Model``. - ``Rule-based Models`` cannot be trained. - ``Machine Learning Models`` can be trained only stand alone. - ``Deep Learning Models`` can be trained independently and in an end-to-end mode being joined in a chain. - A ``Chainer`` builds a model pipeline from heterogeneous components (Rule-based/ML/DL). It allows one to train and infer models in a pipeline as a whole. The smallest building block of the library is a ``Component``. A ``Component`` stands for any kind of function in an NLP pipeline. It can be implemented as a neural network, a non-neural ML model, or a rule-based system. ``Component``\ s can be joined into a ``Model``. A ``Model`` solves a larger NLP task than a ``Component``. However, in terms of implementation, ``Model``\ s are not different from ``Component``\ s. Most of DeepPavlov models are built on top of `PyTorch `__. Other external libraries can be used to build basic components.
================================================ FILE: docs/intro/python.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "6d5cd16b", "metadata": {}, "source": [ "#### Python pipelines" ] }, { "cell_type": "markdown", "id": "da10fd80", "metadata": {}, "source": [ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deeppavlov/DeepPavlov/blob/master/docs/intro/python.ipynb)\n" ] }, { "cell_type": "markdown", "id": "d55ebe35", "metadata": {}, "source": [ "Python models could be used without .json configuration files.\n", "\n", "The code below is an alternative to building [insults_kaggle_bert](https://github.com/deepmipt/DeepPavlov/blob/master/deeppavlov/configs/classifiers/insults_kaggle_bert.json) model and using it with\n", "\n", "```python\n", "from deeppavlov import build_model\n", "\n", "model = build_model('insults_kaggle_bert', download=True)\n", "```" ] }, { "cell_type": "markdown", "id": "fa1db63b", "metadata": {}, "source": [ "At first, define variables for model components and download model data." ] }, { "cell_type": "code", "execution_count": null, "id": "9d6671e2", "metadata": {}, "outputs": [], "source": [ "from deeppavlov.core.commands.utils import expand_path\n", "from deeppavlov.download import download_resource\n", "\n", "\n", "classifiers_path = expand_path('~/.deeppavlov/models/classifiers')\n", "model_path = classifiers_path / 'insults_kaggle_torch_bert'\n", "transformer_name = 'bert-base-uncased'\n", "\n", "download_resource(\n", " 'http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz',\n", " {classifiers_path}\n", ")\n" ] }, { "cell_type": "markdown", "id": "332d644e", "metadata": {}, "source": [ "Then, initialize model components." 
] }, { "cell_type": "code", "execution_count": null, "id": "809c31ad", "metadata": {}, "outputs": [], "source": [ "from deeppavlov.core.data.simple_vocab import SimpleVocabulary\n", "from deeppavlov.models.classifiers.proba2labels import Proba2Labels\n", "from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor\n", "from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel\n", "\n", "\n", "preprocessor = TorchTransformersPreprocessor(\n", " vocab_file=transformer_name,\n", " max_seq_length=64\n", ")\n", "\n", "classes_vocab = SimpleVocabulary(\n", " load_path=model_path/'classes.dict',\n", " save_path=model_path/'classes.dict'\n", ")\n", "\n", "classifier = TorchTransformersClassifierModel(\n", " n_classes=classes_vocab.len,\n", " return_probas=True,\n", " pretrained_bert=transformer_name,\n", " save_path=model_path/'model',\n", " optimizer_parameters={'lr': 1e-05}\n", ")\n", "\n", "proba2labels = Proba2Labels(max_proba=True)" ] }, { "cell_type": "markdown", "id": "87e8ec20", "metadata": {}, "source": [ "Finally, create model from components. ``Element`` is a wrapper for a component. ``Element`` receives the component and the names of the incoming and outgoing arguments. ``Model`` combines ``Element``s into pipeline." 
] }, { "cell_type": "code", "execution_count": null, "id": "acfe29de", "metadata": {}, "outputs": [], "source": [ "from deeppavlov import Element, Model\n", "\n", "model = Model(\n", " x=['x'],\n", " out=['y_pred_labels'],\n", " pipe=[\n", " Element(component=preprocessor, x=['x'], out=['bert_features']),\n", " Element(component=classifier, x=['bert_features'], out=['y_pred_probas']),\n", " Element(component=proba2labels, x=['y_pred_probas'], out=['y_pred_ids']),\n", " Element(component=classes_vocab, x=['y_pred_ids'], out=['y_pred_labels'])\n", " ]\n", ")\n", "\n", "model(['you are stupid', 'you are smart'])" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: docs/intro/quick_start.rst ================================================ QuickStart ------------ First, follow instructions on :doc:`Installation page ` to install ``deeppavlov`` package for Python 3.6-3.11. DeepPavlov contains a bunch of great pre-trained NLP models. Each model is determined by its config file. List of models is available on :doc:`the doc page ` or in the ``deeppavlov.configs``: .. code:: python from deeppavlov import configs When you've decided on the model (+ config file), there are two ways to train, evaluate and infer it: * via `Command line interface (CLI)`_ and * via `Python`_. Before making choice of an interface, install model's package requirements (CLI): .. code:: bash python -m deeppavlov install * where ```` is model name without ``.json`` extension (e.g. ``insults_kaggle_bert``) or path to the chosen model's config file (e.g. ``deeppavlov/configs/classifiers/insults_kaggle_bert.json``) Command line interface (CLI) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To get predictions from a model interactively through CLI, run .. code:: bash python -m deeppavlov interact [-d] [-i] * ``-d`` downloads required data -- pretrained model files and embeddings (optional). * ``-i`` installs model requirements (optional). 
You can train it in the same simple way: .. code:: bash python -m deeppavlov train [-d] [-i] Dataset will be downloaded regardless of whether there was ``-d`` flag or not. To train on your own data, you need to modify dataset reader path in the `train section doc `__. The data format is specified in the corresponding model doc page. There are even more actions you can perform with configs: .. code:: bash python -m deeppavlov [-d] [-i] * ```` can be * ``install`` to install model requirements (same as ``-i``), * ``download`` to download model's data (same as ``-d``), * ``train`` to train the model on the data specified in the config file, * ``evaluate`` to calculate metrics on the same dataset, * ``interact`` to interact via CLI, * ``riseapi`` to run a REST API server (see :doc:`docs `), * ``risesocket`` to run a socket API server (see :doc:`docs `), * ``predict`` to get prediction for samples from ``stdin`` or from ```` if ``-f `` is specified. * ```` specifies path (or name) of model's config file * ``-d`` downloads required data * ``-i`` installs model requirements Python ~~~~~~ To get predictions from a model interactively through Python, run .. code:: python from deeppavlov import build_model model = build_model(, install=True, download=True) # get predictions for 'input_text1', 'input_text2' model(['input_text1', 'input_text2']) where * ``install=True`` installs model requirements (optional), * ``download=True`` downloads required data from web -- pretrained model files and embeddings (optional), * ```` is path to the chosen model's config file (e.g. ``"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json"``) or ``deeppavlov.configs`` attribute (e.g. ``deeppavlov.configs.ner.ner_ontonotes_bert_mult`` without quotation marks). You can train it in the same simple way: .. 
code:: python from deeppavlov import train_model model = train_model(, install=True, download=True) * ``download=True`` downloads pretrained model, therefore the pretrained model will be, first, loaded and then trained (optional). Dataset will be downloaded regardless of whether there was ``-d`` flag or not. To train on your own data, you need to modify dataset reader path in the `train section doc `__. The data format is specified in the corresponding model doc page. You can also calculate metrics on the dataset specified in your config file: .. code:: python from deeppavlov import evaluate_model model = evaluate_model(, install=True, download=True) Using GPU ~~~~~~~~~ To run or train **PyTorch**-based DeepPavlov models on GPU you should have `CUDA `__ installed on your host machine, and install model's package requirements. CUDA version should be compatible with DeepPavlov :dp_file:`required PyTorch version `. GPU with Pascal or newer architecture and 4+ GB VRAM is recommended. .. warning:: If you use latest NVIDIA architecture, PyTorch installed from PyPI using DeepPavlov may not support your device CUDA capability. You will receive incompatible device warning after model initialization. You can install compatible package from `download.pytorch.org `_. For example: .. code:: bash pip3 install torch==1.8.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html If you want to run the code on GPU, just make the device visible for the script. If you want to use a particular device, you may set it in command line: .. code:: bash export CUDA_VISIBLE_DEVICES=3; python -m deeppavlov train or in Python script: .. code:: python import os os.environ["CUDA_VISIBLE_DEVICES"]="3" In case you want to keep GPU visible but disable GPU acceleration for specific component, use ``device`` parameter (available for :class:`~deeppavlov.core.models.torch_model.TorchModel` child classes): ``"device": "cpu"``.
Pretrained models ~~~~~~~~~~~~~~~~~ DeepPavlov provides a wide range of pretrained models. See :doc:`features overview ` for more info. Please note that most of our models are trained on specific datasets for specific tasks and may require further training on your data. You can find a list of our out-of-the-box models `below <#out-of-the-box-pretrained-models>`_. Docker images ~~~~~~~~~~~~~ You can run DeepPavlov models in :doc:`riseapi ` mode or start Jupyter server via Docker without installing DeepPavlov. Both your CPU and GPU (we support NVIDIA graphic processors) can be utilised, please refer our `Docker `_ images run instructions. Out-of-the-box pretrained models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While the best way to solve most of the NLP tasks lies through collecting datasets and training models according to the domain and an actual task itself, DeepPavlov offers several pretrained models, which can be strong baselines for a wide range of tasks. You can run these models `via Docker <#docker-images>`_ or in ``riseapi``/``risesocket`` mode to use in solutions. See :doc:`riseapi ` and :doc:`risesocket ` modes documentation for API details. Text Question Answering ======================= Text Question Answering component answers a question based on a given context (e.g, a paragraph of text), where the answer to the question is a segment of the context. .. code:: python from deeppavlov import build_model model = build_model('squad_bert', download=True, install=True) contexts = ['DeepPavlov is a library for NLP and dialog systems.', 'All work and no play makes Jack a dull boy'] questions = ['What is DeepPavlov?', 'What makes Jack a dull boy?'] answer, answers_start_idx, score = model(contexts, questions) print(answer) .. code:: bash ['a library for NLP and dialog systems', 'All work and no play'] To get list of available models for Text Question Answering see :doc:`documentation `. 
Open-Domain Question Answering ============================== Open Domain Question Answering (ODQA) answers any question based on the document collection covering a wide range of topics. The ODQA task combines two challenges of document retrieval (finding the relevant articles) with that of machine comprehension of text (identifying the answer span from those articles). This component can be used to answer questions based on the company knowledge base. .. code:: python from deeppavlov import build_model model = build_model('en_odqa_infer_wiki', download=True, install=True) questions = ["What is the name of Darth Vader's son?", 'Who was the first president of France?'] answer, answer_score, answer_place = model(questions) print(answer) .. code:: bash ['Luke Skywalker', 'Louis-Napoleon Bonaparte'] To get list of available models for Open-Domain Question Answering see :doc:`documentation `. Knowledge Base Question Answering ================================= Knowledge Base Question Answering (KBQA) answers any question based on Knowledge Base (Knowledge Graph) - a comprehensive repository of information about a given domain or a number of domains that reflects the ways we model knowledge about a given subject or subjects, in terms of concepts, entities, properties, and relationships. KBQA models validate questions against a preconfigured list of question templates, disambiguate entities using Entity Linking, and answer questions asked in natural language. .. code:: python from deeppavlov import build_model model = build_model('kbqa_cq_en', download=True, install=True) questions = ['What is the currency of Sweden?', 'When did the Korean War end?'] answers, answer_ids, query = model(questions) print(answers) .. code:: bash ['Swedish krona', '27 July 1953'] To get list of available models for Knowledge Base Question Answering see :doc:`documentation `. 
Classification (insult and paraphrase detection, sentiment analysis, topic classification) ========================================================================================== Insult detection predicts whether a text (e.g, post or speech in some public discussion) is considered insulting to one of the persons it is related to. Sentiment analysis is a task of classifying the polarity of the given sequence. The models trained for the paraphrase detection task identify whether two sentences expressed with different words convey the same meaning. Topic classification refers to the task of classifying an utterance by the topic which belongs to the conversational domain. .. code:: python from deeppavlov import build_model model = build_model('insults_kaggle_bert', download=True, install=True) phrases = ['You are kind of stupid', 'You are a wonderful person!'] labels = model(phrases) print(labels) .. code:: bash ['Insult', 'Not Insult'] To get list of available models for Classification see :doc:`documentation `. Name Entity Recognition ======================= Named Entity Recognition (NER) classifies tokens in text into predefined categories (tags), such as person names, quantity expressions, percentage expressions, names of locations, organizations, as well as expression of time, currency and others. .. code:: python from deeppavlov import build_model model = build_model('ner_ontonotes_bert', download=True, install=True) phrases = ['Bob Ross lived in Florida', 'Elon Musk founded Tesla'] tokens, tags = model(phrases) print(tokens, tags, sep='\n') .. code:: bash [['Bob', 'Ross', 'lived', 'in', 'Florida'], ['Elon', 'Musk', 'founded', 'Tesla']] [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE'], ['B-PERSON', 'I-PERSON', 'O', 'B-ORG']] To get list of available models for Name Entity Recognition see :doc:`documentation `. Entity Extraction ================= Entity Detection is the task of identifying entity mentions in text with corresponding entity types.
Entity Linking is the task of finding knowledge base entity ids for entity mentions in text. Entity Extraction configs perform subsequent Entity Detection and Entity Linking of extracted entity mentions. .. code:: python from deeppavlov import build_model model = build_model('entity_extraction_en', download=True, install=True) phrases = ['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'] entity_substr, tags, entity_offsets, entity_ids, entity_conf, entity_pages, entity_labels = model(phrases) print(entity_substr, tags, entity_ids, entity_labels, sep='\n') .. code:: bash [['forrest gump', 'robert zemeckis', 'eric roth']] [['WORK_OF_ART', 'PERSON', 'PERSON']] [[['Q134773', 'Q552213', 'Q12016774'], ['Q187364', 'Q36951156'], ['Q942932', 'Q89320386', 'Q89909683']]] [[['Forrest Gump', 'Forrest Gump', 'Forrest Gump'], ['Robert Zemeckis', 'Welcome to Marwen'], ['Eric Roth', 'Eric Roth', 'Eric W Roth']]] To get list of available models for Entity Extraction see :doc:`documentation `. Spelling Correction =================== Spelling Correction models detect and correct spelling errors in texts. .. code:: python from deeppavlov import build_model model = build_model('brillmoore_wikitypos_en', download=True, install=True) phrases_w_typos = ['I think this is the begining of a beautifull frendship.', "I'll be bak"] correct_phrases = model(phrases_w_typos) print(correct_phrases) .. code:: bash ['i think this is the beginning of a beautiful friendship.', "i'll be back"] To get list of available models for Spelling Correction see :doc:`documentation `. 
================================================ FILE: requirements.txt ================================================ fastapi>=0.47.0,<=0.89.1 filelock>=3.0.0,<3.10.0 nltk>=3.2.4,<3.10.0 numpy<1.24 pandas>=1.0.0,<1.6.0 prometheus-client>=0.13.0,<=1.16.0 pydantic<2 pybind11==2.10.3 requests>=2.19.0,<3.0.0 scikit-learn>=0.24,<1.1.0;python_version<="3.10" scikit-learn==1.4.0;python_version=="3.11.*" tqdm>=4.42.0,<4.65.0 uvicorn>=0.13.0,<0.19.0 wheel scipy<1.10.0;python_version<"3.8" scipy==1.10.0;python_version>="3.8" ================================================ FILE: setup.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
def read_requirements():
    """Parse install requirements from ``requirements.txt``.

    Blank lines and lines starting with ``#`` are skipped. An entry that
    contains ``://`` is treated as a dependency link; every other entry is
    treated as a requirement specifier.

    Returns:
        A dict with the keys ``install_requires`` and ``dependency_links``,
        suitable for unpacking into ``setup()``.
    """
    reqs_path = os.path.join(__location__, 'requirements.txt')
    names = []
    links = []
    with open(reqs_path, encoding='utf8') as f:
        for line in f:
            req = line.strip()
            # An empty string is not a requirement specifier, so blank lines
            # must not leak into ``install_requires``.
            if not req or req.startswith('#'):
                continue
            if '://' in req:
                links.append(req)
            else:
                names.append(req)
    return {'install_requires': names, 'dependency_links': links}


def readme():
    """Return the text of ``README.md`` with relative references made absolute.

    Markdown link targets that do not start with ``http://`` or ``https://``
    are prefixed with the GitHub ``blob/master`` URL, and relative ``src="…"``
    attributes are prefixed with the raw-content URL.

    Returns:
        The rewritten README text.
    """
    readme_path = os.path.join(__location__, 'README.md')
    with open(readme_path, encoding='utf8') as f:
        text = f.read()
    # Markdown links: "](relative/path" -> "](https://github.com/.../relative/path"
    text = re.sub(r']\((?!https?://)',
                  r'](https://github.com/deeppavlov/DeepPavlov/blob/master/',
                  text)
    # HTML attributes: ' src="relative' -> ' src="https://raw.githubusercontent.com/.../relative'
    text = re.sub(r'\ssrc="(?!https?://)',
                  r' src="https://raw.githubusercontent.com/deeppavlov/DeepPavlov/master/',
                  text)
    return text
'nbsphinx==0.8.4;python_version<="3.10"', 'nbsphinx==0.9.3;python_version=="3.11.*"', 'ipykernel==5.5.4', 'jinja2<=3.0.3', 'sphinx-copybutton==0.5.0', 'pandoc==2.3', 'ipython_genutils==0.2.0' ], 's3': [ 'boto3' ] }, **read_requirements() ) ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/test_configs/doc_retrieval/en_ranker_pop_wiki_test.json ================================================ { "dataset_reader": { "class_name": "odqa_reader", "data_path": "{DOWNLOADS_PATH}/odqa/enwiki_test", "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db", "dataset_format": "txt" }, "dataset_iterator": { "class_name": "sqlite_iterator", "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db" }, "chainer": { "in": [ "docs" ], "in_y": [ "doc_ids", "doc_nums" ], "out": [ "pop_doc_ids" ], "pipe": [ { "class_name": "hashing_tfidf_vectorizer", "id": "vectorizer", "fit_on": [ "docs", "doc_ids", "doc_nums" ], "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz", "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz", "tokenizer": { "class_name": "stream_spacy_tokenizer", "lemmas": true, "ngram_range": [ 1, 2 ] } }, { "class_name": "tfidf_ranker", "top_n": 20, "in": [ "docs" ], "out": [ "tfidf_doc_ids", "tfidf_doc_scores" ], "vectorizer": "#vectorizer" }, { "class_name": "pop_ranker", "pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json", "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib", "top_n": 10, "in": [ "tfidf_doc_ids", "tfidf_doc_scores" ], "out": [ "pop_doc_ids", "pop_doc_scores" ] } ] }, "train": { "batch_size": 10000, "evaluation_targets": [], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz", 
"subdir": "{DOWNLOADS_PATH}/odqa" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib", "subdir": "{MODELS_PATH}/odqa" } ] } } ================================================ FILE: tests/test_configs/doc_retrieval/en_ranker_tfidf_wiki_test.json ================================================ { "dataset_reader": { "class_name": "odqa_reader", "data_path": "{DOWNLOADS_PATH}/odqa/enwiki_test", "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db", "dataset_format": "txt" }, "dataset_iterator": { "class_name": "sqlite_iterator", "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db" }, "chainer": { "in": [ "docs" ], "in_y": [ "doc_ids", "doc_nums" ], "out": [ "tfidf_doc_ids" ], "pipe": [ { "class_name": "hashing_tfidf_vectorizer", "id": "vectorizer", "fit_on": [ "docs", "doc_ids", "doc_nums" ], "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz", "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz", "tokenizer": { "class_name": "stream_spacy_tokenizer", "lemmas": true, "ngram_range": [ 1, 2 ] } }, { "class_name": "tfidf_ranker", "top_n": 20, "in": [ "docs" ], "out": [ "tfidf_doc_ids", "tfidf_doc_scores" ], "vectorizer": "#vectorizer" } ] }, "train": { "batch_size": 2, "evaluation_targets": [], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" } ] } } ================================================ FILE: tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json ================================================ { "dataset_reader": { "class_name": "odqa_reader", "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test", "save_path": 
"{DOWNLOADS_PATH}/odqa/ruwiki_test.db", "dataset_format": "txt" }, "dataset_iterator": { "class_name": "sqlite_iterator", "shuffle": false, "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db" }, "chainer": { "in": [ "docs" ], "in_y": [ "doc_ids", "doc_nums" ], "out": [ "tfidf_doc_ids" ], "pipe": [ { "class_name": "hashing_tfidf_vectorizer", "id": "vectorizer", "fit_on": [ "docs", "doc_ids", "doc_nums" ], "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz", "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz", "tokenizer": { "class_name": "stream_spacy_tokenizer", "spacy_model": "ru_core_news_sm", "lemmas": true, "lowercase": true, "filter_stopwords": true, "ngram_range": [ 1, 2 ] } }, { "class_name": "tfidf_ranker", "top_n": 20, "in": [ "docs" ], "out": [ "tfidf_doc_ids", "tfidf_doc_scores" ], "vectorizer": "#vectorizer" } ] }, "train": { "batch_size": 2, "evaluation_targets": [], "class_name": "fit_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models" }, "download": [ { "url": "http://files.deeppavlov.ai/datasets/wikipedia/ruwiki_test.tar.gz", "subdir": "{DOWNLOADS_PATH}/odqa" } ] } } ================================================ FILE: tests/test_quick_start.py ================================================ import io import json import logging import os import shutil import signal import socket import sys from concurrent.futures import ProcessPoolExecutor from pathlib import Path from struct import unpack from time import sleep from typing import Optional, Union from urllib.parse import urljoin import pexpect import pexpect.popen_spawn import pytest import requests import deeppavlov from deeppavlov import build_model from deeppavlov.core.commands.utils import parse_config, parse_value_with_config from deeppavlov.core.common.aliases import ALIASES from deeppavlov.core.data.utils import get_all_elems_from_json from deeppavlov.download import 
deep_download from deeppavlov.utils.server import get_server_params from deeppavlov.utils.socket import encode tests_dir = Path(__file__).parent test_configs_path = tests_dir / "deeppavlov" / "configs" src_dir = Path(deeppavlov.__path__[0]) / "configs" test_src_dir = tests_dir / "test_configs" download_path = tests_dir / "download" cache_dir: Optional[Path] = None if not os.getenv('DP_PYTEST_NO_CACHE'): cache_dir = tests_dir / 'download_cache' SKIP_TF = os.getenv('SKIP_TF', False) api_port = os.getenv('DP_PYTEST_API_PORT') if api_port is not None: api_port = int(api_port) TEST_MODES = ['IP', # test_inferring_pretrained_model 'TI', # test_consecutive_training_and_inferring ] ALL_MODES = ('IP', 'TI') ONE_ARGUMENT_INFER_CHECK = ('Dummy text', None) TWO_ARGUMENTS_INFER_CHECK = ('Dummy text', 'Dummy text', None) FOUR_ARGUMENTS_INFER_CHECK = ('Dummy text', 'Dummy text', 'Dummy text', 'Dummy_text', None) LIST_ARGUMENTS_INFER_CHECK = (['Dummy text', 'Dummy text'], ['Dummy text', 'Dummy text'], None) RECORD_ARGUMENTS_INFER_CHECK = ("Index", "Dummy query text", "Dummy passage text", "Dummy entity", 1, None) # Mapping from model name to config-model_dir-ispretrained and corresponding queries-response list. 
PARAMS = { "relation_extraction": { ("relation_extraction/re_docred.json", "relation_extraction", ('IP',)): [ ( [["Barack", "Obama", "is", "married", "to", "Michelle", "Obama", ",", "born", "Michelle", "Robinson", "."]], [[[(0, 2)], [(5, 7), (9, 11)]]], [["PER", "PER"]], ( 'P26', 'spouse' ) ) ], ("relation_extraction/re_rured.json", "relation_extraction", ('IP',)): [ ( [["Илон", "Маск", "живет", "в", "Сиэттле", "."]], [[[(0, 2)], [(4, 6)]]], [["PERSON", "CITY"]], ( 'P495', 'страна происхождения' ) ), ] }, "faq": { ("faq/fasttext_logreg.json", "fasttext_logreg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], # TODO: add ru test }, "spelling_correction": { ("spelling_correction/brillmoore_wikitypos_en.json", "error_model", ALL_MODES): [ ("helllo", ("hello",)), ("datha", ("data",)) ], ("spelling_correction/levenshtein_corrector_ru.json", "error_model", ('IP',)): [ ("преветствую", ("приветствую",)), ("Я джва года хочу такую игру", ("я два года хочу такую игру",)) ] }, "classifiers": { ("classifiers/paraphraser_rubert.json", "classifiers", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/insults_kaggle_bert.json", "classifiers", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/rusentiment_bert.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/sentiment_twitter.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/sentiment_sst_conv_bert.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/glue/glue_mrpc_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/glue/glue_stsb_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/glue/glue_mnli_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/glue/glue_rte_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/glue/glue_cola_roberta.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], 
("classifiers/glue/glue_qnli_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/glue/glue_qqp_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/glue/glue_sst2_roberta.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/glue/glue_wnli_roberta.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/superglue/superglue_copa_roberta.json", "classifiers", ('TI',)): [LIST_ARGUMENTS_INFER_CHECK], ("classifiers/superglue/superglue_boolq_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/superglue/superglue_record_roberta.json", "classifiers", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK], ("classifiers/superglue/superglue_wic_bert.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/few_shot_roberta.json", "classifiers", ('IP',)): [ ('Dummy text', ['Dummy text Dummy text', 'Dummy class'], ('Dummy class',)) ] }, "distil": { ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/paraphraser_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/rusentiment_convers_distilrubert_2L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/rusentiment_convers_distilrubert_6L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_rus_convers_distilrubert_2L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_rus_convers_distilrubert_6L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_case_agnostic_mdistilbert.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK], ("squad/squad_ru_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK], ("squad/squad_ru_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK] }, "russian_super_glue": 
{ ("russian_super_glue/russian_superglue_lidirus_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_danetqa_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_terra_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_rcb_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_russe_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_rwsd_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_muserc_rubert.json", "russian_super_glue", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_parus_rubert.json", "russian_super_glue", ('IP',)): [LIST_ARGUMENTS_INFER_CHECK], ("russian_super_glue/russian_superglue_rucos_rubert.json", "russian_super_glue", ('IP',)): [RECORD_ARGUMENTS_INFER_CHECK] }, "multitask":{ ("multitask/multitask_example.json", "multitask", ALL_MODES): [ ('Dummy text',) + (('Dummy text', 'Dummy text'),) * 3 + ('Dummy text',) + (None,)], ("multitask/mt_glue.json", "multitask", ALL_MODES): [ ('Dummy text',) * 2 + (('Dummy text', 'Dummy text'),) * 6 + (None,)] }, "entity_extraction": { ("entity_extraction/entity_detection_en.json", "entity_extraction", ('IP',)): [ ("Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.", (['forrest gump', 'robert zemeckis', 'eric roth'], [(0, 12), (48, 63), (79, 88)], [[0, 1], [10, 11], [15, 16]], ['WORK_OF_ART', 'PERSON', 'PERSON'], [(0, 89)], ['Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.'], [0.8798, 0.9986, 0.9985])) ], ("entity_extraction/entity_detection_ru.json", "entity_extraction", ('IP',)): [ ("Москва — столица России, центр Центрального федерального 
округа и центр Московской области.", (['москва', 'россии', 'центрального федерального округа', 'московской области'], [(0, 6), (17, 23), (31, 63), (72, 90)], [[0], [3], [6, 7, 8], [11, 12]], ['CITY', 'COUNTRY', 'LOC', 'LOC'], [(0, 91)], ['Москва — столица России, центр Центрального федерального округа и центр Московской области.'], [0.8359, 0.938, 0.9917, 0.9803])) ], ("entity_extraction/entity_extraction_en.json", "entity_extraction", ('IP',)): [ ("Forrest Gump is a comedy-drama film directed by Robert Zemeckis and written by Eric Roth.", (['forrest gump', 'robert zemeckis', 'eric roth'], ['WORK_OF_ART', 'PERSON', 'PERSON'], [(0, 12), (48, 63), (79, 88)], [['Q134773', 'Q552213', 'Q12016774'], ['Q187364', 'Q36951156'], ['Q942932', 'Q89320386', 'Q89909683']], [[[1.1, 110, 1.0], [1.1, 13, 0.73], [1.1, 8, 0.04]], [[1.1, 73, 1.0], [0.5, 52, 0.29]], [[1.1, 37, 0.95], [1.1, 2, 0.35], [0.67, 2, 0.35]]], [['Forrest Gump', 'Forrest Gump (novel)', ''], ['Robert Zemeckis', 'Welcome to Marwen'], ['Eric Roth', '', '']], [['Forrest Gump', 'Forrest Gump', 'Forrest Gump'], ['Robert Zemeckis', 'Welcome to Marwen'], ['Eric Roth', 'Eric Roth', 'Eric W Roth']])) ], ("entity_extraction/entity_extraction_ru.json", "entity_extraction", ('IP',)): [ ("Москва — столица России, центр Центрального федерального округа и центр Московской области.", (['москва', 'россии', 'центрального федерального округа', 'московской области'], ['CITY', 'COUNTRY', 'LOC', 'LOC'], [(0, 6), (17, 23), (31, 63), (72, 90)], [['Q649', 'Q1023006', 'Q2380475'], ['Q159', 'Q2184', 'Q139319'], ['Q190778', 'Q4504288', 'Q27557290'], ['Q1697', 'Q4303932', 'Q24565285']], [[[1.1, 200, 1.0], [1.0, 20, 0.0], [1.0, 18, 0.0]], [[1.1, 200, 1.0], [1.0, 58, 1.0], [1.0, 29, 0.85]], [[1.1, 200, 1.0], [0.67, 3, 0.92], [0.67, 3, 0.89]], [[0.9, 200, 1.0], [0.9, 6, 0.83], [0.61, 8, 0.03]]], [['Москва', 'Москоу (Канзас)', 'Москоу (Теннесси)'], ['Россия', 'Российская Советская Федеративная Социалистическая Республика', 'Российская 
республика'], ['Центральный федеральный округ', 'Центральный округ (Краснодар)', ''], ['Московская область', 'Московская область (1917—1918)', 'Мостовский (Волгоградская область)']], [['Москва', 'Москоу', 'Москоу'], ['Россия', 'Российская Советская Федеративная Социалистическая Республика', 'Российская республика'], ['Центральный федеральный округ', 'Центральный округ (Краснодар)', 'Центральный округ (Братск)'], ['Московская область', 'Московская область', 'Мостовский']])) ] }, "ner": { ("ner/ner_bert_base.json", "ner_bert_base", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_conll2003_bert.json", "ner_conll2003_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_ontonotes_bert.json", "ner_ontonotes_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_ontonotes_bert_mult.json", "ner_ontonotes_bert_mult", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_rus_bert.json", "ner_rus_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_collection3_bert.json", "ner_collection3_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_conll2003_deberta_crf.json", "ner_conll2003_deberta_crf", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_ontonotes_deberta_crf.json", "ner_ontonotes_deberta_crf", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], }, "sentence_segmentation": { ("sentence_segmentation/sentseg_dailydialog_bert.json", "sentseg_dailydialog_bert", ('IP', 'TI')): [ (["hey", "alexa", "how", "are", "you"], None)] }, "kbqa": { ("kbqa/kbqa_cq_en.json", "kbqa", ('IP',)): [ ("What is the currency of Sweden?", ("Swedish krona", ["Q122922"], ["SELECT ?answer WHERE { wd:Q34 wdt:P38 ?answer. }"])), ("Where was Napoleon Bonaparte born?", ("Ajaccio", ["Q40104"], ["SELECT ?answer WHERE { wd:Q517 wdt:P19 ?answer. }"])), ("When did the Korean War end?", ("27 July 1953", ["+1953-07-27^^T"], ["SELECT ?answer WHERE { wd:Q8663 wdt:P582 ?answer. 
}"])), (" ", ("Not Found", [], [])) ], ("kbqa/kbqa_cq_ru.json", "kbqa", ('IP',)): [ ("Кто такой Оксимирон?", ("российский рэп-исполнитель", ['российский рэп-исполнитель"@ru'], ["SELECT ?answer WHERE { wd:Q4046107 wdt:P0 ?answer. }"])), ("Кто написал «Евгений Онегин»?", ("Александр Сергеевич Пушкин", ["Q7200"], ["SELECT ?answer WHERE { wd:Q50948 wdt:P50 ?answer. }"])), ("абв", ("Not Found", [], [])) ] }, "ranking": { ("ranking/ranking_ubuntu_v2_torch_bert_uncased.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, "doc_retrieval": { ("doc_retrieval/en_ranker_tfidf_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("doc_retrieval/ru_ranker_tfidf_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("doc_retrieval/en_ranker_pop_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, "squad": { ("squad/squad_ru_bert.json", "squad_ru_bert", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK], ("squad/squad_bert.json", "squad_bert", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK] }, "odqa": { ("odqa/en_odqa_infer_wiki.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("odqa/ru_odqa_infer_wiki.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("odqa/en_odqa_pop_infer_wiki.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] }, "morpho_tagger": { ("morpho_syntax_parser/morpho_ru_syntagrus_bert.json", "morpho_tagger_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK] }, "syntax_tagger": { ("morpho_syntax_parser/syntax_ru_syntagrus_bert.json", "syntax_ru_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("morpho_syntax_parser/ru_syntagrus_joint_parsing.json", "syntax_ru_bert", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] }, } MARKS = {"gpu_only": ["squad"], "slow": ["error_model", "squad"]} # marks defined in pytest.ini TEST_GRID = [] for model in PARAMS.keys(): for conf_file, model_dir, mode in PARAMS[model].keys(): marks = [] for mark in MARKS.keys(): if model in MARKS[mark]: marks.append(eval("pytest.mark." 
+ mark)) grid_unit = pytest.param(model, conf_file, model_dir, mode, marks=marks) TEST_GRID.append(grid_unit) def _override_with_test_values(item: Union[dict, list]) -> None: if isinstance(item, dict): keys = [k for k in item.keys() if k.startswith('pytest_')] for k in keys: item[k[len('pytest_'):]] = item.pop(k) item = item.values() for child in item: if isinstance(child, (dict, list)): _override_with_test_values(child) def download_config(config_path): src_file = src_dir / config_path if not src_file.is_file(): src_file = test_src_dir / config_path if not src_file.is_file(): raise RuntimeError('No config file {}'.format(config_path)) with src_file.open(encoding='utf8') as fin: config: dict = json.load(fin) # Download referenced config files config_references = get_all_elems_from_json(parse_config(config), 'config_path') for config_ref in config_references: splitted = config_ref.split("/") first_subdir_index = splitted.index("configs") + 1 m_name = config_ref.split('/')[first_subdir_index] config_ref = '/'.join(config_ref.split('/')[first_subdir_index:]) test_configs_path.joinpath(m_name).mkdir(exist_ok=True) if not test_configs_path.joinpath(config_ref).exists(): download_config(config_ref) # Update config for testing config.setdefault('train', {}).setdefault('pytest_epochs', 1) config['train'].setdefault('pytest_max_batches', 2) config['train'].setdefault('pytest_max_test_batches', 2) _override_with_test_values(config) config_path = test_configs_path / config_path config_path.parent.mkdir(exist_ok=True, parents=True) with config_path.open("w", encoding='utf8') as fout: json.dump(config, fout) def install_config(config_path): logfile = io.BytesIO(b'') p = pexpect.popen_spawn.PopenSpawn(sys.executable + " -m deeppavlov install " + str(config_path), timeout=None, logfile=logfile) p.readlines() if p.wait() != 0: raise RuntimeError('Installing process of {} returned non-zero exit code: \n{}' .format(config_path, logfile.getvalue().decode())) def setup_module(): 
def teardown_module():
    """Remove the directories this test module created.

    Deletes the parent of the generated test configs directory, the download
    directory and, when caching is enabled, the cache directory. Errors during
    removal are ignored.
    """
    leftovers = [test_configs_path.parent, download_path]
    if cache_dir:
        leftovers.append(cache_dir)
    for directory in leftovers:
        shutil.rmtree(str(directory), ignore_errors=True)


def _infer(config, inputs, download=False):
    """Build a model from ``config`` and run it on ``inputs``.

    Args:
        config: Config passed to ``build_model``.
        inputs: Sequence of positional argument batches for the model; when
            empty, the model is built but not called.
        download: Forwarded to ``build_model``.

    Returns:
        A list of model outputs. A model with a single output parameter has
        its result wrapped in a one-element list; an empty list is returned
        when there are no inputs.
    """
    chainer = build_model(config, download=download)
    if not inputs:
        return []
    prediction = chainer(*inputs)
    # Normalise to "list of outputs" regardless of the number of out params.
    if len(chainer.out_params) == 1:
        prediction = [prediction]
    return prediction
{'Accept': 'application/json'} logfile = io.BytesIO(b'') args = [sys.executable, "-m", "deeppavlov", "riseapi", str(config_path)] if api_port: args += ['-p', str(api_port)] p = pexpect.popen_spawn.PopenSpawn(' '.join(args), timeout=None, logfile=logfile) try: p.expect(url_base) get_url = urljoin(url_base.replace('http://0.0.0.0:', 'http://127.0.0.1:'), '/api') get_response = requests.get(get_url) response_code = get_response.status_code assert response_code == 200, f"GET /api request returned error code {response_code} with {config_path}" model_args_names = get_response.json()['in'] post_payload = dict(zip(model_args_names, inputs)) # TODO: remove this if from here and socket if 'docred' in str(config_path) or 'rured' in str(config_path): post_payload = {k: v[0] for k, v in post_payload.items()} post_response = requests.post(url, json=post_payload, headers=post_headers) response_code = post_response.status_code assert response_code == 200, f"POST request returned error code {response_code} with {config_path}" except pexpect.exceptions.EOF: raise RuntimeError('Got unexpected EOF: \n{}'.format(logfile.getvalue().decode())) finally: p.kill(signal.SIGTERM) p.wait() # if p.wait() != 0: # raise RuntimeError('Error in shutting down API server: \n{}'.format(logfile.getvalue().decode())) @staticmethod def infer_socket(config_path, socket_type): socket_params = get_server_params(config_path) model_args_names = socket_params['model_args_names'] host = socket_params['host'] host = host.replace('0.0.0.0', '127.0.0.1') port = api_port or socket_params['port'] socket_payload = {} for arg_name in model_args_names: arg_value = ' '.join(['qwerty'] * 10) socket_payload[arg_name] = [arg_value] if 'parus' in str(config_path): socket_payload = {k: [v] for k, v in socket_payload.items()} logfile = io.BytesIO(b'') args = [sys.executable, "-m", "deeppavlov", "risesocket", str(config_path), '--socket-type', socket_type] if socket_type == 'TCP': args += ['-p', str(port)] address_family = 
def _md5_of_stream(stream, chunk_size: int) -> str:
    """Return the hex MD5 digest of everything readable from a binary stream."""
    digest = md5()
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            break
        digest.update(chunk)
    return digest.hexdigest()


def gzip_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> str:
    """Compute the MD5 of the decompressed content of a gzip file.

    Args:
        fpath: Path to the ``.gz`` file.
        chunk_size: Number of decompressed bytes read per iteration.

    Returns:
        Hex digest of the decompressed data.
    """
    with gzip.open(fpath, 'rb') as f:
        return _md5_of_stream(f, chunk_size)


def zip_md5(fpath: Union[str, Path], chunk_size: int = 2 ** 16) -> Dict[str, str]:
    """Compute the MD5 of every file stored in a zip archive.

    Args:
        fpath: Path to the zip archive.
        chunk_size: Number of bytes read per iteration.

    Returns:
        Mapping from member name to hex digest; directory entries are skipped.
    """
    res = {}
    with ZipFile(fpath) as zip_f:
        for item in zip_f.infolist():
            if item.is_dir():
                continue
            with zip_f.open(item) as f:
                res[item.filename] = _md5_of_stream(f, chunk_size)
    return res


def compute_hashes(fpath: Union[str, Path]) -> Dict[str, str]:
    """Compute MD5 hashes for a file, looking inside known archive formats.

    Any path with a ``.tar`` suffix is hashed per member with ``tar_md5``,
    ``.gz`` files are hashed by decompressed content, ``.zip`` archives are
    hashed per member, and everything else is hashed as a plain file.

    Args:
        fpath: Path to the file; ``~`` is expanded.

    Returns:
        Mapping from file (or archive member) name to hex digest.

    Raises:
        RuntimeError: If ``fpath`` is not an existing file.
    """
    p = Path(fpath).expanduser()
    if not p.is_file():
        raise RuntimeError(f'{p} is not a file')

    last_suffix = p.suffix.lower()
    if '.tar' in {s.lower() for s in p.suffixes}:
        return tar_md5(p)
    if last_suffix == '.gz':
        # Report the hash under the name of the decompressed file.
        return {p.with_suffix('').name: gzip_md5(p)}
    if last_suffix == '.zip':
        return zip_md5(p)
    return {p.name: file_md5(p)}


def main(fname: str, outfile: Optional[str] = None) -> None:
    """Compute hashes for ``fname`` and write them in ``md5sum`` format.

    Args:
        fname: Path to the file to hash.
        outfile: Destination for the hashes. ``None`` writes to
            ``<fname>.md5`` next to the input, ``'-'`` writes to stdout, any
            other value is used as the output path.
    """
    p = Path(fname).expanduser()
    hashes = compute_hashes(p)

    if outfile is None:
        out = p.with_suffix(p.suffix + '.md5').open('w', encoding='utf-8')
    elif outfile == '-':
        out = sys.stdout
    else:
        out = Path(outfile).expanduser().open('w', encoding='utf-8')

    try:
        for member_name, member_hash in hashes.items():
            print(f'{member_hash} *{member_name}', file=out, flush=True)
    finally:
        # Close the file even if writing fails, but never close stdout.
        if out is not sys.stdout:
            out.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("fname", help="path to a file to compute hash for", type=str)
    parser.add_argument('-o', '--outfile', help='where to write the hashes', default=None, type=str)
    args = parser.parse_args()
    main(args.fname, args.outfile)
def merge_markdown(nb: "nbf.notebooknode.NotebookNode") -> None:
    """Merge every run of consecutive markdown cells into a single cell.

    The first cell of a run keeps its position and receives the sources of
    the whole run, each right-stripped and joined by a blank line; the other
    cells of the run are deleted. A markdown cell with no markdown neighbour
    is left untouched. The notebook is modified in place.

    Args:
        nb: Notebook whose ``"cells"`` list is rewritten.
    """
    cells = nb["cells"]
    runs = []
    start_idx = None
    for i, cell in enumerate(cells):
        if cell["cell_type"] == "markdown":
            if start_idx is None:
                start_idx = i
        elif start_idx is not None:
            if i - start_idx > 1:
                runs.append(slice(start_idx, i))
            start_idx = None
    # A run that reaches the end of the notebook is never closed by a
    # non-markdown cell, so it has to be recorded explicitly.
    if start_idx is not None and len(cells) - start_idx > 1:
        runs.append(slice(start_idx, len(cells)))

    # Process back to front so that deletions do not shift pending slices.
    for sl in reversed(runs):
        cells[sl.start]["source"] = "\n\n".join(c["source"].rstrip() for c in cells[sl])
        del cells[sl.start + 1: sl.stop]


def drop_metadata(nb: "nbf.notebooknode.NotebookNode") -> None:
    """Replace the notebook metadata and every cell's metadata with empty dicts.

    Args:
        nb: Notebook to clean; modified in place.
    """
    nb["metadata"] = dict()
    for cell in nb["cells"]:
        cell["metadata"] = dict()


def update_file(path: Path, update_ckpts: bool) -> None:
    """Optimizes ipynb files in order to reduce further git diffs.

    Args:
        path: File to update, if this is file. If this is dir - recursively searches and updates .ipynb files in it.
        update_ckpts: If False and path is dir, will skip all found ipynb files from .ipynb_checkpoints.

    """
    if path.is_dir():
        logging.info(f"Updating .ipynb files in {path} dir"
                     f"{', excluding files from .ipynb_checkpoints subdirs' if update_ckpts is False else ''}.")
        for f in path.rglob('*.ipynb'):
            if update_ckpts is False and '.ipynb_checkpoints' in f.parts:
                continue
            update_file(f, update_ckpts)
    else:
        logging.info(f"Updating {path}.")
        nb = nbf.read(path, nbf.NO_CONVERT)
        merge_markdown(nb)
        drop_metadata(nb)
        # Notebooks are UTF-8 JSON; do not depend on the platform default encoding.
        with open(path, "w", encoding="utf-8") as fout:
            nbf.write(nb, fout)


def main() -> None:
    """Parse command-line arguments and optimize the requested notebook(s)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("fname", help="path to an ipynb file to optimize", type=Path)
    parser.add_argument("--update-ckpts", help="update checkpoints in .ipynb_checkpoints subdirs",
                        action="store_true")
    args = parser.parse_args()
    update_file(args.fname.resolve(), args.update_ckpts)


if __name__ == "__main__":
    main()
import json import pkgutil from importlib import import_module, reload import deeppavlov from deeppavlov.core.common.metrics_registry import _registry_path as m_registry_path, _REGISTRY as M_REGISTRY from deeppavlov.core.common.registry import _registry_path as c_registry_path, _REGISTRY as C_REGISTRY if __name__ == '__main__': C_REGISTRY.clear() M_REGISTRY.clear() for _, pkg_name, _ in pkgutil.walk_packages(deeppavlov.__path__, deeppavlov.__name__ + '.'): if pkg_name not in ('deeppavlov.core.common.registry', 'deeppavlov.core.common.metrics_registry'): reload(import_module(pkg_name)) with c_registry_path.open('w', encoding='utf-8') as f: json.dump(dict(sorted(C_REGISTRY.items())), f, indent=2) with m_registry_path.open('w', encoding='utf-8') as f: json.dump(dict(sorted(M_REGISTRY.items())), f, indent=2) ================================================ FILE: utils/prepare/upload.py ================================================ # Copyright 2017 Neural Networks and Deep Learning lab, MIPT # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse
import os
import pathlib
import tarfile
from pathlib import Path
from typing import Optional

from deeppavlov.core.commands.utils import parse_config
from deeppavlov.core.common.file import find_config
from hashes import main


def upload(config_in_file: str, tar_name: Optional[str], tar_output_dir: Path) -> None:
    """Pack the model files referenced by a config into a ``.tar.gz`` archive and hash it.

    The directory to pack is read from ``metadata.variables.MODEL_PATH`` of the parsed
    config. After the archive is written, ``hashes.main`` is called with the archive path.

    Args:
        config_in_file: Path to the config file.
        tar_name: Archive name without the ``.tar.gz`` extension. If None, the config
            file stem is used.
        tar_output_dir: Existing directory the archive is written to.

    Raises:
        RuntimeError: If ``tar_output_dir`` or ``config_in_file`` does not exist, or if
            the target archive already exists.
    """
    if not tar_output_dir.exists():
        raise RuntimeError(f'A folder {tar_output_dir} does not exist')

    print(f'Config: {config_in_file}')
    if not Path(config_in_file).exists():
        raise RuntimeError(f'A config {config_in_file} does not exist')

    config_in = parse_config(config_in_file)
    # NOTE(review): find_config is expected to return a Path-like object, since its
    # ``stem`` and ``parent`` are used below — confirm against its definition.
    config_in_file = find_config(config_in_file)

    model_path = Path(config_in['metadata']['variables']['MODEL_PATH']).expanduser()
    model_name, class_name = config_in_file.stem, config_in_file.parent.name

    if tar_name is None:
        tar_name = f'{model_name}'
        print(f'tar_name set to {tar_name}')

    full_tar_name = tar_output_dir / f'{tar_name}.tar.gz'
    if full_tar_name.exists():
        raise RuntimeError(f'An archive {full_tar_name} already exists')

    print(f'model_path: {model_path}')
    print(f'class_name: {class_name}')
    print(f'model_name: {model_name}')
    print(f'Start tarring to {full_tar_name}')
    with tarfile.open(str(full_tar_name), "w|gz") as archive:
        # ``os.sep`` is taken from ``os`` directly: ``pathlib.os`` is not part of pathlib's
        # public API and is not guaranteed to exist.
        archive.add(model_path, arcname=os.sep)
    print("Stop tarring")

    print(f'Tar archive: {full_tar_name} has been created')

    print("Calculating hash")
    main(full_tar_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # The config path is mandatory: without it upload() cannot do anything useful, so let
    # argparse report the missing option instead of failing later on ``Path(None)``.
    parser.add_argument('-c', '--config_in', help='path to a config', required=True, type=str)
    parser.add_argument('-n', '--tar_name', help='name of the tar archive (without tar.gz extension)',
                        default=None, required=False, type=str)
    parser.add_argument('-o', '--tar_output_dir', help='dir to save a tar archive', default='./',
                        required=False, type=Path)
    args = parser.parse_args()
    upload(args.config_in, args.tar_name, args.tar_output_dir)