Showing preview only (2,270K chars total). Download the full file or copy to clipboard to get everything.
Repository: microsoft/nlp-recipes
Branch: master
Commit: 7db6d204e511
Files: 249
Total size: 2.1 MB
Directory structure:
gitextract_zstwuz6r/
├── .amlignore
├── .bumpversion.cfg
├── .flake8
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── feature_request.md
│ │ └── general-ask.md
│ ├── ISSUE_TEMPLATE.md
│ └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── DatasetReferences.md
├── LICENSE
├── MANIFEST.in
├── NOTICE.txt
├── README.md
├── SETUP.md
├── VERSIONING.md
├── _config.yml
├── cgmanifest.json
├── docker/
│ └── Dockerfile
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── _config.yml
│ └── source/
│ ├── azureml.rst
│ ├── conf.py
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── annotation/
│ │ ├── Doccano.md
│ │ └── README.md
│ ├── embeddings/
│ │ ├── README.md
│ │ └── embedding_trainer.ipynb
│ ├── entailment/
│ │ ├── README.md
│ │ ├── entailment_multinli_transformers.ipynb
│ │ └── entailment_xnli_bert_azureml.ipynb
│ ├── model_explainability/
│ │ ├── README.md
│ │ └── interpret_dnn_layers.ipynb
│ ├── named_entity_recognition/
│ │ ├── README.md
│ │ └── ner_wikigold_transformer.ipynb
│ ├── question_answering/
│ │ ├── README.md
│ │ ├── bert_run_squad_azureml.py
│ │ ├── bidaf_aml_deep_dive.ipynb
│ │ ├── bidaf_config.json
│ │ ├── pretrained-BERT-SQuAD-deep-dive-aml.ipynb
│ │ ├── question_answering_squad_transformers.ipynb
│ │ └── question_answering_system_bidaf_quickstart.ipynb
│ ├── sentence_similarity/
│ │ ├── README.md
│ │ ├── automl_local_deployment_aci.ipynb
│ │ ├── automl_with_pipelines_deployment_aks.ipynb
│ │ ├── baseline_deep_dive.ipynb
│ │ ├── bert_encoder.ipynb
│ │ ├── bert_senteval.ipynb
│ │ ├── gensen_aml_deep_dive.ipynb
│ │ ├── gensen_config.json
│ │ ├── gensen_local.ipynb
│ │ ├── gensen_train.py
│ │ └── gensen_wrapper.py
│ ├── sentiment_analysis/
│ │ └── absa/
│ │ ├── README.md
│ │ ├── absa.ipynb
│ │ ├── absa_azureml.ipynb
│ │ └── dataset/
│ │ └── data.md
│ ├── text_classification/
│ │ ├── README.md
│ │ ├── tc_bert_azureml.ipynb
│ │ ├── tc_mnli_mtdnn.ipynb
│ │ ├── tc_mnli_transformers.ipynb
│ │ └── tc_multi_languages_transformers.ipynb
│ └── text_summarization/
│ ├── abstractive_summarization_bertsum_cnndm_distributed_train.py
│ ├── abstractive_summarization_bertsumabs_cnndm.ipynb
│ ├── abstractive_summarization_minilm_cnndm.ipynb
│ ├── abstractive_summarization_unilm_cnndm.ipynb
│ ├── abstractive_summarization_unilm_cnndm.py
│ ├── extractive_summarization_cnndm_aml_distributed.ipynb
│ ├── extractive_summarization_cnndm_distributed_train.py
│ ├── extractive_summarization_cnndm_transformer.ipynb
│ └── summarization_evaluation.ipynb
├── pyproject.toml
├── setup.py
├── tests/
│ ├── README.md
│ ├── __init__.py
│ ├── ci/
│ │ ├── azureml_integration_tests.yml
│ │ ├── component_governance.yml
│ │ ├── cpu_integration_tests_linux.yml
│ │ ├── cpu_unit_tests_linux.yml
│ │ ├── gpu_integration_tests_linux.yml
│ │ ├── gpu_unit_tests_linux.yml
│ │ ├── notebooks_cpu_unit_tests_linux.yml
│ │ └── notebooks_gpu_unit_tests_linux.yml
│ ├── conftest.py
│ ├── integration/
│ │ ├── test_ddp_summarization.py
│ │ ├── test_gpu_utils.py
│ │ ├── test_notebooks_abstractive_summarization_bertsumabs.py
│ │ ├── test_notebooks_embeddings.py
│ │ ├── test_notebooks_entailment.py
│ │ ├── test_notebooks_extractive_summarization.py
│ │ ├── test_notebooks_interpretability.py
│ │ ├── test_notebooks_minilm_abstractive_summarization.py
│ │ ├── test_notebooks_named_entity_recognition.py
│ │ ├── test_notebooks_question_answering.py
│ │ ├── test_notebooks_sentence_similarity.py
│ │ ├── test_notebooks_text_classification.py
│ │ └── test_notebooks_unilm_abstractive_summarization.py
│ ├── notebooks_common.py
│ ├── smoke/
│ │ ├── test_dataset.py
│ │ ├── test_gpu_utils.py
│ │ └── test_word_embeddings.py
│ └── unit/
│ ├── test_abstractive_summarization_bertsum.py
│ ├── test_abstractive_summarization_seq2seq.py
│ ├── test_bert_common.py
│ ├── test_bert_encoder.py
│ ├── test_bert_sentence_encoding.py
│ ├── test_common_pytorch_utils.py
│ ├── test_data_loaders.py
│ ├── test_dataset.py
│ ├── test_dataset_pytorch.py
│ ├── test_distributed_sampler.py
│ ├── test_eval_classification.py
│ ├── test_eval_compute_rouge.py
│ ├── test_extractive_summarization.py
│ ├── test_gensen_utils.py
│ ├── test_interpreter.py
│ ├── test_models_transformers_question_answering.py
│ ├── test_notebooks_cpu.py
│ ├── test_notebooks_gpu.py
│ ├── test_preprocess.py
│ ├── test_timer.py
│ ├── test_transformers_sequence_classification.py
│ └── test_transformers_token_classification.py
├── tools/
│ ├── README.md
│ ├── __init__.py
│ ├── generate_conda_file.py
│ ├── generate_requirements_txt.py
│ └── remove_pixelserver.py
└── utils_nlp/
├── README.md
├── __init__.py
├── azureml/
│ ├── README.md
│ ├── __init__.py
│ ├── azureml_bert_util.py
│ └── azureml_utils.py
├── common/
│ ├── README.md
│ ├── __init__.py
│ ├── pytorch_utils.py
│ └── timer.py
├── dataset/
│ ├── README.md
│ ├── __init__.py
│ ├── bbc_hindi.py
│ ├── cnndm.py
│ ├── dac.py
│ ├── data_loaders.py
│ ├── msrpc.py
│ ├── multinli.py
│ ├── ner_utils.py
│ ├── preprocess.py
│ ├── sentence_selection.py
│ ├── snli.py
│ ├── squad.py
│ ├── stsbenchmark.py
│ ├── url_utils.py
│ ├── wikigold.py
│ ├── xnli.py
│ └── xnli_torch_dataset.py
├── eval/
│ ├── README.md
│ ├── SentEval/
│ │ ├── .gitignore
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── senteval/
│ │ │ ├── __init__.py
│ │ │ ├── binary.py
│ │ │ ├── engine.py
│ │ │ ├── mrpc.py
│ │ │ ├── probing.py
│ │ │ ├── rank.py
│ │ │ ├── sick.py
│ │ │ ├── snli.py
│ │ │ ├── sst.py
│ │ │ ├── sts.py
│ │ │ ├── tools/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── classifier.py
│ │ │ │ ├── ranking.py
│ │ │ │ ├── relatedness.py
│ │ │ │ └── validation.py
│ │ │ ├── trec.py
│ │ │ └── utils.py
│ │ └── setup.py
│ ├── __init__.py
│ ├── classification.py
│ ├── evaluate_squad.py
│ ├── evaluate_summarization.py
│ ├── question_answering.py
│ ├── rouge/
│ │ ├── compute_rouge.py
│ │ └── rouge_ext.py
│ └── senteval.py
├── interpreter/
│ ├── Interpreter.py
│ ├── README.md
│ └── __init__.py
├── language_utils/
│ └── hi/
│ └── hindi_stemmer.py
└── models/
├── README.md
├── bert/
│ ├── README.md
│ ├── __init__.py
│ ├── common.py
│ ├── sequence_classification.py
│ ├── sequence_classification_distributed.py
│ ├── sequence_encoding.py
│ └── token_classification.py
├── gensen/
│ ├── README.md
│ ├── __init__.py
│ ├── create_gensen_model.py
│ ├── gensen.py
│ ├── multi_task_model.py
│ ├── preprocess_utils.py
│ └── utils.py
├── glove/
│ ├── Makefile
│ ├── README.md
│ ├── demo.sh
│ └── src/
│ ├── README.md
│ ├── cooccur.c
│ ├── glove.c
│ ├── shuffle.c
│ └── vocab_count.c
├── pretrained_embeddings/
│ ├── README.md
│ ├── __init__.py
│ ├── fasttext.py
│ ├── glove.py
│ └── word2vec.py
├── pytorch_modules/
│ ├── README.md
│ ├── __init__.py
│ └── conditional_gru.py
├── transformers/
│ ├── abstractive_summarization_bertsum.py
│ ├── abstractive_summarization_seq2seq.py
│ ├── bertsum/
│ │ ├── __init__.py
│ │ ├── adam.py
│ │ ├── beam.py
│ │ ├── data_loader.py
│ │ ├── dataset.py
│ │ ├── decoder.py
│ │ ├── encoder.py
│ │ ├── loss.py
│ │ ├── model_builder.py
│ │ ├── neural.py
│ │ ├── optimizers.py
│ │ ├── penalties.py
│ │ └── predictor.py
│ ├── common.py
│ ├── datasets.py
│ ├── extractive_summarization.py
│ ├── named_entity_recognition.py
│ ├── question_answering.py
│ └── sequence_classification.py
└── xlnet/
├── README.md
├── common.py
└── sequence_classification.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .amlignore
================================================
data/
examples/
================================================
FILE: .bumpversion.cfg
================================================
[bumpversion]
current_version = 1.0.0
commit = True
tag = True
message = "Bump version: {current_version} -> {new_version}"
[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'
[bumpversion:file:utils_nlp/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'
================================================
FILE: .flake8
================================================
[flake8]
# Initial set of rules
# Feel Free to add any new rule here with description of what it does.
# E203 Whitespace before ':'
# E266 Too many leading '#' for block comment
# E501 Line too long (82 > 79 characters)
# W503 Line break occurred before a binary operator
# F403 'from module import *' used; unable to detect undefined names
# F405 '<function>' may be undefined, or defined from star imports
# E402 module level import not at top of file
# E731 do not assign a lambda expression, use a def
# F821 undefined name 'get_ipython' --> from generated python files using nbconvert
# E722: do not use bare except
# E231: missing white space after "," --> black generates autoformat [,] which fails flake8
ignore = E203, E266, W503, F403, F405, E402, E731, F821, E722, E231
max-line-length = 88
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: "[BUG] "
labels: 'bug'
assignees: ''
---
### Description
<!--- Describe your bug in detail -->
### How do we replicate the bug?
<!--- Please be specific as possible (use a list if needed). -->
<!--- For example: -->
<!--- * Create a conda environment for gpu -->
<!--- * Run unit test `test_timer.py` -->
<!--- * ... -->
### Expected behavior (i.e. solution)
<!--- For example: -->
<!--- * The tests for the timer should pass successfully. -->
### Other Comments
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: "[FEATURE] "
labels: 'enhancement'
assignees: ''
---
### Description
<!--- Describe your expected feature in detail -->
### Expected behavior with the suggested feature
<!--- For example: -->
<!--- *Adding algorithm xxx will help people understand more about xxx use case scenarios. -->
### Other Comments
================================================
FILE: .github/ISSUE_TEMPLATE/general-ask.md
================================================
---
name: General ask
about: Technical/non-technical asks about the repo
title: "[ASK] "
labels: ''
assignees: ''
---
### Description
<!--- Describe your general ask in detail -->
### Other Comments
================================================
FILE: .github/ISSUE_TEMPLATE.md
================================================
### Description
<!--- Describe your issue/bug/request in detail -->
### In which platform does it happen?
<!--- Describe the platform where the issue is happening (use a list if needed) -->
<!--- For example: -->
<!--- * Azure Ubuntu Data Science Virtual Machine. -->
<!--- * Other platforms. -->
### How do we replicate the issue?
<!--- Please be specific as possible (use a list if needed). -->
<!--- For example: -->
<!--- * Create a conda environment for gpu -->
<!--- * Run unit test `test_timer.py` -->
<!--- * ... -->
### Expected behavior (i.e. solution)
<!--- For example: -->
<!--- * The tests for the timer should pass successfully. -->
### Other Comments
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
### Description
<!--- Describe your changes in detail -->
<!--- Why is this change required? What problem does it solve? -->
### Related Issues
<!--- If it fixes an open issue, please link to the issue here. -->
### Checklist:
<!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
<!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->
- [ ] My code follows the code style of this project, as detailed in our [contribution guidelines](https://github.com/microsoft/nlp-recipes/blob/master/CONTRIBUTING.md).
- [ ] I have added tests.
- [ ] I have updated the documentation accordingly.
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
pip-wheel-metadata/
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
##########################
.DS_Store
.~*
Untitled*.ipynb
*-Copy*.ipynb
~$*
output.ipynb
.idea/
*.npz
*.data
*.dat
*.csv
*.tsv
*.zip
.vscode/
tools/repo_metrics/config.py
*.jar
*.item
*.pkl
nlp_*.yaml
nohup.out
temp/
tmp/
logs/
score.py
# Data
data/
squad/
bidaf-question-answering/
*/question_answering/bidaf.tar.gz
*/question_answering/bidafenv.yml
*/question_answering/config.json
*/question_answering/vocabulary/
*/question_answering/weights.th
# AML Config
aml_config/
.azureml/
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/ambv/black
rev: stable
hooks:
- id: black
language_version: python3.6
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
hooks:
- id: flake8
================================================
FILE: CONTRIBUTING.md
================================================
# Contribution Guidelines
Contributions are welcome! Here are a few things to know:
- [Contribution Guidelines](#contribution-guidelines)
- [Microsoft Contributor License Agreement](#microsoft-contributor-license-agreement)
- [Steps to Contributing](#steps-to-contributing)
- [Coding Guidelines](#coding-guidelines)
- [Code of Conduct](#code-of-conduct)
- [Do not point fingers](#do-not-point-fingers)
- [Provide code feedback based on evidence](#provide-code-feedback-based-on-evidence)
- [Ask questions do not give answers](#ask-questions-do-not-give-answers)
## Microsoft Contributor License Agreement
Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
## Steps to Contributing
Here are the basic steps to get started with your first contribution. Please reach out with any questions.
1. Use [open issues](https://github.com/Microsoft/Recommenders/issues) to discuss the proposed changes. Create an issue describing changes if necessary to collect feedback. Also, please use provided labels to tag issues so everyone can easily sort issues of interest.
2. [Fork the repo](https://help.github.com/articles/fork-a-repo/) so you can make and test local changes.
3. Create a new branch for the issue. We suggest prefixing the branch with your username and then a descriptive title: (e.g. gramhagen/update_contributing_docs)
4. Create a test that replicates the issue.
5. Make code changes.
6. Ensure unit tests pass and code style / formatting is consistent (see [wiki](https://github.com/Microsoft/Recommenders/wiki/Coding-Guidelines#python-and-docstrings-style) for more details).
7. We use [pre-commit](https://pre-commit.com/) package to run our pre-commit hooks. We use black formatter and flake8 linting on each commit. In order to set up pre-commit on your machine, follow the steps here, please note that you only need to run these steps the first time you use pre-commit for this project.
* Update your conda environment, pre-commit is part of the yaml file or just do
```
$ pip install pre-commit
```
* Set up pre-commit by running following command, this will put pre-commit under your .git/hooks directory.
```
$ pre-commit install
```
```
$ git commit -m "message"
```
* Each time you commit, git will run the pre-commit hooks (black and flake8 for now) on any python files that are getting committed and are part of the git index. If black modifies/formats the file, or if flake8 finds any linting errors, the commit will not succeed. You will need to stage the file again if black changed the file, or fix the issues identified by flake8 and stage it again.
* To run pre-commit on all files just run
```
$ pre-commit run --all-files
```
8. Create a pull request against <b>staging</b> branch.
Note: We use the staging branch to land all new features, so please remember to create the Pull Request against staging.
Once the features included in a milestone are complete we will merge staging into master and make a release. See the wiki for more detail about our [merge strategy](https://github.com/Microsoft/Recommenders/wiki/Strategy-to-merge-the-code-to-master-branch).
## Coding Guidelines
We strive to maintain high quality code to make the utilities in the repository easy to understand, use, and extend. We also work hard to maintain a friendly and constructive environment. We've found that having clear expectations on the development process and consistent style helps to ensure everyone can contribute and collaborate effectively.
Please review the [coding guidelines](https://github.com/Microsoft/Recommenders/wiki/Coding-Guidelines) wiki page to see more details about the expectations for development approach and style.
## Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
Apart from the official Code of Conduct developed by Microsoft, in the Recommenders team we adopt the following behaviors, to ensure a great working environment:
#### Do not point fingers
Let’s be constructive.
<details>
<summary><em>Click here to see some examples</em></summary>
"This method is missing docstrings" instead of "YOU forgot to put docstrings".
</details>
#### Provide code feedback based on evidence
When making code reviews, try to support your ideas based on evidence (papers, library documentation, stackoverflow, etc) rather than your personal preferences.
<details>
<summary><em>Click here to see some examples</em></summary>
"When reviewing this code, I saw that the Python implementation the metrics are based on classes, however, [scikit-learn](https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics) and [tensorflow](https://www.tensorflow.org/api_docs/python/tf/metrics) use functions. We should follow the standard in the industry."
</details>
#### Ask questions do not give answers
Try to be empathic.
<details>
<summary><em>Click here to see some examples</em></summary>
* Would it make more sense if ...?
* Have you considered this ... ?
</details>
================================================
FILE: DatasetReferences.md
================================================
MICROSOFT PROVIDES THE DATASETS ON AN "AS IS" BASIS. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THE DATASETS. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THE DATASETS.
The datasets are provided under the original terms that Microsoft received such datasets. See below for more information about each dataset.
### <a name="cnndm"></a> CNN/Daily Mail (CNN/DM) Dataset
The training and evaluation data for the CNN/DM dataset is available at https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz and released under MIT License. This is a processed version of data that's originally released by Hermann et al. (2015) in ["Teaching machines to read and comprehend"](https://arxiv.org/abs/1506.03340) and then made available by Kyunghyun Cho at https://cs.nyu.edu/~kcho/DMQA/.
### Preprocessed CNN/Daily Mail (CNN/DM) Dataset by BERTSUM
The preprocessed dataset of [CNN/DM dataset](#cnndm), originally published by BERTSUM paper ["Fine-tune BERT for Extractive Summarization"](https://arxiv.org/pdf/1903.10318.pdf), can be found at https://github.com/nlpyang/BertSum and released under Apache License 2.0.
### Microsoft Research Paraphrase Corpus
>Original source: https://www.microsoft.com/en-us/download/details.aspx?id=52398
### The Multi-Genre NLI Corpus (MultiNLI)
>The majority of the corpus is released under the [OANC](https://www.anc.org/OANC/license.txt)’s license, The data in the FICTION section falls under several permissive licenses. See the [data description paper](https://www.nyu.edu/projects/bowman/multinli/paper.pdf) for details.
Redistributing the datasets "MultiNLI 1.0.zip", "MultiNLI Matched.zip", and "MultiNLI Mismatched.zip" with attribution:
Adina Williams, Nikita Nangia, Samuel R. Bowman. 2018. A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers).
Original source: https://www.nyu.edu/projects/bowman/multinli/
### The Stanford Natural Language Inference (SNLI) Corpus
>This dataset is provided under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
Redistributing the dataset "snli_1.0.zip" with attribution:
Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large annotated corpus for learning natural language inference. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (EMNLP).
Original source: https://nlp.stanford.edu/projects/snli/
The dataset is preprocessed to remove unused columns and badly formatted rows.
### Wikigold dataset
>This dataset is provided under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.ast).
Redistributing the dataset "wikigold.conll.txt" with attribution:
Balasuriya, Dominic, et al. "Named entity recognition in wikipedia."
Proceedings of the 2009 Workshop on The People's Web Meets NLP: Collaboratively Constructed Semantic Resources. Association for Computational Linguistics, 2009.
Original source: https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data
The dataset is preprocessed to fit data format requirement of BERT.
### The Cross-Lingual NLI Corpus (XNLI)
>The majority of the corpus sentences are released under the [OANC](https://www.anc.org/OANC/license.txt)’s license. The data in the Fiction genre from Captain Blood are under [The_Project_Gutenberg_License](http://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License). See details in the [XNLI paper](https://arxiv.org/pdf/1809.05053.pdf).
Redistributing the datasets "XNLI 1.0.zip" and "XNLI-MT 1.0.zip" with attribution:
Alexis Conneau, Guillaume Lample, Ruty Rinott, Holger Schwenk, Ves Stoyanov. 2018. XNLI: Evaluating Cross-lingual Sentence Representations. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing.
Original source: https://www.nyu.edu/projects/bowman/xnli/
The dataset is preprocessed to remove unused columns.
### The Stanford Question Answering Dataset (SQuAD)
>This dataset is provided under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode).
Redistributing the datasets "train-v1.1.json" and "dev-v1.1.json" with attribution:
Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. SQuAD: 100,000+ Questions for Machine Comprehension of Text. Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP).
Original source: https://github.com/rajpurkar/SQuAD-explorer
### The STSbenchmark dataset
>Redistributing the dataset "Stsbenchmark.tar.gz" with attribution:
Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia
Specia. Semeval-2017 Task 1: Semantic Textual Similarity
Multilingual and Crosslingual Focused Evaluation. Proceedings of
SemEval 2017.
Original source: http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark
The dataset is preprocessed to remove unused columns.
>The scores are released under [Commons Attribution - Share Alike 4.0
International License](http://creativecommons.org/licenses/by-sa/4.0/)
> The text of each dataset has a license of its own, as follows:
>- MSR-Paraphrase, Microsoft Research Paraphrase Corpus. In order to use
MSRpar, researchers need to agree with the license terms from
Microsoft Research:
http://research.microsoft.com/en-us/downloads/607d14d9-20cd-47e3-85bc-a2f65cd28042/
>- headlines: Mined from several news sources by European Media Monitor
(Best et al. 2005). using the RSS feed. European Media Monitor (EMM)
Real Time News Clusters are the top news stories for the last 4
hours, updated every ten minutes. The article clustering is fully
automatic. The selection and placement of stories are determined
automatically by a computer program. This site is a joint project of
DG-JRC and DG-COMM. The information on this site is subject to a
disclaimer (see
http://europa.eu/geninfo/legal_notices_en.htm). Please acknowledge
EMM when (re)using this material.
http://emm.newsbrief.eu/rss?type=rtn&language=en&duplicates=false
>- deft-news: A subset of news article data in the DEFT
project.
>- MSR-Video, Microsoft Research Video Description Corpus. In order to
use MSRvideo, researchers need to agree with the license terms from
Microsoft Research:
http://research.microsoft.com/en-us/downloads/38cf15fd-b8df-477e-a4e4-a4680caa75af/
>- image: The Image Descriptions data set is a subset of
the PASCAL VOC-2008 data set (Rashtchian et al., 2010). The PASCAL
VOC-2008 data set consists of 1,000 images and has been used by a
number of image description systems. The image captions of the data
set are released under a CreativeCommons Attribution-ShareAlike
license, the descriptions themselves are free.
>- track5.en-en: This text is a subset of the Stanford Natural
Language Inference (SNLI) corpus, by The Stanford NLP Group is
licensed under a Creative Commons Attribution-ShareAlike 4.0
International License. Based on a work at
http://shannon.cs.illinois.edu/DenotationGraph/.
https://creativecommons.org/licenses/by-sa/4.0/
>- answers-answers: user content from stack-exchange. Check the license
below in ======ANSWERS-ANSWERS======
>- answers-forums: user content from stack-exchange. Check the license
below in ======ANSWERS-FORUMS======
>======ANSWERS-ANSWERS======
>Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)
http://creativecommons.org/licenses/by-sa/3.0/
>Attribution Requirements:
> "* Visually display or otherwise indicate the source of the content
as coming from the Stack Exchange Network. This requirement is
satisfied with a discreet text blurb, or some other unobtrusive but
clear visual indication.
> * Ensure that any Internet use of the content includes a hyperlink
directly to the original question on the source site on the Network
(e.g., http://stackoverflow.com/questions/12345)
> * Visually display or otherwise clearly indicate the author names for
every question and answer used
> * Ensure that any Internet use of the content includes a hyperlink for
each author name directly back to his or her user profile page on the
source site on the Network (e.g.,
http://stackoverflow.com/users/12345/username), directly to the Stack
Exchange domain, in standard HTML (i.e. not through a Tinyurl or other
such indirect hyperlink, form of obfuscation or redirection), without
any “nofollow” command or any other such means of avoiding detection by
search engines, and visible even with JavaScript disabled."
> (https://archive.org/details/stackexchange)
>======ANSWERS-FORUMS======
>Stack Exchange Inc. generously made the data used to construct the STS 2015 answer-answer statement pairs available under a Creative Commons Attribution-ShareAlike (cc-by-sa) 3.0 license.
>The license is reproduced below from: https://archive.org/details/stackexchange
>The STS.input.answers-forums.txt file should be redistributed with this LICENSE text and the accompanying files in LICENSE.answers-forums.zip. The tsv files in the zip file contain the additional information that's needed to comply with the license.
>--
>All user content contributed to the Stack Exchange network is cc-by-sa 3.0 licensed, intended to be shared and remixed. We even provide all our data as a convenient data dump.
>http://creativecommons.org/licenses/by-sa/3.0/
>But our cc-by-sa 3.0 licensing, while intentionally permissive, does *require attribution*:
>"Attribution — You must attribute the work in the manner specified by the author or licensor (but not in any way that suggests that they endorse you or your use of the work)."
>Specifically the attribution requirements are as follows:
> 1. Visually display or otherwise indicate the source of the content as coming from the Stack Exchange Network. This requirement is satisfied with a discreet text blurb, or some other unobtrusive but clear visual indication.
> 2. Ensure that any Internet use of the content includes a hyperlink directly to the original question on the source site on the Network (e.g., http://stackoverflow.com/questions/12345)
> 3. Visually display or otherwise clearly indicate the author names for every question and answer so used.
> 4. Ensure that any Internet use of the content includes a hyperlink for each author name directly back to his or her user profile page on the source site on the Network (e.g., http://stackoverflow.com/users/12345/username), directly to the Stack Exchange domain, in standard HTML (i.e. not through a Tinyurl or other such indirect hyperlink, form of obfuscation or redirection), without any “nofollow” command or any other such means of avoiding detection by search engines, and visible even with JavaScript disabled.
>Our goal is to maintain the spirit of fair attribution. That means attribution to the website, and more importantly, to the individuals who so generously contributed their time to create that content in the first place!
>For more information, see the Stack Exchange Terms of Service: http://stackexchange.com/legal/terms-of-service
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
graft utils_nlp
global-exclude *.py[cod] __pycache__ *.so *.dylib
exclude README.md
exclude SETUP.md
exclude CONTRIBUTING.md
================================================
FILE: NOTICE.txt
================================================
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at https://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
This software requires you to access or provide third party code that may contain restrictions on how such third party code can be used. You are solely responsible for reviewing any licenses applicable to such code and determining whether your use is permitted by such license.
--
https://github.com/huggingface/transformers
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--
https://github.com/stanfordnlp/glove
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
BSD License
For SentEval software
Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
https://github.com/allenai/bi-att-flow
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--
https://github.com/nlpyang/BertSum
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--
For spaCy
The MIT License (MIT)
Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
================================================
FILE: README.md
================================================
<img src="NLP-Logo.png" align="right" alt="" width="300"/>
# NLP Best Practices
In recent years, natural language processing (NLP) has seen quick growth in quality and usability, and this has helped to drive business adoption of artificial intelligence (AI) solutions. In the last few years, researchers have been applying newer deep learning methods to NLP. Data scientists started moving from traditional methods to state-of-the-art (SOTA) deep neural network (DNN) algorithms which use language models pretrained on large text corpora.
This repository contains examples and best practices for building NLP systems, provided as [Jupyter notebooks](examples) and [utility functions](utils_nlp). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language.
## Overview
The goal of this repository is to build a comprehensive set of tools and examples that leverage recent advances in NLP algorithms, neural architectures, and distributed machine learning systems.
The content is based on our past and potential future engagements with customers as well as collaboration with partners, researchers, and the open source community.
We hope that the tools can significantly reduce the “time to market” by simplifying the experience from defining the business problem to development of the solution by orders of magnitude. In addition, the example notebooks serve as guidelines and showcase best practices and usage of the tools in a wide variety of languages.
In an era of transfer learning, transformers, and deep architectures, we believe that pretrained models provide a unified solution to many real-world problems and allow handling different tasks and languages easily. We will, therefore, prioritize such models, as they achieve state-of-the-art results on several NLP benchmarks like [*GLUE*](https://gluebenchmark.com/leaderboard) and [*SQuAD*](https://rajpurkar.github.io/SQuAD-explorer/) leaderboards. The models can be used in a number of applications ranging from simple text classification to sophisticated intelligent chat bots.
Note that for certain kinds of NLP problems, you may not need to build your own models. Instead, pre-built or easily customizable solutions exist which do not require any custom coding or machine learning expertise. We strongly recommend evaluating if these can sufficiently solve your problem. If these solutions are not applicable, or the accuracy of these solutions is not sufficient, then resorting to more complex and time-consuming custom approaches may be necessary. The following cognitive services offer simple solutions to address common NLP tasks:
<br><br><b>[Text Analytics](https://azure.microsoft.com/en-us/services/cognitive-services/text-analytics/)</b> is a set of pre-trained REST APIs which can be called for Sentiment Analysis, Key phrase extraction, Language detection, Named Entity Detection and more. These APIs work out of the box and require minimal expertise in machine learning, but have limited customization capabilities.
<br><br><b>[QnA Maker](https://azure.microsoft.com/en-us/services/cognitive-services/qna-maker/) </b>is a cloud-based API service that lets you create a conversational question-and-answer layer over your existing data. Use it to build a knowledge base by extracting questions and answers from your semi-structured content, including FAQs, manuals, and documents.
<br><br><b>[Language Understanding](https://azure.microsoft.com/en-us/services/cognitive-services/language-understanding-intelligent-service/)</b> is a SaaS service to train and deploy a model as a REST API given a user-provided training set. You could do Intent Classification as well as Named Entity Extraction by performing simple steps of providing example utterances and labelling them. It supports Active Learning, so your model always keeps learning and improving.
## Target Audience
For this repository our target audience includes data scientists and machine learning engineers with varying levels of NLP knowledge as our content is source-only and targets custom machine learning modelling. The utilities and examples provided are intended to be solution accelerators for real-world NLP problems.
## Focus Areas
The repository aims to expand NLP capabilities along three separate dimensions
### Scenarios
We aim to have end-to-end examples of common tasks and scenarios such as text classification, named entity recognition etc.
### Algorithms
We aim to support multiple models for each of the supported scenarios. Currently, transformer-based models are supported across most scenarios. We have been working on integrating the [transformers package](https://github.com/huggingface/transformers) from [Hugging Face](https://huggingface.co/) which allows users to easily load pretrained models and fine-tune them for different tasks.
### Languages
We strongly subscribe to the multi-language principles laid down by [Emily Bender](http://faculty.washington.edu/ebender/papers/Bender-SDSS-2019.pdf):
* "Natural language is not a synonym for English"
* "English isn't generic for language, despite what NLP papers might lead you to believe"
* "Always name the language you are working on" ([Bender rule](https://www.aclweb.org/anthology/Q18-1041/))
The repository aims to support non-English languages across all the scenarios. Pre-trained models used in the repository, such as BERT and FastText, support 100+ languages out of the box. Our goal is to provide end-to-end examples in as many languages as possible. We encourage community contributions in this area.
## Content
The following is a summary of the commonly used NLP scenarios covered in the repository. Each scenario is demonstrated in one or more [Jupyter notebook examples](examples) that make use of the core code base of models and repository utilities.
| Scenario | Models | Description|Languages|
|-------------------------| ------------------- |-------|---|
|Text Classification |BERT, DistilBERT, XLNet, RoBERTa, ALBERT, XLM| Text classification is a supervised learning method of learning and predicting the category or the class of a document given its text content. |English, Chinese, Hindi, Arabic, German, French, Japanese, Spanish, Dutch|
|Named Entity Recognition |BERT| Named entity recognition (NER) is the task of classifying words or key phrases of a text into predefined entities of interest. |English|
|Text Summarization|BERTSumExt <br> BERTSumAbs <br> UniLM (s2s-ft) <br> MiniLM |Text summarization is a language generation task of summarizing the input text into a shorter paragraph of text.|English|
|Entailment |BERT, XLNet, RoBERTa| Textual entailment is the task of classifying the binary relation between two natural-language texts, *text* and *hypothesis*, to determine if the *text* agrees with the *hypothesis* or not. |English|
|Question Answering |BiDAF, BERT, XLNet| Question answering (QA) is the task of retrieving or generating a valid answer for a given query in natural language, provided with a passage related to the query. |English|
|Sentence Similarity |BERT, GenSen| Sentence similarity is the process of computing a similarity score given a pair of text documents. |English|
|Embeddings| Word2Vec<br>fastText<br>GloVe| Embedding is the process of converting a word or a piece of text to a vector in a continuous space of real numbers, usually of low dimension.|English|
|Sentiment Analysis| Dependency Parser <br>GloVe| Provides an example of how to train and use Aspect Based Sentiment Analysis with Azure ML and [Intel NLP Architect](http://nlp_architect.nervanasys.com/absa.html).|English|
## Getting Started
While solving NLP problems, it is always good to start with the prebuilt [Cognitive Services](https://azure.microsoft.com/en-us/services/cognitive-services/directory/lang/). When the needs are beyond the bounds of the prebuilt cognitive service and when you want to search for custom machine learning methods, you will find this repository very useful. To get started, navigate to the [Setup Guide](SETUP.md), which lists instructions on how to set up your environment and dependencies.
## Azure Machine Learning Service
[Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/) is a cloud service used to train, deploy, automate, and manage machine learning models, all at the broad scale that the cloud provides. AzureML is presented in notebooks across different scenarios to enhance the efficiency of developing Natural Language systems at scale and for various AI model development related tasks like:
* [**Accessing Datastores**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data) to easily read and write your data in Azure storage services such as blob storage or file share.
* Scaling up and out on [**Azure Machine Learning Compute**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute).
* [**Automated Machine Learning**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train) which builds high quality machine learning models by automating model and hyperparameter selection. AutoML explores BERT, BiLSTM, bag-of-words, and word embeddings on the user's dataset to handle text columns.
* [**Tracking experiments and monitoring metrics**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-track-experiments) to enhance the model creation process.
* [**Distributed Training**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-ml-models#distributed-training-and-custom-docker-images)
* [**Hyperparameter tuning**](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters)
* Deploying the trained machine learning model as a web service to [**Azure Container Instance**](https://azure.microsoft.com/en-us/services/container-instances/) for development and test, or for low scale, CPU-based workloads.
* Deploying the trained machine learning model as a web service to [**Azure Kubernetes Service**](https://azure.microsoft.com/en-us/services/kubernetes-service/) for high-scale production deployments, which provides autoscaling and fast response times.
To successfully run these notebooks, you will need an [**Azure subscription**](https://azure.microsoft.com/en-us/) or can [**try Azure for free**](https://azure.microsoft.com/en-us/free/). There may be other Azure services or products used in the notebooks. Introduction and/or reference of those will be provided in the notebooks themselves.
## Contributing
We hope that the open source community will contribute to the content and bring in the latest SOTA algorithms. This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md).
## Blog Posts
- [Bootstrap Your Text Summarization Solution with the Latest Release from NLP-Recipes](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/bootstrap-your-text-summarization-solution-with-the-latest/ba-p/1268809)
- [Text Annotation made easy with Doccano](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/text-annotation-made-easy-with-doccano/ba-p/1242612)
- [Jumpstart Analyzing your Hindi Text Data using the NLP Repository](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/jumpstart-analyzing-your-hindi-text-data-using-the-nlp/ba-p/1087851)
- [Speeding up the Development of Natural Language Processing Solutions with Azure Machine Learning](https://techcommunity.microsoft.com/t5/ai-customer-engineering-team/speeding-up-the-development-of-natural-language-processing/ba-p/1042577)
## References
The following is a list of related repositories that we like and think are useful for NLP tasks.
|Repository|Description|
|---|---|
|[Transformers](https://github.com/huggingface/transformers)|A great PyTorch library from Hugging Face with implementations of popular transformer-based models. We've been using their package extensively in this repo and greatly appreciate their effort.|
|[Azure Machine Learning Notebooks](https://github.com/Azure/MachineLearningNotebooks/)|ML and deep learning examples with Azure Machine Learning.|
|[AzureML-BERT](https://github.com/Microsoft/AzureML-BERT)|End-to-end recipes for pre-training and fine-tuning BERT using Azure Machine Learning service.|
|[MASS](https://github.com/microsoft/MASS)|MASS: Masked Sequence to Sequence Pre-training for Language Generation.|
|[MT-DNN](https://github.com/microsoft/mt-dnn)|Multi-Task Deep Neural Networks for Natural Language Understanding.|
|[UniLM](https://github.com/microsoft/unilm)|Unified Language Model Pre-training.|
|[DialoGPT](https://github.com/microsoft/DialoGPT)|DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation|
## Build Status
| Build | Branch | Status |
| --- | --- | --- |
| **Linux CPU** | master | [](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=master) |
| **Linux CPU** | staging | [](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=staging) |
| **Linux GPU** | master | [](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) |
| **Linux GPU** | staging | [](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=staging) |
================================================
FILE: SETUP.md
================================================
# Setup Guide
This document describes how to set up all the dependencies to run the notebooks in this repository.
The recommended environment to run these notebooks is the [Azure Data Science Virtual Machine (DSVM)](https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/). Since a considerable number of the algorithms rely on deep learning, it is recommended to use a GPU DSVM.
For training at scale, operationalization or hyperparameter tuning, it is recommended to use [Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/service/).
## Table of Contents
* [Compute environments](#compute-environments)
* [Create a cloud-based workstation (Optional)](#create-a-cloud-based-workstation-optional)
* [Setup guide for Local or Virtual Machines](#setup-guide-for-local-or-virtual-machines)
* [Requirements](#requirements)
* [Dependencies setup](#dependencies-setup)
* [Register the conda environment in the DSVM JupyterHub](#register-conda-environment-in-dsvm-jupyterhub)
* [Installing the Repo's Utils via PIP](#installing-the-repos-utils-via-pip)
* [Setup guide for docker](#set-up-guide-for-nvidia-docker)
## Compute Environments
Depending on the type of NLP system and the notebook that needs to be run, there are different computational requirements. Currently, this repository supports **Python CPU** and **Python GPU**. A conda environment YAML file can be generated for either CPU or GPU environments as shown below in the *Dependencies Setup* section.
## Create a cloud-based workstation (Optional)
[Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/)’s Notebook Virtual Machine (VM) is a cloud-based workstation created specifically for data scientists. Notebook VM based authoring is directly integrated into Azure Machine Learning service, providing a code-first experience for Python developers to conveniently build and deploy models in the workspace. Developers and data scientists can perform every operation supported by the Azure Machine Learning Python SDK using a familiar Jupyter notebook in a secure, enterprise-ready environment. Notebook VM is secure and easy-to-use, preconfigured for machine learning, and fully customizable.
You can learn how to create a Notebook VM [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-1st-experiment-sdk-setup#azure) and then follow the same setup as in the [Setup Guide for Local or Virtual Machines](#setup-guide-for-local-or-virtual-machines) directly using the terminal in the Notebook VM.
## Setup Guide for Local or Virtual Machines
### Requirements
* A machine running Linux, MacOS or Windows.
* On Windows, Microsoft Visual C++ 14.0 is required for building certain packages. Download Microsoft Visual C++ Build Tools [here](https://visualstudio.microsoft.com/downloads/).
* Miniconda or Anaconda with Python version >= 3.6.
* This is pre-installed on Azure DSVM such that one can run the following steps directly. To set up on your local machine, [Miniconda](https://docs.conda.io/en/latest/miniconda.html) is a quick way to get started.
* It is recommended to update conda to the latest version: `conda update -n base -c defaults conda`
> NOTE: Windows machines are not **FULLY SUPPORTED**. Please use at your own risk.
### Dependencies Setup
We provide a script, [generate_conda_file.py](tools/generate_conda_file.py), to generate a conda-environment yaml file
which you can use to create the target environment using the Python version 3.6 with all the correct dependencies.
Assuming the repo is cloned as `nlp-recipes` in the system, to install **a default (Python CPU) environment**:
cd nlp-recipes
python tools/generate_conda_file.py
conda env create -f nlp_cpu.yaml
You can specify the environment name as well with the flag `-n`.
Click on the following menus to see how to install the Python GPU environment:
<details>
<summary><strong><em>Python GPU environment</em></strong></summary>
Assuming that you have a GPU machine, to install the Python GPU environment,
1. Check the CUDA **driver** version on your machine by running
nvidia-smi
The top of the output shows the CUDA **driver** version, which is 10.0 in the example below.
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79 Driver Version: 410.79 CUDA Version: 10.0 |
|-------------------------------+----------------------+----------------------+
2. Decide which cuda **runtime** version you should install.
The cuda **runtime** version is the version of the cudatoolkit that will be installed in the conda environment in the next step, which should be <= the CUDA **driver** version found in step 1.
Currently, this repo uses PyTorch 1.4.0, which is compatible with cuda 9.2 and cuda 10.1. The conda environment file generated in step 3 installs cudatoolkit 10.1 by default. If your CUDA **driver** version is < 10.1, you should add the additional argument "--cuda_version 9.2" when calling generate_conda_file.py.
3. Install the GPU environment:
If CUDA **driver** version >= 10.1
cd nlp-recipes
python tools/generate_conda_file.py --gpu
conda env create -n nlp_gpu -f nlp_gpu.yaml
If CUDA **driver** version < 10.1
cd nlp-recipes
python tools/generate_conda_file.py --gpu --cuda_version 9.2
conda env create -n nlp_gpu -f nlp_gpu.yaml
4. Enable mixed precision training (optional)
Mixed precision training is particularly useful if your model takes a long time to train. It usually reduces the training time by 50% and produces the same model quality. To enable mixed precision training, run the following commands:
conda activate nlp_gpu
git clone https://github.com/NVIDIA/apex.git
cd apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
**Troubleshooting**:
If you run into an error message "RuntimeError: Cuda extensions are being compiled with a version of Cuda that does not match the version used to compile Pytorch binaries.", you need to make sure your NVIDIA Cuda compiler driver (nvcc) version and your cuda **runtime** version are exactly the same. To check the nvcc version, run
nvcc -V
If the nvcc version is 10.0, it's recommended to upgrade to 10.1 and re-create your conda environment with cudatoolkit=10.1.
**Steps to upgrade the CUDA driver version and nvcc version**
We have tested the following steps. Alternatively, you can follow the official instructions [here](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
a. Update apt-get and reboot your machine
sudo apt-get update
sudo apt-get upgrade --fix-missing
sudo reboot
b. Download the CUDA toolkit .run file from https://developer.nvidia.com/cuda-10.1-download-archive-base based on your target platform. For example, on a Linux machine with Ubuntu 16.04, run
wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run
c. Upgrade CUDA driver by running
sudo sh cuda_10.1.105_418.39_linux.run
First, accept the user agreement.

Next, choose the components to install.
It's possible that you already have NVIDIA driver 418.39 and CUDA 10.1, but nvcc 10.0. In this case, you can uncheck the "DRIVER" box and upgrade nvcc by re-installing CUDA toolkit only.

If you choose to install all components, follow the instructions on the screen to uninstall existing NVIDIA driver and CUDA toolkit first.

Then re-run
sudo sh cuda_10.1.105_418.39_linux.run
Select "Yes" to update the cuda symlink.

d. Run the following commands again to make sure you have NVIDIA driver 418.39, CUDA driver 10.1 and nvcc 10.1
nvidia-smi
nvcc -V
e. Repeat steps 3 & 4 to recreate your conda environment with cudatoolkit **runtime** 10.1 and apex installed for mixed precision training.
</details>
### Register Conda Environment in DSVM JupyterHub
We can register our created conda environment to appear as a kernel in the Jupyter notebooks.
conda activate my_env_name
python -m ipykernel install --user --name my_env_name --display-name "Python (my_env_name)"
If you are using the DSVM, you can [connect to JupyterHub](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab) by browsing to `https://your-vm-ip:8000`. If you are prompted to enter user name and password, enter the user name and password that you use to log in to your virtual machine.
### Installing the Repo's Utils via PIP
<details>
<summary>The utils_nlp module of this repository needs to be installed as a python package in order to be used by the examples. <strong><em>Click to expand and see the details</em></strong>
</summary>
<p>
A setup.py file is provided in order to simplify the installation of the utilities in this repo from the main directory.
To install the package, please run the command below (from directory root)
pip install -e .
Running the command tells pip to install the `utils_nlp` package from source in [development mode](https://setuptools.readthedocs.io/en/latest/setuptools.html#development-mode). This just means that any updates to `utils_nlp` source directory will immediately be reflected in the installed package without needing to reinstall; a very useful practice for a package with constant updates.
> It is also possible to install directly from Github, which is the best way to utilize the `utils_nlp` package in external projects (while still reflecting updates to the source as it's installed as an editable `'-e'` package).
> `pip install -e git+git@github.com:microsoft/nlp-recipes.git@master#egg=utils_nlp`
Either command, from above, makes `utils_nlp` available in your conda virtual environment. You can verify it was properly installed by running:
pip list
**NOTE** - The pip installation does not install any of the necessary package dependencies. It is expected that conda will be used, as shown above, to set up the environment for the utilities being used.
</p>
</details>
The details of the versioning info can be found at [VERSIONING.md](VERSIONING.md).
# Set up guide for (nvidia) docker
## Pre-requisites
In order to use the notebooks within a docker environment, you will need to have [nvidia docker drivers](https://github.com/NVIDIA/nvidia-docker) and [docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/) installed on your computer.
## Building docker image
A docker file is provided within the [docker](docker/) folder. You can create the image using
```
cd docker
docker build -t nlp-recipes .
```
This will create a docker image containing all the dependencies and will name it `nlp-recipes:latest`.
## Running the container
You can run the notebook within the container environment using
```
docker run --gpus all -p 8888:8888 nlp-recipes
```
This will map port 8888 of the local machine to port 8888 of the container.
## Troubleshooting
* If you have permission issues with `docker build` or `docker run`, you might need to run docker with sudo permissions.
* If you are getting 'port already in use' errors, consider mapping a different port on the local machine to port 8888 on the container e.g.
```
docker run --gpus all -p 9000:8888 nlp-recipes
```
================================================
FILE: VERSIONING.md
================================================
# Semantic Versioning
> NOTE: Support for `setuptools_scm` is currently removed due to a known [issue](https://github.com/pypa/setuptools_scm/issues/357) with the way pip installations restrict access to certain SCM metadata during package installation. Support will be restored when `setuptools_scm` and `pip` developers fix this with a patch.
This library is configured to use
[setuptools_scm](https://github.com/pypa/setuptools_scm/) to automatically get package version from git commit histories.
**There shouldn't be any references to manually coded versions**.
Verify what git tag to use by running:
```bash
python setup.py --version
```
It should look something like `0.1.0.dev4+gdfedba7.d20190209`
Using the information above, the master branch, after a merge commit, can be _**Tagged**_ with the above semantic version `0.1.0` (ignoring the `dev4+gdfedba7.d20190209`).
For example:
git tag v0.1.0
Now verify the semantic version for the package:
python setup.py --version
All new merged commits on master must have a
[Semantic Versioning](https://semver.org/) release version with an
accompanying tag. TL;DR:
* `major.minor.patch`
* Patch is for bugfix
* Minor is for new features
* Major is for backwards-incompatible changes
* tags should be of the form `v0.1.2`
When installing this library into another clean git repository with a tag version, you should get a nice version like `0.2.1`.
However, if you inspect the `__version__` in this repo,
you'll get a nice **'dirty'** version number like `'0.2.1.dev0+g850a76d.d20180908'`.
This is useful for debugging, building sphinx docs in dev and so on.
You should never have to specify a version manually except just tagging your commit from the tag calculation generated by running
python setup.py --version
================================================
FILE: _config.yml
================================================
theme: jekyll-theme-cayman
================================================
FILE: cgmanifest.json
================================================
{
"Registrations": [
{
"component": {
"type": "git",
"git": {
"repositoryUrl": "https://github.com/facebookresearch/XLM",
"commitHash": ""
}
},
"license": "CC BY-NC 4.0"
},
{
"component": {
"type": "git",
"git": {
"repositoryUrl": "https://github.com/allenai/bi-att-flow",
"commitHash": "e444acf13892cf62189b9eac3c7654bd83baf848"
}
},
"license": "Apache-2.0"
},
{
"component": {
"type": "git",
"git": {
"repositoryUrl": "https://github.com/stanfordnlp/glove",
"commitHash": "26f6e18eb117ca7b080d01acb453fd1c9742418d"
}
},
"license": "Apache-2.0"
},
{
"component": {
"type": "git",
"git": {
"repositoryUrl": "https://github.com/nlpyang/PreSumm",
"commitHash": "2df3312582a3a014aacbc1be810841705c67d06e"
}
},
"license": "MIT License"
}
],
"Version": 1
}
================================================
FILE: docker/Dockerfile
================================================
FROM nvidia/cuda
# NOTE(review): the base image has no tag, so it resolves to the default
# "latest" tag and may differ between builds -- consider pinning a tag.
# Install Miniconda (Miniconda2 4.5.11) into /opt/conda
# Non interactive installation instructions can be found
# https://hub.docker.com/r/continuumio/anaconda/dockerfile
# https://hub.docker.com/r/continuumio/miniconda/dockerfile
# Put the conda binaries first on PATH so that `conda` and `python` resolve
# to the installation under /opt/conda.
ENV PATH /opt/conda/bin:$PATH
# Use bash for the shell-form RUN/CMD instructions below; they rely on `source`.
SHELL ["/bin/bash", "-c"]
# System packages needed to download and install Miniconda and to fetch sources.
RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \
libglib2.0-0 libxext6 libsm6 libxrender1 \
git mercurial subversion
# Download the installer, run it in batch mode (-b) with prefix /opt/conda,
# remove it, and make login/interactive shells activate the base environment.
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda2-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate base" >> ~/.bashrc
# Get the latest version of the repository: an archive of the `staging` branch,
# which is extracted below /root (see the WORKDIR that follows).
# NOTE(review): `apt-get install` runs here in a separate layer from the
# `apt-get update` above, and `unzip` is used although only `zip` is installed
# explicitly (presumably pulled in as a recommended package) -- confirm.
WORKDIR /root
RUN apt-get install -y zip && \
wget --quiet https://github.com/microsoft/nlp-recipes/archive/staging.zip -O staging.zip && \
unzip staging.zip && rm staging.zip
# Install the packages
# Generate the GPU conda environment file and create the `nlp_gpu` environment from it.
WORKDIR /root/nlp-recipes-staging
RUN python /root/nlp-recipes-staging/tools/generate_conda_file.py --gpu && \
conda env create -n nlp_gpu -f nlp_gpu.yaml
# Inside the new environment: install the repository in editable mode and
# register the environment as a Jupyter kernel named "Python (nlp_gpu)".
RUN source activate nlp_gpu && \
pip install -e . && \
python -m ipykernel install --user --name nlp_gpu --display-name "Python (nlp_gpu)"
# Run notebook
# The notebook server listens on all interfaces on port 8888 and serves the
# repository directory; publish the port with `docker run -p`.
EXPOSE 8888/tcp
WORKDIR /root/nlp-recipes-staging
CMD source activate nlp_gpu && \
jupyter notebook --allow-root --ip 0.0.0.0 --port 8888 --no-browser --notebook-dir .
================================================
FILE: docs/Makefile
================================================
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# You can set these variables from the command line.
# SPHINXOPTS: extra options passed to sphinx-build (empty by default).
SPHINXOPTS =
# SPHINXBUILD: the sphinx-build executable to invoke.
SPHINXBUILD = sphinx-build
# SOURCEDIR / BUILDDIR: input and output directories, relative to this Makefile.
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# "help" and "Makefile" are declared phony so they are never treated as files
# to be produced.
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
# $@ expands to the requested target name (for example "html").
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/README.md
================================================
# Documentation
To set up the documentation, you first need to install the dependencies of the CPU environment. To do so, please follow [SETUP.md](../SETUP.md). Then type:
conda activate nlp_cpu
pip install sphinx_rtd_theme
To build the documentation as HTML:
cd docs
make html
================================================
FILE: docs/_config.yml
================================================
theme: jekyll-theme-cayman
================================================
FILE: docs/source/azureml.rst
================================================
.. _azureml:
AzureML module
**************************
AzureML module from NLP utilities.
AzureML utils
===============================
.. automodule:: utils_nlp.azureml.azureml_utils
:members:
AzureML utils for BERT
===============================
.. automodule:: utils_nlp.azureml.azureml_bert_util
:members:
================================================
FILE: docs/source/conf.py
================================================
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Put the directory two levels above the current working directory at the
# front of sys.path. Presumably this is the repository root, so that the
# utils_nlp import below resolves without installing the package -- confirm.
sys.path.insert(0, os.path.abspath(os.path.join("..", "..")))
# NOTE(review): raises the interpreter recursion limit to 1500. The reason is
# not visible in this file (possibly deep recursion during the build) -- confirm.
sys.setrecursionlimit(1500)
# Project metadata is imported from the utils_nlp package rather than repeated here.
from utils_nlp import TITLE, VERSION, COPYRIGHT, AUTHOR
# -- Project information -----------------------------------------------------
project = TITLE
copyright = COPYRIGHT
author = AUTHOR
# The short X.Y version (the first two dot-separated components of VERSION)
version = ".".join(VERSION.split(".")[:2])
# The full version, including alpha/beta/rc tags
release = VERSION
# Name prefix reused by the HTMLHelp, LaTeX, man page and Texinfo settings below.
prefix = "NLPRecipes"
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode", # Add links to highlighted source code
"sphinx.ext.napoleon", # to render Google format docstrings
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"
# The master toctree document.
master_doc = "index"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Set explicitly to English: recent Sphinx releases warn when this is None
# and fall back to "en" anyway, so the generated output is unchanged.
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["Thumbs.db", ".DS_Store"]
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ["images"]
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = prefix + "doc"
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
"papersize": "letterpaper",
"pointsize": "10pt",
"figure_align": "htbp",
"preamble": r"""
%% Adding source listings https://en.wikibooks.org/wiki/LaTeX/Source_Code_Listings
\usepackage{listings}
\usepackage{color}
\definecolor{mygreen}{rgb}{0,0.6,0}
\definecolor{mygray}{rgb}{0.5,0.5,0.5}
\definecolor{mymauve}{rgb}{0.58,0,0.82}
\lstset{
backgroundcolor=\color{white}, % choose the background color; you must add \usepackage{color} or \usepackage{xcolor}; should come as last argument
basicstyle=\footnotesize, % the size of the fonts that are used for the code
breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace
breaklines=true, % sets automatic line breaking
captionpos=b, % sets the caption-position to bottom
commentstyle=\color{mygreen}, % comment style
deletekeywords={...}, % if you want to delete keywords from the given language
escapeinside={\%*}{*)}, % if you want to add LaTeX within your code
extendedchars=true, % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8
firstnumber=1000, % start line enumeration with line 1000
frame=single, % adds a frame around the code
keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)
keywordstyle=\color{blue}, % keyword style
language=Python, % the language of the code
morekeywords={*,...}, % if you want to add more keywords to the set
numbers=left, % where to put the line-numbers; possible values are (none, left, right)
numbersep=5pt, % how far the line-numbers are from the code
numberstyle=\tiny\color{mygray}, % the style that is used for the line-numbers
rulecolor=\color{black}, % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))
showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'
showstringspaces=false, % underline spaces within strings only
showtabs=false, % show tabs within strings adding particular underscores
stepnumber=2, % the step between two line-numbers. If it's 1, each line will be numbered
stringstyle=\color{mymauve}, % string literal style
tabsize=2, % sets default tabsize to 2 spaces
title=\lstname % show the filename of files included with \lstinputlisting; also try caption instead of title
}
""",
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [(master_doc, prefix + ".tex", prefix + " Documentation", prefix, "manual")]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, prefix, prefix + " Documentation", [author], 1)]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(
master_doc,
prefix,
prefix + " Documentation",
author,
prefix,
"One line description of project.",
"Miscellaneous",
)
]
# -- Options for Epub output -------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]
# -- Extension configuration -------------------------------------------------
# -- Options for intersphinx extension ---------------------------------------
# Refer to the Python standard library. Each entry maps a project name to a
# (base URL, inventory location) tuple; None means "use objects.inv at the
# base URL". The legacy {url: None} form is no longer accepted by newer
# Sphinx releases, so the named form is used here.
intersphinx_mapping = {"python": ("https://docs.python.org/3", None)}
##################################################
# Other options
# html_favicon = os.path.join(html_static_path[0], "favicon.ico")
# Ensure that __init__() is always documented
# source: https://stackoverflow.com/a/5599712
def skip(app, what, name, obj, would_skip, options):
    """Decide whether autodoc should skip a member, never skipping ``__init__``.

    Args:
        app: The Sphinx application object (unused).
        what: The kind of object the member belongs to (unused).
        name: The name of the member being considered.
        obj: The member object itself (unused).
        would_skip: The skip decision autodoc would make by default.
        options: The options given to the autodoc directive (unused).

    Returns:
        ``False`` when ``name`` is ``"__init__"`` so the constructor is always
        documented; otherwise ``would_skip`` unchanged.
    """
    return False if name == "__init__" else would_skip
def setup(app):
    """Connect the ``skip`` callback to the "autodoc-skip-member" event of ``app``."""
    event_name = "autodoc-skip-member"
    app.connect(event_name, skip)
================================================
FILE: docs/source/index.rst
================================================
NLP Utilities
===================================================
The `NLP repository <https://github.com/microsoft/nlp-recipes>`_ provides examples and best practices for building NLP systems, provided as Jupyter notebooks.
The module `utils_nlp <https://github.com/microsoft/nlp-recipes/tree/master/utils_nlp>`_ contains functions to simplify common tasks used when developing and
evaluating NLP systems.
.. toctree::
:maxdepth: 1
:caption: Contents:
AzureML <azureml>
Common <common>
Dataset <dataset>
Evaluation <eval>
NLP Algorithms <model>
NLP Interpretability <interpreter>
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: examples/README.md
================================================
# Examples
This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for the following scenarios.
|Category|Applications|Methods|Languages|
|---| ------------------------ | ------------------- |---|
|[Text Classification](text_classification)|Topic Classification|BERT, XLNet, RoBERTa, DistilBERT|en, hi, ar|
|[Named Entity Recognition](named_entity_recognition) |Wikipedia NER|BERT|en|
|[Text Summarization](text_summarization)|News Summarization, Headline Generation|Extractive: BERTSumExt <br> Abstractive: UniLM (s2s-ft)|en|
|[Entailment](entailment)|MultiNLI Natural Language Inference|BERT|en|
|[Question Answering](question_answering) |SQuAD|BiDAF, BERT, XLNet, DistilBERT|en|
|[Sentence Similarity](sentence_similarity)|STS Benchmark|BERT, GenSen|en|
|[Embeddings](embeddings)|Custom Embeddings Training|Word2Vec, fastText, GloVe|en|
|[Annotation](annotation)|Text Annotation|Doccano|en|
|[Model Explainability](model_explainability)|DNN Layer Explanation|DUUDNM (Guan et al.)|en|
## Data/Telemetry
The Azure Machine Learning notebooks collect browser usage data and send it to Microsoft to help improve our products and services. Read Microsoft's [privacy statement to learn more](https://privacy.microsoft.com/en-US/privacystatement).
To opt out of tracking, a Python [script](../tools/remove_pixelserver.py) under the `tools` folder is also provided. Executing the script will check all notebooks under the `examples` folder, and automatically remove the telemetry cell:
```sh
python ../tools/remove_pixelserver.py
```
================================================
FILE: examples/annotation/Doccano.md
================================================
# Doccano: Text Annotation Tool
## What is Doccano?
[Doccano](https://github.com/chakki-works/doccano) is one of the best open source tools that provide text annotation features. The latest version supports annotation features for text classification, sequence labeling (NER) and sequence to sequence (machine translation, text summarization). There are many other open source and commercial annotation tools available. Here is a list of some of those tools:
- [Brat](https://brat.nlplab.org/) (open source)
- [Anafora](https://github.com/weitechen/anafora) (open source)
- [Prodigy](https://prodi.gy/) (commercial)
- [LightTag](https://www.lighttag.io/) (commercial)
Doccano needs to be hosted somewhere so that we can collaborate on it. This tutorial walks through how to deploy Doccano on Azure and collaboratively annotate text data for natural language processing tasks.
## Deploy to Azure
Doccano can be deployed to Azure ([Web App for Containers](https://azure.microsoft.com/en-us/services/app-service/containers/) +
[PostgreSQL database](https://azure.microsoft.com/en-us/services/postgresql/)) by clicking on the button below:
<p align="center">
<a href="https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fchakki-works%2Fdoccano%2Fmaster%2Fazuredeploy.json"><img width=180 src="https://nlpbp.blob.core.windows.net/images/deploybutton.jpg" /></a>
</p>
You will need to have an existing Azure subscription so that you can create all the Azure resources needed to deploy Doccano. Otherwise you can get a [free Azure account](https://azure.microsoft.com/en-us/offers/ms-azr-0044p/?WT.mc_id=medium-blog-abornst) and then click the deploy button above.
You will need to specify your subscription and resource group, fill in the setting details (App Name, Secret Key, etc.) and then deploy. It takes a few minutes to create all the needed Azure resources. Below is a screenshot of the deployment.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/deploy_to_azure.jpg" />
</p>
## Tutorial
### Useful Links
#### Main Page
After the deployment, you can navigate to the following URL, where **{`appname`}** is the `App Name` you chose when deploying to Azure:
_**https://{appname}.azurewebsites.net**_
For example, if your appname is "**doccano**", then the link will be
_**https://doccano.azurewebsites.net**_
And we will use `doccano` as the app name for this tutorial.
#### Login Page
You can log in by clicking the `login` button at the top right of the main page, or you can navigate to the page with the link
_**https://doccano.azurewebsites.net/login**_
Both will bring you to the Doccano login page, where you can log in with the Admin user name and Admin password you configured during the deployment.
#### Admin Page
By default, only the Admin user is created for you after the deployment. You can add more users, groups and configure the Doccano service by navigating to the admin page.
_**https://doccano.azurewebsites.net/admin**_
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/admin_page.JPG" />
</p>
### Create Project
The first step is to create a new project for annotation. Here we will use an NER annotation task on science fiction descriptions to give you a brief tutorial on Doccano.
After logging in with the Admin user name and Admin password, you will be taken to Doccano's main project list page, which does not contain any projects yet.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/project_list.jpg" />
</p>
To create your project, make sure you’re on the project list page and click the `Create Project` button. For this tutorial, we name the project `sequence labeling for books`, write a short description, and then choose the sequence labeling task type.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/create_project.jpg" />
</p>
### Import Data
After creating a project, you will see the "`Import Data`" page; you can also open it by clicking the `Import Data` button in the navigation bar. You should see the following screen:
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/import_data.jpg" />
</p>
We choose JSONL and click the `Select a file` button. Select `books.json` and it will be loaded automatically. Below is the `books.json` file, which contains descriptions of science fiction works in different languages. We need to annotate entities such as person names, book titles, dates and so on.
```json
{"text": "The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film."}
{"text": "《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说,出版后成为中国大陆最畅销的科幻长篇小说之一。2008年,该书的单行本由重庆出版社出版。本书是三体系列(系列原名为:地球往事三部曲)的第一部,该系列的第二部《三体II:黑暗森林》已经于2008年5月出版。2010年11月,第三部《三体III:死神永生》出版发行。 2011年,“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名,并荣获2015年雨果奖最佳小说奖。"}
{"text": "『銀河英雄伝説』(ぎんがえいゆうでんせつ)は、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』(ぎんえいでん)。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。"}
```
After importing the dataset, you should be able to see the dataset immediately.
### Define labels
Click the `Labels` button in the left bar to define your own labels. You should see the label editor page. On the label editor page, you can create labels by specifying the label text, shortcut key, background color and text color.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/define_labels.jpg" />
</p>
### Annotation
Next, we are ready to annotate the texts. Click the `Annotate Data` button in the navigation bar to start annotating the documents. You can simply select the text and then use the shortcut key that you have defined to label the entities.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/annotate.jpg" />
</p>
### Export Data
After the annotation step, we can download the annotated data. Click the `Edit data` button in the navigation bar, and then click `Export Data`. You should see the screen below:
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/export_data.jpg" />
</p>
Here we choose the JSONL format and download the data by clicking the button. Below is the annotated result for our tutorial project.
```json
{"id": 1, "text": "The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film.", "annotations": [{"label": 2, "start_offset": 0, "end_offset": 36, "user": 1}, {"label": 2, "start_offset": 63, "end_offset": 67, "user": 1}, {"label": 2, "start_offset": 69, "end_offset": 82, "user": 1}, {"label": 5, "start_offset": 89, "end_offset": 111, "user": 1}, {"label": 1, "start_offset": 130, "end_offset": 143, "user": 1}, {"label": 5, "start_offset": 158, "end_offset": 180, "user": 1}, {"label": 6, "start_offset": 184, "end_offset": 195, "user": 1}, {"label": 3, "start_offset": 199, "end_offset": 203, "user": 1}, {"label": 5, "start_offset": 254, "end_offset": 265, "user": 1}, {"label": 5, "start_offset": 267, "end_offset": 273, "user": 1}, {"label": 5, "start_offset": 275, "end_offset": 286, "user": 1}, {"label": 3, "start_offset": 290, "end_offset": 294, "user": 1}, {"label": 5, "start_offset": 295, "end_offset": 304, "user": 1}, {"label": 3, "start_offset": 308, "end_offset": 312, "user": 1}, {"label": 5, "start_offset": 313, "end_offset": 323, "user": 1}, {"label": 3, "start_offset": 329, "end_offset": 333, "user": 1}, {"label": 5, "start_offset": 334, "end_offset": 346, "user": 1}], "meta": {}, "annotation_approver": "admin"}
{"id": 2, "text": "《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说,出版后成为中国大陆最畅销的科幻长篇小说之一。2008年,该书的单行本由重庆出版社出版。本书是三体系列(系列原名为:地球往事三部曲)的第一部,该系列的第二部《三体II:黑暗森林》已经于2008年5月出版。2010年11月,第三部《三体III:死神永生》出版发行。 2011年,“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名,并荣获2015年雨果奖最佳小说奖。", "annotations": [{"label": 2, "start_offset": 1, "end_offset": 3, "user": 1}, {"label": 4, "start_offset": 5, "end_offset": 9, "user": 1}, {"label": 1, "start_offset": 11, "end_offset": 14, "user": 1}, {"label": 3, "start_offset": 15, "end_offset": 26, "user": 1}, {"label": 2, "start_offset": 28, "end_offset": 32, "user": 1}, {"label": 5, "start_offset": 41, "end_offset": 47, "user": 1}, {"label": 4, "start_offset": 53, "end_offset": 57, "user": 1}, {"label": 5, "start_offset": 61, "end_offset": 67, "user": 1}, {"label": 3, "start_offset": 70, "end_offset": 74, "user": 1}, {"label": 6, "start_offset": 83, "end_offset": 88, "user": 1}, {"label": 2, "start_offset": 105, "end_offset": 112, "user": 1}, {"label": 2, "start_offset": 94, "end_offset": 98, "user": 1}, {"label": 2, "start_offset": 126, "end_offset": 135, "user": 1}, {"label": 3, "start_offset": 139, "end_offset": 146, "user": 1}, {"label": 3, "start_offset": 149, "end_offset": 157, "user": 1}, {"label": 2, "start_offset": 162, "end_offset": 172, "user": 1}, {"label": 3, "start_offset": 179, "end_offset": 184, "user": 1}, {"label": 2, "start_offset": 186, "end_offset": 193, "user": 1}, {"label": 4, "start_offset": 195, "end_offset": 197, "user": 1}, {"label": 5, "start_offset": 202, "end_offset": 204, "user": 1}, {"label": 6, "start_offset": 210, "end_offset": 220, "user": 1}, {"label": 3, "start_offset": 220, "end_offset": 225, "user": 1}, {"label": 6, "start_offset": 227, "end_offset": 230, "user": 1}, {"label": 3, "start_offset": 237, "end_offset": 242, "user": 1}, {"label": 6, "start_offset": 242, "end_offset": 245, "user": 1}], "meta": {}, "annotation_approver": "admin"}
{"id": 3, "text": "『銀河英雄伝説』(ぎんがえいゆうでんせつ)は、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』(ぎんえいでん)。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。", "annotations": [{"label": 2, "start_offset": 1, "end_offset": 7, "user": 1}, {"label": 1, "start_offset": 23, "end_offset": 30, "user": 1}, {"label": 5, "start_offset": 30, "end_offset": 34, "user": 1}, {"label": 2, "start_offset": 85, "end_offset": 88, "user": 1}, {"label": 5, "start_offset": 50, "end_offset": 52, "user": 1}, {"label": 5, "start_offset": 63, "end_offset": 65, "user": 1}, {"label": 3, "start_offset": 130, "end_offset": 135, "user": 1}, {"label": 3, "start_offset": 137, "end_offset": 144, "user": 1}], "meta": {}, "annotation_approver": "admin"}
```
Please note that in the exported JSON file, the label for each entity is an entity type ID, which is inconvenient if you want to consume the annotations somewhere else. Some post-processing is needed if you want to have the entity type name instead of the type ID.
### View Statistics
One nice feature of Doccano is that it also has a dashboard to display annotation progress and label distributions. Click the `Edit data` button in the navigation bar, and then click `Statistics` on the left side of the menu.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/statistic.jpg" />
</p>
Congratulations! You have just learned how to use Doccano for a sequence labeling project.
================================================
FILE: examples/annotation/README.md
================================================
# Text Annotation
This folder contains a tutorial that walks through how to deploy a text annotation tool on Azure and how to collaboratively annotate text data for natural language processing tasks.
- **[Doccano](Doccano.md)**
Doccano is an open source tool that provides three main text annotation features. This tutorial only shows a Named Entity Recognition (NER) annotation task as an example.
================================================
FILE: examples/embeddings/README.md
================================================
# Word Embedding
This folder contains examples and best practices, written in Jupyter notebooks, for training word embedding on custom data from scratch.
There are
three typical ways for training word embedding:
[Word2Vec](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf),
[GloVe](https://nlp.stanford.edu/pubs/glove.pdf), and [fastText](https://arxiv.org/abs/1607.01759).
All three methods provide pretrained models ([pretrained model with
Word2Vec](https://code.google.com/archive/p/word2vec/), [pretrained model with
GloVe](https://github.com/stanfordnlp/GloVe), [pretrained model with
fastText](https://fasttext.cc/docs/en/crawl-vectors.html)).
These pretrained models are trained on
general corpora such as Wikipedia data, Common Crawl data, etc., and may not serve well for situations
where you have a domain-specific language learning problem or there is no pretrained model for the
language you need to work with. In this folder, we provide examples of how to apply each of the
three methods to train your own word embeddings.
# What is Word Embedding?
Word embedding is a technique to map words or phrases from a vocabulary to vectors of real numbers.
The learned vector representations of words capture syntactic and semantic word relationships and
therefore can be very useful for tasks like sentence similarity, text classification, etc.
## Summary
|Notebook|Environment|Description|Dataset| Language |
|---|---|---|---|---|
|[Developing Word Embeddings](embedding_trainer.ipynb)|Local| A notebook that shows how to learn word representations with Word2Vec, fastText and GloVe|[STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) | en |
================================================
FILE: examples/embeddings/embedding_trainer.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Developing Word Embeddings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rather than use pre-trained embeddings (as we did in the sentence similarity baseline_deep_dive [notebook](../sentence_similarity/baseline_deep_dive.ipynb)), we can train word embeddings using our own dataset. In this notebook, we demonstrate the training process for producing word embeddings using the word2vec, GloVe, and fastText models. We'll utilize the STS Benchmark dataset for this task. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Table of Contents\n",
"* [Data Loading and Preprocessing](#Load-and-Preprocess-Data)\n",
"* [Word2Vec](#Word2Vec)\n",
"* [fastText](#fastText)\n",
"* [GloVe](#GloVe)\n",
"* [Concluding Remarks](#Concluding-Remarks)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import sys\n",
"import os\n",
"\n",
"# Set the environment path\n",
"sys.path.append(\"../..\")\n",
"\n",
"import numpy as np\n",
"from utils_nlp.dataset.preprocess import (\n",
" to_lowercase,\n",
" to_spacy_tokens,\n",
" rm_spacy_stopwords,\n",
")\n",
"from utils_nlp.dataset import stsbenchmark\n",
"from utils_nlp.common.timer import Timer\n",
"from gensim.models import Word2Vec\n",
"from gensim.models.fasttext import FastText"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Set the path for where your repo is located\n",
"NLP_REPO_PATH = os.path.join('..','..')\n",
"\n",
"# Set the path for where your datasets are located\n",
"BASE_DATA_PATH = os.path.join(NLP_REPO_PATH, \"data\")\n",
"\n",
"# Set the path for location to save embeddings\n",
"SAVE_FILES_PATH = os.path.join(BASE_DATA_PATH, \"trained_word_embeddings\")\n",
"if not os.path.exists(SAVE_FILES_PATH):\n",
" os.makedirs(SAVE_FILES_PATH)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and Preprocess Data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 401/401 [00:02<00:00, 182KB/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data downloaded to ../../data/raw/stsbenchmark\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Produce a pandas dataframe for the training set\n",
"train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"\n",
"# Clean the sts dataset\n",
"sts_train = stsbenchmark.clean_sts(train_raw)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.00</td>\n",
" <td>A plane is taking off.</td>\n",
" <td>An air plane is taking off.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.80</td>\n",
" <td>A man is playing a large flute.</td>\n",
" <td>A man is playing a flute.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.80</td>\n",
" <td>A man is spreading shreded cheese on a pizza.</td>\n",
" <td>A man is spreading shredded cheese on an uncoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.60</td>\n",
" <td>Three men are playing chess.</td>\n",
" <td>Two men are playing chess.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.25</td>\n",
" <td>A man is playing the cello.</td>\n",
" <td>A man seated is playing the cello.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score sentence1 \\\n",
"0 5.00 A plane is taking off. \n",
"1 3.80 A man is playing a large flute. \n",
"2 3.80 A man is spreading shreded cheese on a pizza. \n",
"3 2.60 Three men are playing chess. \n",
"4 4.25 A man is playing the cello. \n",
"\n",
" sentence2 \n",
"0 An air plane is taking off. \n",
"1 A man is playing a flute. \n",
"2 A man is spreading shredded cheese on an uncoo... \n",
"3 Two men are playing chess. \n",
"4 A man seated is playing the cello. "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sts_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5749, 3)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check the size of our dataframe\n",
"sts_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Training set preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Convert all text to lowercase\n",
"df_low = to_lowercase(sts_train) \n",
"# Tokenize text\n",
"sts_tokenize = to_spacy_tokens(df_low) \n",
"# Tokenize with removal of stopwords\n",
"sts_train_stop = rm_spacy_stopwords(sts_tokenize) "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Append together the two sentence columns to get a list of all tokenized sentences.\n",
"all_sentences = sts_train_stop[[\"sentence1_tokens_rm_stopwords\", \"sentence2_tokens_rm_stopwords\"]]\n",
"# Flatten two columns into one list and remove all sentences that are size 0 after tokenization and stop word removal.\n",
"sentences = [i for i in all_sentences.values.flatten().tolist() if len(i) > 0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11498"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Minimum sentence length is 1 tokens\n",
"Maximum sentence length is 43 tokens\n",
"Median sentence length is 6.0 tokens\n"
]
}
],
"source": [
"sentence_lengths = [len(i) for i in sentences]\n",
"print(\"Minimum sentence length is {} tokens\".format(min(sentence_lengths)))\n",
"print(\"Maximum sentence length is {} tokens\".format(max(sentence_lengths)))\n",
"print(\"Median sentence length is {} tokens\".format(np.median(sentence_lengths)))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['plane', 'taking', '.'],\n",
" ['air', 'plane', 'taking', '.'],\n",
" ['man', 'playing', 'large', 'flute', '.'],\n",
" ['man', 'playing', 'flute', '.'],\n",
" ['man', 'spreading', 'shreded', 'cheese', 'pizza', '.'],\n",
" ['man', 'spreading', 'shredded', 'cheese', 'uncooked', 'pizza', '.'],\n",
" ['men', 'playing', 'chess', '.'],\n",
" ['men', 'playing', 'chess', '.'],\n",
" ['man', 'playing', 'cello', '.'],\n",
" ['man', 'seated', 'playing', 'cello', '.']]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Word2Vec"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Word2vec is a predictive model for learning word embeddings from text (see [original research paper](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)). Word embeddings are learned such that words that share common contexts in the corpus will be close together in the vector space. There are two different model architectures that can be used to produce word2vec embeddings: continuous bag-of-words (CBOW) or continuous skip-gram. The former uses a window of surrounding words (the \"context\") to predict the current word and the latter uses the current word to predict the surrounding context words. See this [tutorial](https://www.guru99.com/word-embedding-word2vec.html#3) on word2vec for more detailed background on the model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The gensim Word2Vec model has many different parameters (see [here](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec)) but the ones that are useful to know about are: \n",
"- size: length of the word embedding/vector (defaults to 100)\n",
"- window: maximum distance between the word being predicted and the current word (defaults to 5)\n",
"- min_count: ignores all words that have a frequency lower than this value (defaults to 5)\n",
"- workers: number of worker threads used to train the model (defaults to 3)\n",
"- sg: training algorithm; 1 for skip-gram and 0 for CBOW (defaults to 0)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Set up a Timer to see how long the model takes to train\n",
"t = Timer()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"t.start()\n",
"\n",
"# Train the Word2vec model\n",
"word2vec_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=3, sg=0)\n",
"\n",
"t.stop()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed: 0.3194\n"
]
}
],
"source": [
"print(\"Time elapsed: {}\".format(t))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that the model is trained we can:\n",
"\n",
"1. Query for the word embeddings of a given word. \n",
"2. Inspect the model vocabulary\n",
"3. Save the word embeddings"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [-0.13886626 -0.04330257 0.12527628 0.08564945 0.02040523 -0.10037457\n",
" -0.1182736 0.05916803 -0.09810918 0.11094606 -0.00045659 -0.07130833\n",
" -0.07526248 0.01439941 -0.01924936 -0.04267681 0.05364342 0.01334886\n",
" 0.09927388 0.04298429 0.07616432 -0.09218667 0.13563654 0.13954957\n",
" 0.17032589 0.13070972 0.04971378 0.05326121 0.1633883 0.0867981\n",
" 0.01025774 0.19571003 -0.11564688 0.00285543 -0.02306972 -0.07086422\n",
" -0.03311775 0.16642122 0.10450041 0.11148815 -0.11674852 -0.10021858\n",
" -0.00149789 -0.10769422 0.1467818 -0.00330875 0.09308671 -0.12129212\n",
" 0.07261119 0.07583102 0.00192156 0.23766024 -0.0063716 -0.10565527\n",
" -0.06545153 0.04053855 0.24339062 0.15191206 -0.04718588 -0.05213067\n",
" 0.00187512 -0.08648538 -0.05337012 0.15507293 -0.09485061 0.03063929\n",
" 0.00369516 -0.20911641 0.09312427 0.03583751 0.07270095 0.18968543\n",
" 0.08637197 -0.03679648 0.12222783 -0.11879333 -0.1462169 0.02210324\n",
" 0.18023533 0.03193852 -0.02540419 0.01615141 0.12228711 -0.03577682\n",
" 0.05543301 0.15039788 -0.01812798 0.10888109 -0.08378831 -0.10893872\n",
" 0.04931932 0.03412211 0.05080304 -0.16159546 0.02976557 0.08955383\n",
" -0.02231676 0.06976417 0.2003142 0.04647517]\n",
"\n",
"First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\n"
]
}
],
"source": [
"# 1. Let's see the word embedding for \"apple\" by accessing the \"wv\" attribute and passing in \"apple\" as the key.\n",
"print(\"Embedding for apple:\", word2vec_model.wv[\"apple\"])\n",
"\n",
"# 2. Inspect the model vocabulary by accessing keys of the \"wv.vocab\" attribute. We'll print the first 20 words.\n",
"print(\"\\nFirst 30 vocabulary words:\", list(word2vec_model.wv.vocab)[:20])\n",
"\n",
"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format.\n",
"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"word2vec_model\", binary=True) # binary format\n",
"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"word2vec_model\", binary=False) # ASCII format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## fastText"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"fastText is an unsupervised algorithm created by Facebook Research for efficiently learning word embeddings (see [original research paper](https://arxiv.org/pdf/1607.04606.pdf)). fastText is significantly different than word2vec or GloVe in that these two algorithms treat each word as the smallest possible unit to find an embedding for. Conversely, fastText assumes that words are formed by an n-gram of characters (i.e. 2-grams of the word \"language\" would be {la, an, ng, gu, ua, ag, ge}). The embedding for a word is then composed of the sum of these character n-grams. This has advantages when finding word embeddings for rare words and words not present in the dictionary, as these words can still be broken down into character n-grams. Typically, for smaller datasets, fastText performs better than word2vec or GloVe. See this [tutorial](https://fasttext.cc/docs/en/unsupervised-tutorial.html) on fastText for more detail."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The gensim fastText model has many different parameters (see [here](https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText)) but the ones that are useful to know about are: \n",
"- size: length of the word embedding/vector (defaults to 100)\n",
"- window: maximum distance between the word being predicted and the current word (defaults to 5)\n",
"- min_count: ignores all words that have a frequency lower than this value (defaults to 5)\n",
"- workers: number of worker threads used to train the model (defaults to 3)\n",
"- sg: training algorithm- 1 for skip-gram and 0 for CBOW (defaults to 0)\n",
"- iter: number of epochs (defaults to 5)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Set up a Timer to see how long the model takes to train\n",
"t = Timer()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"t.start()\n",
"\n",
"# Train the FastText model\n",
"fastText_model = FastText(size=100, window=5, min_count=5, sentences=sentences, iter=5)\n",
"\n",
"t.stop()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed: 9.3665\n"
]
}
],
"source": [
"print(\"Time elapsed: {}\".format(t))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can utilize the same attributes as we saw above for word2vec due to them both originating from the gensim package"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [-0.2255679 -0.15831569 0.03804937 0.47731966 0.47977886 -0.27653983\n",
" -0.27343377 -0.4507852 -0.05649747 0.01470412 0.27904618 -0.02155268\n",
" -0.02492249 -0.07855172 0.18532543 0.25709668 0.05939932 0.10333744\n",
" -0.09892524 -0.61932683 -0.15273307 -0.02246136 -0.06295346 -0.5022594\n",
" -0.13407618 -0.10411069 0.13370538 0.11902415 -0.44436237 0.27073038\n",
" 0.06540621 -0.02650584 -0.0179158 0.08797703 0.18899101 0.12898529\n",
" 0.05865225 -0.18658654 -0.40497953 -0.23991017 0.30457255 0.39893195\n",
" 0.2913193 -0.18734889 0.10662938 -0.1165131 -0.42884877 0.31400812\n",
" 0.04840293 0.10146416 -0.10285722 -0.21854313 -0.69022155 -0.48051542\n",
" -0.17416449 0.12879132 0.12302257 -0.32911557 -0.48828328 0.22531843\n",
" -0.35535514 -0.34300882 0.07264371 0.262703 -0.10182904 0.03486007\n",
" -0.09019874 0.12621203 0.35632437 -0.10350075 0.3397234 -0.04080832\n",
" -0.17116521 -0.20685913 0.18177888 0.19674565 0.00776504 -0.22853185\n",
" 0.01387324 -0.33452377 0.1017314 -0.06989139 0.15893722 0.02910445\n",
" -0.18428223 0.30011976 -0.05394572 -0.18550391 0.09144824 0.2203982\n",
" 0.3605487 -0.0106479 0.729859 0.516405 -0.44636923 -0.4128766\n",
" -0.523939 -0.20086594 -0.38725898 0.0440867 ]\n",
"\n",
"First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\n"
]
}
],
"source": [
"# 1. Let's see the word embedding for \"apple\" by accessing the \"wv\" attribute and passing in \"apple\" as the key.\n",
"print(\"Embedding for apple:\", fastText_model.wv[\"apple\"])\n",
"\n",
"# 2. Inspect the model vocabulary by accessing keys of the \"wv.vocab\" attribute. We'll print the first 20 words.\n",
"print(\"\\nFirst 30 vocabulary words:\", list(fastText_model.wv.vocab)[:20])\n",
"\n",
"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format.\n",
"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"fastText_model\", binary=True) # binary format\n",
"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"fastText_model\", binary=False) # ASCII format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GloVe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"GloVe is an unsupervised algorithm for obtaining word embeddings created by the Stanford NLP group (see [original research paper](https://nlp.stanford.edu/pubs/glove.pdf)). Training occurs on word-word co-occurrence statistics with the objective of learning word embeddings such that the dot product of two words' embeddings equals the logarithm of the words' probability of co-occurrence. See this [tutorial](https://nlp.stanford.edu/projects/glove/) on GloVe for more detailed background on the model. \n",
"\n",
"Gensim doesn't have an implementation of the GloVe model and the other python packages that implement GloVe are unstable, so we leveraged the code directly from the Stanford NLP [repo](https://github.com/stanfordnlp/GloVe). "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mkdir -p build\n",
"gcc src/glove.c -o build/glove -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\n",
"\u001b[01m\u001b[Ksrc/glove.c:\u001b[m\u001b[K In function ‘\u001b[01m\u001b[Kglove_thread\u001b[m\u001b[K’:\n",
"\u001b[01m\u001b[Ksrc/glove.c:117:9:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kignoring return value of ‘\u001b[01m\u001b[Kfread\u001b[m\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\n",
" fread(&cr, sizeof(CREC), 1, fin);\n",
"\u001b[01;32m\u001b[K ^\u001b[m\u001b[K\n",
"gcc src/shuffle.c -o build/shuffle -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\n",
"\u001b[01m\u001b[Ksrc/shuffle.c:\u001b[m\u001b[K In function ‘\u001b[01m\u001b[Kshuffle_merge\u001b[m\u001b[K’:\n",
"\u001b[01m\u001b[Ksrc/shuffle.c:106:17:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kignoring return value of ‘\u001b[01m\u001b[Kfread\u001b[m\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\n",
" fread(&array[i], sizeof(CREC), 1, fid[j]);\n",
"\u001b[01;32m\u001b[K ^\u001b[m\u001b[K\n",
"\u001b[01m\u001b[Ksrc/shuffle.c:\u001b[m\u001b[K In function ‘\u001b[01m\u001b[Kshuffle_by_chunks\u001b[m\u001b[K’:\n",
"\u001b[01m\u001b[Ksrc/shuffle.c:163:9:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kignoring return value of ‘\u001b[01m\u001b[Kfread\u001b[m\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\n",
" fread(&array[i], sizeof(CREC), 1, fin);\n",
"\u001b[01;32m\u001b[K ^\u001b[m\u001b[K\n",
"gcc src/cooccur.c -o build/cooccur -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\n",
"\u001b[01m\u001b[Ksrc/cooccur.c:\u001b[m\u001b[K In function ‘\u001b[01m\u001b[Kmerge_files\u001b[m\u001b[K’:\n",
"\u001b[01m\u001b[Ksrc/cooccur.c:267:9:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kignoring return value of ‘\u001b[01m\u001b[Kfread\u001b[m\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\n",
" fread(&new, sizeof(CREC), 1, fid[i]);\n",
"\u001b[01;32m\u001b[K ^\u001b[m\u001b[K\n",
"\u001b[01m\u001b[Ksrc/cooccur.c:277:5:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kignoring return value of ‘\u001b[01m\u001b[Kfread\u001b[m\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\n",
" fread(&new, sizeof(CREC), 1, fid[i]);\n",
"\u001b[01;32m\u001b[K ^\u001b[m\u001b[K\n",
"\u001b[01m\u001b[Ksrc/cooccur.c:290:9:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kignoring return value of ‘\u001b[01m\u001b[Kfread\u001b[m\u001b[K’, declared with attribute warn_unused_result [-Wunused-result]\n",
" fread(&new, sizeof(CREC), 1, fid[i]);\n",
"\u001b[01;32m\u001b[K ^\u001b[m\u001b[K\n",
"gcc src/vocab_count.c -o build/vocab_count -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic\n"
]
}
],
"source": [
"# Define path\n",
"glove_model_path = os.path.join(NLP_REPO_PATH, \"utils_nlp\", \"models\", \"glove\")\n",
"# Execute shell commands\n",
"!cd $glove_model_path && make"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train GloVe vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Training GloVe embeddings requires some data prep and then 4 steps (also documented in the original Stanford NLP repo [here](https://github.com/stanfordnlp/GloVe/tree/master/src))."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 0: Prepare Data**\n",
" \n",
"In order to train our GloVe vectors, we first need to save our corpus as a text file with all words separated by 1+ spaces or tabs. Each document/sentence is separated by a new line character."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Save our corpus as tokens delimited by spaces with new line characters in between sentences.\n",
"training_corpus_file_path = os.path.join(SAVE_FILES_PATH, \"training-corpus-cleaned.txt\")\n",
"with open(training_corpus_file_path, 'w', encoding='utf8') as file:\n",
" for sent in sentences:\n",
" file.write(\" \".join(sent) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Set up a Timer to see how long the model takes to train\n",
"t = Timer()\n",
"t.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 1: Build Vocabulary**\n",
"\n",
"Run the vocab_count executable. There are 3 optional parameters:\n",
"1. min-count: lower limit on how many times a word must appear in dataset. Otherwise the word is discarded from our vocabulary.\n",
"2. max-vocab: upper bound on the number of vocabulary words to keep\n",
"3. verbose: 0, 1, or 2 (default)\n",
"\n",
"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the vocabulary to "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BUILDING VOCABULARY\r\n",
"Processed 0 tokens.\u001b[0GProcessed 85334 tokens.\r\n",
"Counted 11716 unique words.\r\n",
"Truncating vocabulary at min count 5.\r\n",
"Using vocabulary of size 2943.\r\n",
"\r\n"
]
}
],
"source": [
"# Define path\n",
"vocab_count_exe_path = os.path.join(glove_model_path, \"build\", \"vocab_count\")\n",
"vocab_file_path = os.path.join(SAVE_FILES_PATH, \"vocab.txt\")\n",
"# Execute shell commands\n",
"!$vocab_count_exe_path -min-count 5 -verbose 2 <$training_corpus_file_path> $vocab_file_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 2: Construct Word Co-occurrence Statistics**\n",
"\n",
"Run the cooccur executable. There are many optional parameters, but we list the top ones here:\n",
"1. symmetric: 0 for only looking at left context, 1 (default) for looking at both left and right context\n",
"2. window-size: number of context words to use (default 15)\n",
"3. verbose: 0, 1, or 2 (default)\n",
"4. vocab-file: path/name of the vocabulary file created in Step 1\n",
"5. memory: soft limit for memory consumption, default 4\n",
"6. max-product: limit the size of dense co-occurrence array by specifying the max product (integer) of the frequency counts of the two co-occurring words\n",
"\n",
"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the co-occurrences to"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"COUNTING COOCCURRENCES\n",
"window size: 15\n",
"context: symmetric\n",
"max product: 13752509\n",
"overflow length: 38028356\n",
"Reading vocab from file \"../../data/trained_word_embeddings/vocab.txt\"...loaded 2943 words.\n",
"Building lookup table...table contains 8661250 elements.\n",
"Processing token: 0\u001b[0GProcessed 85334 tokens.\n",
"Writing cooccurrences to disk......2 files in total.\n",
"Merging cooccurrence files: processed 0 lines.\u001b[39G0 lines.\u001b[39G100000 lines.\u001b[0GMerging cooccurrence files: processed 188154 lines.\n",
"\n"
]
}
],
"source": [
"# Define path\n",
"cooccur_exe_path = os.path.join(glove_model_path, \"build\", \"cooccur\")\n",
"cooccurrence_file_path = os.path.join(SAVE_FILES_PATH, \"cooccurrence.bin\")\n",
"# Execute shell commands\n",
"!$cooccur_exe_path -memory 4 -vocab-file $vocab_file_path -verbose 2 -window-size 15 <$training_corpus_file_path> $cooccurrence_file_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 3: Shuffle the Co-occurrences**\n",
"\n",
"Run the shuffle executable. The parameters are as follows:\n",
"1. verbose: 0, 1, or 2 (default)\n",
"2. memory: soft limit for memory consumption, default 4\n",
"3. array-size: limit to the length of the buffer which stores chunks of data to shuffle before writing to disk\n",
"\n",
"Then provide the path to the co-occurrence file we created in Step 2 followed by a file path that we'll save the shuffled co-occurrences to"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SHUFFLING COOCCURRENCES\r\n",
"array size: 255013683\r\n",
"Shuffling by chunks: processed 0 lines.\u001b[22Gprocessed 188154 lines.\r\n",
"Wrote 1 temporary file(s).\r\n",
"Merging temp files: processed 0 lines.\u001b[31G188154 lines.\u001b[0GMerging temp files: processed 188154 lines.\r\n",
"\r\n"
]
}
],
"source": [
"# Define path\n",
"shuffle_exe_path = os.path.join(glove_model_path, \"build\", \"shuffle\")\n",
"cooccurrence_shuf_file_path = os.path.join(SAVE_FILES_PATH, \"cooccurrence.shuf.bin\")\n",
"# Execute shell commands\n",
"!$shuffle_exe_path -memory 4 -verbose 2 <$cooccurrence_file_path> $cooccurrence_shuf_file_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 4: Train GloVe model**\n",
"\n",
"Run the glove executable. There are many parameter options, but the top ones are listed below:\n",
"1. verbose: 0, 1, or 2 (default)\n",
"2. vector-size: dimension of word embeddings (50 is default)\n",
"3. threads: number of threads, default 8\n",
"4. iter: number of iterations, default 25\n",
"5. eta: learning rate, default 0.05\n",
"6. binary: whether to save binary format (0: text = default, 1: binary, 2: both)\n",
"7. x-max: cutoff for weighting function, default is 100\n",
"8. vocab-file: file containing vocabulary as produced in Step 1\n",
"9. save-file: filename to save vectors to \n",
"10. input-file: filename with co-occurrences as returned from Step 3"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAINING MODEL\n",
"Read 188154 lines.\n",
"Initializing parameters...done.\n",
"vector size: 50\n",
"vocab size: 2943\n",
"x_max: 10.000000\n",
"alpha: 0.750000\n",
"08/13/19 - 05:39.53PM, iter: 001, cost: 0.078545\n",
"08/13/19 - 05:39.53PM, iter: 002, cost: 0.072337\n",
"08/13/19 - 05:39.53PM, iter: 003, cost: 0.070195\n",
"08/13/19 - 05:39.53PM, iter: 004, cost: 0.066766\n",
"08/13/19 - 05:39.53PM, iter: 005, cost: 0.063480\n",
"08/13/19 - 05:39.53PM, iter: 006, cost: 0.060623\n",
"08/13/19 - 05:39.53PM, iter: 007, cost: 0.058089\n",
"08/13/19 - 05:39.53PM, iter: 008, cost: 0.056030\n",
"08/13/19 - 05:39.53PM, iter: 009, cost: 0.053907\n",
"08/13/19 - 05:39.53PM, iter: 010, cost: 0.051774\n",
"08/13/19 - 05:39.53PM, iter: 011, cost: 0.049576\n",
"08/13/19 - 05:39.53PM, iter: 012, cost: 0.047385\n",
"08/13/19 - 05:39.53PM, iter: 013, cost: 0.045207\n",
"08/13/19 - 05:39.53PM, iter: 014, cost: 0.043098\n",
"08/13/19 - 05:39.53PM, iter: 015, cost: 0.041065\n"
]
}
],
"source": [
"# Define path\n",
"glove_exe_path = os.path.join(glove_model_path, \"build\", \"glove\")\n",
"glove_vector_file_path = os.path.join(SAVE_FILES_PATH, \"GloVe_vectors\")\n",
"# Execute shell commands\n",
"!$glove_exe_path -save-file $glove_vector_file_path -threads 8 -input-file \\\n",
"$cooccurrence_shuf_file_path -x-max 10 -iter 15 -vector-size 50 -binary 2 \\\n",
"-vocab-file $vocab_file_path -verbose 2"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"t.stop()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed: 3.4293\n"
]
}
],
"source": [
"print(\"Time elapsed: {}\".format(t))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Inspect Word Vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Like we did above for the word2vec and fastText models, let's now inspect our word embeddings"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"#load in the saved word vectors.\n",
"glove_wv = {}\n",
"glove_vector_txt_file_path = os.path.join(SAVE_FILES_PATH, \"GloVe_vectors.txt\")\n",
"with open(glove_vector_txt_file_path, encoding='utf-8') as f:\n",
" for line in f:\n",
" split_line = line.split(\" \")\n",
" glove_wv[split_line[0]] = [float(i) for i in split_line[1:]]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [-0.037004, -0.000665, -0.028638, 0.025758, -0.050187, 0.038694, 0.016966, -0.042032, -0.033963, 0.143667, -0.068749, -0.005046, 0.180022, 0.088593, -0.04615, -0.013351, 0.064172, 0.051637, -0.000885, 0.009899, -0.092548, -0.026595, 0.036515, -0.09158, -0.027992, 0.016924, -0.024003, -0.029879, 0.252747, 0.093754, -0.034897, 0.079439, -0.073516, -0.110923, 0.095652, 0.072123, -0.047069, -0.17929, -0.068377, -0.224694, -0.016158, 0.236704, 0.010695, -0.133073, 0.084929, 0.102969, 0.040056, -0.009444, -0.051333, 0.130339]\n",
"\n",
"First 30 vocabulary words: ['.', ',', 'man', '-', '\"', 'woman', \"'\", 'said', 'dog', 'playing', ':', 'white', 'black', '$', 'killed', 'percent', 'new', 'syria', 'people', 'china']\n"
]
}
],
"source": [
"# 1. Let's see the word embedding for \"apple\" by passing in \"apple\" as the key.\n",
"print(\"Embedding for apple:\", glove_wv[\"apple\"])\n",
"\n",
"# 2. Inspect the model vocabulary by listing the keys of the \"glove_wv\" dictionary. We'll print the first 20 words.\n",
"print(\"\\nFirst 30 vocabulary words:\", list(glove_wv.keys())[:20])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Concluding Remarks\n",
"\n",
"In this notebook we have shown how to train word2vec, GloVe, and fastText word embeddings on the STS Benchmark dataset. We also inspected how long each model took to train on our dataset: word2vec took 0.39 seconds, GloVe took 3.43 seconds, and fastText took 9.37 seconds.\n",
"\n",
"FastText is typically regarded as the best baseline for word embeddings (see [blog](https://medium.com/huggingface/universal-word-sentence-embeddings-ce48ddc8fc3a)) and is a good place to start when generating word embeddings. Now that we generated word embeddings on our dataset, we could also repeat the baseline_deep_dive notebook using these embeddings (versus the pre-trained ones from the internet). "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: examples/entailment/README.md
================================================
# Natural Language Inference (NLI)
This folder provides end-to-end examples of building Natural Language Inference (NLI) models. We
demonstrate the best practices of data preprocessing and model building for the NLI task and use the
utility scripts in the [utils_nlp](../../utils_nlp) folder to speed up these processes.
NLI is one of many NLP tasks that require robust compositional sentence understanding, but it's
simpler compared to other tasks like question answering and machine translation.
Currently, we focus on fine-tuning a pre-trained BERT model. If you are interested in pre-training your own BERT model, you can view the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT), which walks through the process in depth. We plan to continue adding state-of-the-art models as they come up and welcome community contributions.
## Natural Language Inference
Natural Language Inference or Recognizing Textual Entailment (RTE) is the task of classifying
a pair of premise and hypothesis sentences into three classes: contradiction, neutral, and
entailment. For example,
|Premise|Hypothesis|Label|
|-------|----------|-----|
|A man inspects the uniform of a figure in some East Asian country.|The man is sleeping.|contradiction|
|An older and younger man smiling.|Two men are smiling and laughing at the cats playing on the floor.|neutral|
|A soccer game with multiple males playing.|Some men are playing a sport.|entailment|
## Summary
|Notebook|Environment|Description|Dataset| Language |
|--------|:-----------:|-------|----------|---------|
|[entailment_multinli_transformers.ipynb](entailment_multinli_transformers.ipynb)|Local|Fine-tuning of pre-trained BERT model for NLI|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| en |
|[entailment_xnli_bert_azureml.ipynb](entailment_xnli_bert_azureml.ipynb)|AzureML|**Distributed** fine-tuning of pre-trained BERT model for NLI|[XNLI](https://www.nyu.edu/projects/bowman/xnli/)| en |
================================================
FILE: examples/entailment/entailment_multinli_transformers.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Copyright (c) Microsoft Corporation. All rights reserved.* \n",
"\n",
"*Licensed under the MIT License.*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Natural Language Inference on MultiNLI Dataset using Transformers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Before You Start\n",
"\n",
"It takes about 4 hours to fine-tune the `bert-large-cased` model on a Standard_NC24rs_v3 Azure Data Science Virtual Machine with 4 NVIDIA Tesla V100 GPUs. \n",
"> **Tip:** If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"\n",
"\n",
"If you run into a CUDA out-of-memory error, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
"QUICK_RUN = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"In this notebook, we demonstrate fine-tuning pretrained transformer models to perform Natural Language Inference (NLI). We use the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n",
"To classify a sentence pair, we concatenate the tokens in both sentences and separate the sentences by the special [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task. The NLI task essentially becomes a sequence classification task. For example, the figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. \n",
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">\n",
"\n",
"We compare the training time and performance of bert-large-cased and xlnet-large-cased. The model used can be set in the **Configurations** section. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import sys, os\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"import scrapbook as sb\n",
"\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"import numpy as np\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"import torch\n",
"\n",
"from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see all the models supported by `SequenceClassifier`, call the `list_supported_models` method. \n",
"**Note**: Although `SequenceClassifier` supports distilbert for single sequence classification, distilbert doesn't support sentence pair classification and cannot be used in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SequenceClassifier.list_supported_models()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"MODEL_NAME = \"bert-large-cased\"\n",
"TO_LOWER = False\n",
"BATCH_SIZE = 16\n",
"\n",
"# MODEL_NAME = \"xlnet-large-cased\"\n",
"# TO_LOWER = False\n",
"# BATCH_SIZE = 16\n",
"\n",
"TRAIN_DATA_USED_FRACTION = 1\n",
"DEV_DATA_USED_FRACTION = 1\n",
"NUM_EPOCHS = 2\n",
"WARMUP_STEPS= 2500\n",
"\n",
"if QUICK_RUN:\n",
" TRAIN_DATA_USED_FRACTION = 0.001\n",
" DEV_DATA_USED_FRACTION = 0.01\n",
" NUM_EPOCHS = 1\n",
" WARMUP_STEPS= 10\n",
"\n",
"if not torch.cuda.is_available():\n",
" BATCH_SIZE = BATCH_SIZE/2\n",
"\n",
"RANDOM_SEED = 42\n",
"\n",
"# model configurations\n",
"MAX_SEQ_LENGTH = 128\n",
"\n",
"# optimizer configurations\n",
"LEARNING_RATE= 5e-5\n",
"\n",
"# data configurations\n",
"TEXT_COL_1 = \"sentence1\"\n",
"TEXT_COL_2 = \"sentence2\"\n",
"LABEL_COL = \"gold_label\"\n",
"LABEL_COL_NUM = \"gold_label_num\"\n",
"\n",
"CACHE_DIR = TemporaryDirectory().name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data\n",
"The MultiNLI dataset comes with three subsets: train, dev_matched, and dev_mismatched. The dev_matched dataset is from the same genres as the train dataset, while the dev_mismatched dataset is from genres not seen in the training dataset. \n",
"The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\")\n",
"dev_df_matched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev_matched\")\n",
"dev_df_mismatched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev_mismatched\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dev_df_matched = dev_df_matched.loc[dev_df_matched['gold_label'] != '-']\n",
"dev_df_mismatched = dev_df_mismatched.loc[dev_df_mismatched['gold_label'] != '-']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Training dataset size: {}\".format(train_df.shape[0]))\n",
"print(\"Development (matched) dataset size: {}\".format(dev_df_matched.shape[0]))\n",
"print(\"Development (mismatched) dataset size: {}\".format(dev_df_mismatched.shape[0]))\n",
"print()\n",
"print(train_df[['gold_label', 'sentence1', 'sentence2']].head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sample\n",
"train_df = train_df.sample(frac=TRAIN_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"label_encoder = LabelEncoder()\n",
"train_labels = label_encoder.fit_transform(train_df[LABEL_COL])\n",
"train_df[LABEL_COL_NUM] = train_labels \n",
"num_labels = len(np.unique(train_labels))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenize and Preprocess\n",
"Before training, we tokenize and preprocess the sentence texts to convert them into the format required by transformer model classes. \n",
"The `dataset_from_dataframe` method of the `Processor` class performs the following preprocessing steps and returns a PyTorch `Dataset`:\n",
"* Tokenize input texts using the tokenizer of the pre-trained model specified by `model_name`. \n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"* Pad or truncate the token lists to the specified max length."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"processor = Processor(model_name=MODEL_NAME, cache_dir=CACHE_DIR, to_lower=TO_LOWER)\n",
"\n",
"train_dataset = processor.dataset_from_dataframe(\n",
" df=train_df,\n",
" text_col=TEXT_COL_1,\n",
" label_col=LABEL_COL_NUM,\n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
")\n",
"dev_dataset_matched = processor.dataset_from_dataframe(\n",
" df=dev_df_matched,\n",
" text_col=TEXT_COL_1, \n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
")\n",
"dev_dataset_mismatched = processor.dataset_from_dataframe(\n",
" df=dev_df_mismatched,\n",
" text_col=TEXT_COL_1, \n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, shuffle=True\n",
")\n",
"dev_dataloader_matched = dataloader_from_dataset(\n",
" dev_dataset_matched, batch_size=BATCH_SIZE, shuffle=False\n",
")\n",
"dev_dataloader_mismatched = dataloader_from_dataset(\n",
" dev_dataset_mismatched, batch_size=BATCH_SIZE, shuffle=False\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train and Predict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"classifier = SequenceClassifier(\n",
" model_name=MODEL_NAME, num_labels=num_labels, cache_dir=CACHE_DIR\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train Classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"with Timer() as t:\n",
" classifier.fit(\n",
" train_dataloader,\n",
" num_epochs=NUM_EPOCHS,\n",
" learning_rate=LEARNING_RATE,\n",
" warmup_steps=WARMUP_STEPS,\n",
" )\n",
"\n",
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict on Test Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with Timer() as t:\n",
" predictions_matched = classifier.predict(dev_dataloader_matched)\n",
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with Timer() as t:\n",
" predictions_mismatched = classifier.predict(dev_dataloader_mismatched)\n",
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions_matched = label_encoder.inverse_transform(predictions_matched)\n",
"print(classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"predictions_mismatched = label_encoder.inverse_transform(predictions_mismatched)\n",
"print(classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare Model Performance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"|Model name|Training time|Scoring time|Matched F1|Mismatched F1|\n",
"|:--------:|:-----------:|:----------:|:--------:|:-----------:|\n",
"|xlnet-large-cased|5.15 hrs|0.11 hrs|0.887|0.890|\n",
"|bert-large-cased|4.01 hrs|0.08 hrs|0.867|0.867|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_matched_dict = classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3, output_dict=True)\n",
"result_mismatched_dict = classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3, output_dict=True)\n",
"sb.glue(\"matched_precision\", result_matched_dict[\"weighted avg\"][\"precision\"])\n",
"sb.glue(\"matched_recall\", result_matched_dict[\"weighted avg\"][\"recall\"])\n",
"sb.glue(\"matched_f1\", result_matched_dict[\"weighted avg\"][\"f1-score\"])\n",
"sb.glue(\"mismatched_precision\", result_mismatched_dict[\"weighted avg\"][\"precision\"])\n",
"sb.glue(\"mismatched_recall\", result_mismatched_dict[\"weighted avg\"][\"recall\"])\n",
"sb.glue(\"mismatched_f1\", result_mismatched_dict[\"weighted avg\"][\"f1-score\"])"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "nlp_gpu",
"language": "python",
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: examples/entailment/entailment_xnli_bert_azureml.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Natural Language Inference on XNLI Dataset using BERT with Azure Machine Learning\n",
"\n",
"## 1. Summary\n",
"In this notebook, we demonstrate how to fine-tune BERT using distributed training (Horovod) on the Azure Machine Learning service to perform natural language inference in English. We use the [XNLI](https://github.com/facebookresearch/XNLI) dataset to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n",
"\n",
"The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens of each sentence pair and separates the two sentences with the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\n",
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">\n",
"\n",
"**Note: To learn how to do pre-training on your own, please reference the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT) created by Microsoft.**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"\n",
"import sys\n",
"\n",
"sys.path.append(\"../..\")\n",
"\n",
"import os\n",
"import shutil\n",
"import torch\n",
"import json\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.train.dnn import PyTorch\n",
"from azureml.core.runconfig import MpiConfiguration\n",
"from azureml.core import Experiment\n",
"from azureml.widgets import RunDetails\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.exceptions import ComputeTargetException\n",
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace, get_output_files"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Parameters\n",
"\n",
"DEBUG = True\n",
"NODE_COUNT = 4\n",
"NUM_PROCESS = 1\n",
"DATA_PERCENT_USED = 1.0\n",
"\n",
"config_path = (\n",
" \"./.azureml\"\n",
") # Path to the directory containing config.json with azureml credentials\n",
"\n",
"# Azure resources\n",
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
"workspace_region = \"YOUR_WORKSPACE_REGION\" # eg: eastus, eastus2.\n",
"cluster_name = \"gpu-entail\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. AzureML Setup\n",
"\n",
"### 2.1 Initialize a Workspace\n",
"\n",
"The following cell sets up the connection to your [Azure Machine Learning service Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). You can choose to connect to an existing workspace or create a new one. \n",
"\n",
"**To access an existing workspace:**\n",
"1. If you have a `config.json` file, you do not need to provide the workspace information; you only need to set the `config_path` variable defined above to the directory that contains the file.\n",
"2. Otherwise, you will need to supply the following:\n",
" * The name of your workspace\n",
" * Your subscription id\n",
" * The resource group name\n",
"\n",
"**To create a new workspace:**\n",
"\n",
"Set the following information:\n",
"* A name for your workspace\n",
"* Your subscription id\n",
"* The resource group name\n",
"* [Azure region](https://azure.microsoft.com/en-us/global-infrastructure/regions/) to create the workspace in, such as `eastus2`. \n",
"\n",
"This will automatically create a new resource group for you in the region provided if a resource group with the name given does not already exist. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ws = get_or_create_workspace(\n",
" config_path=config_path,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" workspace_name=workspace_name,\n",
" workspace_region=workspace_region,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\n",
" \"Workspace name: \" + ws.name,\n",
" \"Azure region: \" + ws.location,\n",
" \"Subscription id: \" + ws.subscription_id,\n",
" \"Resource group: \" + ws.resource_group,\n",
" sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 Link AmlCompute Compute Target\n",
"\n",
"We need to link a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training our model (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for an explanation of the different options). In this example, we use an [AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) target: we link to an existing target if one named `cluster_name` exists, or otherwise create a STANDARD_NC6 GPU cluster (autoscales from 0 to 4 nodes). Creating a new AmlCompute cluster takes approximately 5 minutes. \n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found compute target: gpu-entail\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-08-03T13:43:20.068000+00:00', 'errors': None, 'creationTime': '2019-07-27T02:14:46.127092+00:00', 'modifiedTime': '2019-07-27T02:15:07.181277+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6S_V2'}\n"
]
}
],
"source": [
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print(\"Found compute target: {}\".format(cluster_name))\n",
"except ComputeTargetException:\n",
" print(\"Creating new compute target: {}\".format(cluster_name))\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=\"STANDARD_NC6\", max_nodes=NODE_COUNT\n",
" )\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"\n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'./entail_utils\\\\utils_nlp'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"project_dir = \"./entail_utils\"\n",
"if DEBUG and os.path.exists(project_dir):\n",
" shutil.rmtree(project_dir)\n",
"shutil.copytree(\"../../utils_nlp\", os.path.join(project_dir, \"utils_nlp\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Prepare Training Script"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing ./entail_utils/train.py\n"
]
}
],
"source": [
"%%writefile $project_dir/train.py\n",
"import horovod.torch as hvd\n",
"import torch\n",
"import numpy as np\n",
"import time\n",
"import argparse\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset.xnli_torch_dataset import XnliDataset\n",
"from utils_nlp.models.bert.common import Language\n",
"from utils_nlp.models.bert.sequence_classification_distributed import (\n",
" BERTSequenceClassifier,\n",
")\n",
"from sklearn.metrics import classification_report\n",
"\n",
"print(\"Torch version:\", torch.__version__)\n",
"\n",
"hvd.init()\n",
"\n",
"LANGUAGE_ENGLISH = \"en\"\n",
"TRAIN_FILE_SPLIT = \"train\"\n",
"TEST_FILE_SPLIT = \"test\"\n",
"TO_LOWERCASE = True\n",
"PRETRAINED_BERT_LNG = Language.ENGLISH\n",
"LEARNING_RATE = 5e-5\n",
"WARMUP_PROPORTION = 0.1\n",
"BATCH_SIZE = 32\n",
"NUM_GPUS = 1\n",
"OUTPUT_DIR = \"./outputs/\"\n",
"LABELS = [\"contradiction\", \"entailment\", \"neutral\"]\n",
"\n",
"## each machine gets its own copy of data\n",
"CACHE_DIR = \"./xnli-%d\" % hvd.rank()\n",
"\n",
"parser = argparse.ArgumentParser()\n",
"# Training settings\n",
"parser.add_argument(\n",
" \"--seed\", type=int, default=42, metavar=\"S\", help=\"random seed (default: 42)\"\n",
")\n",
"parser.add_argument(\n",
" \"--epochs\", type=int, default=2, metavar=\"S\", help=\"random seed (default: 2)\"\n",
")\n",
"parser.add_argument(\n",
" \"--no-cuda\", action=\"store_true\", default=False, help=\"disables CUDA training\"\n",
")\n",
"parser.add_argument(\n",
" \"--data_percent_used\",\n",
" type=float,\n",
" default=1.0,\n",
" metavar=\"S\",\n",
" help=\"data percent used (default: 1.0)\",\n",
")\n",
"\n",
"args = parser.parse_args()\n",
"args.cuda = not args.no_cuda and torch.cuda.is_available()\n",
"\n",
"\"\"\"\n",
"Note: For example, you have 4 nodes and 4 GPUs each node, so you spawn 16 workers. \n",
"Every worker will have a rank [0, 15], and every worker will have a local_rank [0, 3]\n",
"\"\"\"\n",
"if args.cuda:\n",
" torch.cuda.set_device(hvd.local_rank())\n",
" torch.cuda.manual_seed(args.seed)\n",
"\n",
"# num_workers - this is equal to number of gpus per machine\n",
"kwargs = {\"num_workers\": NUM_GPUS, \"pin_memory\": True} if args.cuda else {}\n",
"\n",
"train_dataset = XnliDataset(\n",
" file_split=TRAIN_FILE_SPLIT,\n",
" cache_dir=CACHE_DIR,\n",
" language=LANGUAGE_ENGLISH,\n",
" to_lowercase=TO_LOWERCASE,\n",
" tok_language=PRETRAINED_BERT_LNG,\n",
" data_percent_used=args.data_percent_used,\n",
")\n",
"\n",
"\n",
"# set the label_encoder for evaluation\n",
"label_encoder = train_dataset.label_encoder\n",
"num_labels = len(np.unique(train_dataset.labels))\n",
"\n",
"# Train\n",
"classifier = BERTSequenceClassifier(\n",
" language=Language.ENGLISH,\n",
" num_labels=num_la
gitextract_zstwuz6r/
├── .amlignore
├── .bumpversion.cfg
├── .flake8
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── feature_request.md
│ │ └── general-ask.md
│ ├── ISSUE_TEMPLATE.md
│ └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── DatasetReferences.md
├── LICENSE
├── MANIFEST.in
├── NOTICE.txt
├── README.md
├── SETUP.md
├── VERSIONING.md
├── _config.yml
├── cgmanifest.json
├── docker/
│ └── Dockerfile
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── _config.yml
│ └── source/
│ ├── azureml.rst
│ ├── conf.py
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── annotation/
│ │ ├── Doccano.md
│ │ └── README.md
│ ├── embeddings/
│ │ ├── README.md
│ │ └── embedding_trainer.ipynb
│ ├── entailment/
│ │ ├── README.md
│ │ ├── entailment_multinli_transformers.ipynb
│ │ └── entailment_xnli_bert_azureml.ipynb
│ ├── model_explainability/
│ │ ├── README.md
│ │ └── interpret_dnn_layers.ipynb
│ ├── named_entity_recognition/
│ │ ├── README.md
│ │ └── ner_wikigold_transformer.ipynb
│ ├── question_answering/
│ │ ├── README.md
│ │ ├── bert_run_squad_azureml.py
│ │ ├── bidaf_aml_deep_dive.ipynb
│ │ ├── bidaf_config.json
│ │ ├── pretrained-BERT-SQuAD-deep-dive-aml.ipynb
│ │ ├── question_answering_squad_transformers.ipynb
│ │ └── question_answering_system_bidaf_quickstart.ipynb
│ ├── sentence_similarity/
│ │ ├── README.md
│ │ ├── automl_local_deployment_aci.ipynb
│ │ ├── automl_with_pipelines_deployment_aks.ipynb
│ │ ├── baseline_deep_dive.ipynb
│ │ ├── bert_encoder.ipynb
│ │ ├── bert_senteval.ipynb
│ │ ├── gensen_aml_deep_dive.ipynb
│ │ ├── gensen_config.json
│ │ ├── gensen_local.ipynb
│ │ ├── gensen_train.py
│ │ └── gensen_wrapper.py
│ ├── sentiment_analysis/
│ │ └── absa/
│ │ ├── README.md
│ │ ├── absa.ipynb
│ │ ├── absa_azureml.ipynb
│ │ └── dataset/
│ │ └── data.md
│ ├── text_classification/
│ │ ├── README.md
│ │ ├── tc_bert_azureml.ipynb
│ │ ├── tc_mnli_mtdnn.ipynb
│ │ ├── tc_mnli_transformers.ipynb
│ │ └── tc_multi_languages_transformers.ipynb
│ └── text_summarization/
│ ├── abstractive_summarization_bertsum_cnndm_distributed_train.py
│ ├── abstractive_summarization_bertsumabs_cnndm.ipynb
│ ├── abstractive_summarization_minilm_cnndm.ipynb
│ ├── abstractive_summarization_unilm_cnndm.ipynb
│ ├── abstractive_summarization_unilm_cnndm.py
│ ├── extractive_summarization_cnndm_aml_distributed.ipynb
│ ├── extractive_summarization_cnndm_distributed_train.py
│ ├── extractive_summarization_cnndm_transformer.ipynb
│ └── summarization_evaluation.ipynb
├── pyproject.toml
├── setup.py
├── tests/
│ ├── README.md
│ ├── __init__.py
│ ├── ci/
│ │ ├── azureml_integration_tests.yml
│ │ ├── component_governance.yml
│ │ ├── cpu_integration_tests_linux.yml
│ │ ├── cpu_unit_tests_linux.yml
│ │ ├── gpu_integration_tests_linux.yml
│ │ ├── gpu_unit_tests_linux.yml
│ │ ├── notebooks_cpu_unit_tests_linux.yml
│ │ └── notebooks_gpu_unit_tests_linux.yml
│ ├── conftest.py
│ ├── integration/
│ │ ├── test_ddp_summarization.py
│ │ ├── test_gpu_utils.py
│ │ ├── test_notebooks_abstractive_summarization_bertsumabs.py
│ │ ├── test_notebooks_embeddings.py
│ │ ├── test_notebooks_entailment.py
│ │ ├── test_notebooks_extractive_summarization.py
│ │ ├── test_notebooks_interpretability.py
│ │ ├── test_notebooks_minilm_abstractive_summarization.py
│ │ ├── test_notebooks_named_entity_recognition.py
│ │ ├── test_notebooks_question_answering.py
│ │ ├── test_notebooks_sentence_similarity.py
│ │ ├── test_notebooks_text_classification.py
│ │ └── test_notebooks_unilm_abstractive_summarization.py
│ ├── notebooks_common.py
│ ├── smoke/
│ │ ├── test_dataset.py
│ │ ├── test_gpu_utils.py
│ │ └── test_word_embeddings.py
│ └── unit/
│ ├── test_abstractive_summarization_bertsum.py
│ ├── test_abstractive_summarization_seq2seq.py
│ ├── test_bert_common.py
│ ├── test_bert_encoder.py
│ ├── test_bert_sentence_encoding.py
│ ├── test_common_pytorch_utils.py
│ ├── test_data_loaders.py
│ ├── test_dataset.py
│ ├── test_dataset_pytorch.py
│ ├── test_distributed_sampler.py
│ ├── test_eval_classification.py
│ ├── test_eval_compute_rouge.py
│ ├── test_extractive_summarization.py
│ ├── test_gensen_utils.py
│ ├── test_interpreter.py
│ ├── test_models_transformers_question_answering.py
│ ├── test_notebooks_cpu.py
│ ├── test_notebooks_gpu.py
│ ├── test_preprocess.py
│ ├── test_timer.py
│ ├── test_transformers_sequence_classification.py
│ └── test_transformers_token_classification.py
├── tools/
│ ├── README.md
│ ├── __init__.py
│ ├── generate_conda_file.py
│ ├── generate_requirements_txt.py
│ └── remove_pixelserver.py
└── utils_nlp/
├── README.md
├── __init__.py
├── azureml/
│ ├── README.md
│ ├── __init__.py
│ ├── azureml_bert_util.py
│ └── azureml_utils.py
├── common/
│ ├── README.md
│ ├── __init__.py
│ ├── pytorch_utils.py
│ └── timer.py
├── dataset/
│ ├── README.md
│ ├── __init__.py
│ ├── bbc_hindi.py
│ ├── cnndm.py
│ ├── dac.py
│ ├── data_loaders.py
│ ├── msrpc.py
│ ├── multinli.py
│ ├── ner_utils.py
│ ├── preprocess.py
│ ├── sentence_selection.py
│ ├── snli.py
│ ├── squad.py
│ ├── stsbenchmark.py
│ ├── url_utils.py
│ ├── wikigold.py
│ ├── xnli.py
│ └── xnli_torch_dataset.py
├── eval/
│ ├── README.md
│ ├── SentEval/
│ │ ├── .gitignore
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── senteval/
│ │ │ ├── __init__.py
│ │ │ ├── binary.py
│ │ │ ├── engine.py
│ │ │ ├── mrpc.py
│ │ │ ├── probing.py
│ │ │ ├── rank.py
│ │ │ ├── sick.py
│ │ │ ├── snli.py
│ │ │ ├── sst.py
│ │ │ ├── sts.py
│ │ │ ├── tools/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── classifier.py
│ │ │ │ ├── ranking.py
│ │ │ │ ├── relatedness.py
│ │ │ │ └── validation.py
│ │ │ ├── trec.py
│ │ │ └── utils.py
│ │ └── setup.py
│ ├── __init__.py
│ ├── classification.py
│ ├── evaluate_squad.py
│ ├── evaluate_summarization.py
│ ├── question_answering.py
│ ├── rouge/
│ │ ├── compute_rouge.py
│ │ └── rouge_ext.py
│ └── senteval.py
├── interpreter/
│ ├── Interpreter.py
│ ├── README.md
│ └── __init__.py
├── language_utils/
│ └── hi/
│ └── hindi_stemmer.py
└── models/
├── README.md
├── bert/
│ ├── README.md
│ ├── __init__.py
│ ├── common.py
│ ├── sequence_classification.py
│ ├── sequence_classification_distributed.py
│ ├── sequence_encoding.py
│ └── token_classification.py
├── gensen/
│ ├── README.md
│ ├── __init__.py
│ ├── create_gensen_model.py
│ ├── gensen.py
│ ├── multi_task_model.py
│ ├── preprocess_utils.py
│ └── utils.py
├── glove/
│ ├── Makefile
│ ├── README.md
│ ├── demo.sh
│ └── src/
│ ├── README.md
│ ├── cooccur.c
│ ├── glove.c
│ ├── shuffle.c
│ └── vocab_count.c
├── pretrained_embeddings/
│ ├── README.md
│ ├── __init__.py
│ ├── fasttext.py
│ ├── glove.py
│ └── word2vec.py
├── pytorch_modules/
│ ├── README.md
│ ├── __init__.py
│ └── conditional_gru.py
├── transformers/
│ ├── abstractive_summarization_bertsum.py
│ ├── abstractive_summarization_seq2seq.py
│ ├── bertsum/
│ │ ├── __init__.py
│ │ ├── adam.py
│ │ ├── beam.py
│ │ ├── data_loader.py
│ │ ├── dataset.py
│ │ ├── decoder.py
│ │ ├── encoder.py
│ │ ├── loss.py
│ │ ├── model_builder.py
│ │ ├── neural.py
│ │ ├── optimizers.py
│ │ ├── penalties.py
│ │ └── predictor.py
│ ├── common.py
│ ├── datasets.py
│ ├── extractive_summarization.py
│ ├── named_entity_recognition.py
│ ├── question_answering.py
│ └── sequence_classification.py
└── xlnet/
├── README.md
├── common.py
└── sequence_classification.py
SYMBOL INDEX (963 symbols across 134 files)
FILE: docs/source/conf.py
function skip (line 231) | def skip(app, what, name, obj, would_skip, options):
function setup (line 237) | def setup(app):
FILE: examples/question_answering/bert_run_squad_azureml.py
class SquadExample (line 52) | class SquadExample(object):
method __init__ (line 55) | def __init__(self,
method __str__ (line 69) | def __str__(self):
method __repr__ (line 72) | def __repr__(self):
class InputFeatures (line 85) | class InputFeatures(object):
method __init__ (line 88) | def __init__(self,
function read_squad_examples (line 113) | def read_squad_examples(input_file, is_training):
function convert_examples_to_features (line 182) | def convert_examples_to_features(examples, tokenizer, max_seq_length,
function _improve_answer_span (line 334) | def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
function _check_is_max_context (line 371) | def _check_is_max_context(doc_spans, cur_span_index, position):
function write_predictions (line 413) | def write_predictions(all_examples, all_features, all_results, n_best_size,
function get_final_text (line 547) | def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=...
function _get_best_indexes (line 643) | def _get_best_indexes(logits, n_best_size):
function _compute_softmax (line 655) | def _compute_softmax(scores):
function main (line 678) | def main():
FILE: examples/sentence_similarity/gensen_train.py
function metric_average (line 49) | def metric_average(value, name):
function setup_horovod (line 61) | def setup_horovod(model, learning_rate):
function setup_logging (line 92) | def setup_logging(config):
function log_config (line 107) | def log_config(config):
function evaluate (line 126) | def evaluate(
function evaluate_nli (line 223) | def evaluate_nli(nli_iterator, model, batch_size, n_gpus):
function train (line 274) | def train(config, data_folder, learning_rate=0.0001, max_epoch=None):
function read_config (line 616) | def read_config(json_file):
FILE: examples/sentence_similarity/gensen_wrapper.py
class GenSenClassifier (line 15) | class GenSenClassifier:
method __init__ (line 27) | def __init__(
method _validate_params (line 44) | def _validate_params(self):
method _get_gensen_tokens (line 63) | def _get_gensen_tokens(self, train_df=None, dev_df=None, test_df=None):
method _read_config (line 81) | def _read_config(config_file):
method _create_multiseq2seq_model (line 94) | def _create_multiseq2seq_model(self):
method fit (line 107) | def fit(self, train_df, dev_df, test_df):
method predict (line 128) | def predict(self, sentences):
FILE: examples/text_summarization/abstractive_summarization_bertsum_cnndm_distributed_train.py
function main (line 168) | def main():
function main_worker (line 199) | def main_worker(
FILE: examples/text_summarization/abstractive_summarization_unilm_cnndm.py
function main (line 54) | def main():
FILE: examples/text_summarization/extractive_summarization_cnndm_distributed_train.py
function cleanup (line 141) | def cleanup():
function main (line 150) | def main():
function main_worker (line 173) | def main_worker(local_rank, ngpus_per_node, summarizer, args):
FILE: setup.py
function read (line 15) | def read(*names, **kwargs):
FILE: tests/conftest.py
function scripts (line 28) | def scripts():
function notebooks (line 47) | def notebooks():
function tmp (line 146) | def tmp(tmp_path_factory):
function tmp_module (line 155) | def tmp_module(tmp_path_factory):
function ner_test_data (line 164) | def ner_test_data():
function qa_test_df (line 256) | def qa_test_df():
function pytest_addoption (line 308) | def pytest_addoption(parser):
function subscription_id (line 321) | def subscription_id(request):
function resource_group (line 326) | def resource_group(request):
function workspace_name (line 331) | def workspace_name(request):
function workspace_region (line 336) | def workspace_region(request):
function cluster_name (line 341) | def cluster_name(request):
function bert_english_tokenizer (line 346) | def bert_english_tokenizer():
function xlnet_english_tokenizer (line 351) | def xlnet_english_tokenizer():
function teardown_service (line 356) | def teardown_service(subscription_id, resource_group, workspace_name, wo...
FILE: tests/integration/test_ddp_summarization.py
function test_ddp_extractive_summarization_cnndm_transformers (line 10) | def test_ddp_extractive_summarization_cnndm_transformers(scripts, tmp):
function test_ddp_abstractive_summarization_cnndm_transformers (line 52) | def test_ddp_abstractive_summarization_cnndm_transformers(scripts, tmp):
FILE: tests/integration/test_gpu_utils.py
function test_machine_is_gpu_machine (line 10) | def test_machine_is_gpu_machine():
FILE: tests/integration/test_notebooks_abstractive_summarization_bertsumabs.py
function test_abstractive_summarization_bertsumabs_cnndm (line 15) | def test_abstractive_summarization_bertsumabs_cnndm(notebooks, tmp):
FILE: tests/integration/test_notebooks_embeddings.py
function test_embedding_trainer_runs (line 12) | def test_embedding_trainer_runs(notebooks):
FILE: tests/integration/test_notebooks_entailment.py
function test_entailment_multinli_bert (line 17) | def test_entailment_multinli_bert(notebooks, tmp):
function test_entailment_xnli_bert_azureml (line 42) | def test_entailment_xnli_bert_azureml(
FILE: tests/integration/test_notebooks_extractive_summarization.py
function test_extractive_summarization_cnndm_transformers (line 14) | def test_extractive_summarization_cnndm_transformers(notebooks, tmp):
function test_extractive_summarization_cnndm_transformers_processed (line 41) | def test_extractive_summarization_cnndm_transformers_processed(notebooks...
FILE: tests/integration/test_notebooks_interpretability.py
function test_deep_and_unified_understanding (line 13) | def test_deep_and_unified_understanding(notebooks):
FILE: tests/integration/test_notebooks_minilm_abstractive_summarization.py
function test_minilm_abstractive_summarization (line 15) | def test_minilm_abstractive_summarization(notebooks, tmp):
function test_minilm_abstractive_summarization (line 40) | def test_minilm_abstractive_summarization(notebooks, tmp):
FILE: tests/integration/test_notebooks_named_entity_recognition.py
function test_ner_wikigold_bert (line 13) | def test_ner_wikigold_bert(notebooks, tmp):
FILE: tests/integration/test_notebooks_question_answering.py
function test_question_answering_squad_transformers (line 13) | def test_question_answering_squad_transformers(notebooks, tmp):
function test_bidaf_deep_dive (line 38) | def test_bidaf_deep_dive(
function test_bidaf_quickstart (line 65) | def test_bidaf_quickstart(
function test_bert_qa_runs (line 88) | def test_bert_qa_runs(notebooks, subscription_id, resource_group, worksp...
FILE: tests/integration/test_notebooks_sentence_similarity.py
function baseline_results (line 15) | def baseline_results():
function test_similarity_embeddings_baseline_runs (line 37) | def test_similarity_embeddings_baseline_runs(notebooks, baseline_results):
function test_gensen_local (line 50) | def test_gensen_local(notebooks):
function test_bert_encoder (line 73) | def test_bert_encoder(notebooks, tmp):
function test_bert_senteval (line 87) | def test_bert_senteval(
function test_similarity_embeddings_baseline_runs (line 116) | def test_similarity_embeddings_baseline_runs(notebooks, baseline_results):
function test_automl_local_deployment_aci (line 127) | def test_automl_local_deployment_aci(
function test_gensen_aml_deep_dive (line 154) | def test_gensen_aml_deep_dive(notebooks):
function test_automl_with_pipelines_deployment_aks (line 181) | def test_automl_with_pipelines_deployment_aks(notebooks):
FILE: tests/integration/test_notebooks_text_classification.py
function test_tc_mnli_transformers (line 18) | def test_tc_mnli_transformers(notebooks, tmp):
function test_tc_bert_azureml (line 43) | def test_tc_bert_azureml(
function test_multi_languages_transformer (line 80) | def test_multi_languages_transformer(notebooks, tmp):
FILE: tests/integration/test_notebooks_unilm_abstractive_summarization.py
function test_unilm_abstractive_summarization (line 15) | def test_unilm_abstractive_summarization(notebooks, tmp):
function test_unilm_abstractive_summarization (line 41) | def test_unilm_abstractive_summarization(notebooks, tmp):
FILE: tests/notebooks_common.py
function path_notebooks (line 12) | def path_notebooks():
FILE: tests/smoke/test_dataset.py
function test_msrpc_download (line 12) | def test_msrpc_download(tmp_path):
function test_msrpc_load_df (line 20) | def test_msrpc_load_df(tmp_path):
function test_xnli (line 27) | def test_xnli(tmp_path):
FILE: tests/smoke/test_gpu_utils.py
function test_machine_is_gpu_machine (line 10) | def test_machine_is_gpu_machine():
FILE: tests/smoke/test_word_embeddings.py
function test_load_pretrained_vectors_word2vec (line 22) | def test_load_pretrained_vectors_word2vec(tmp_path):
function test_load_pretrained_vectors_glove (line 33) | def test_load_pretrained_vectors_glove(tmp_path):
function test_load_pretrained_vectors_fasttext (line 44) | def test_load_pretrained_vectors_fasttext(tmp_path):
FILE: tests/unit/test_abstractive_summarization_bertsum.py
function source_data (line 21) | def source_data():
function target_data (line 35) | def target_data():
function test_dataset_for_bertsumabs (line 52) | def test_dataset_for_bertsumabs(tmp_module):
function test_train_model (line 87) | def test_train_model(tmp_module, test_dataset_for_bertsumabs, batch_size...
function test_finetuned_model (line 135) | def test_finetuned_model(
FILE: tests/unit/test_abstractive_summarization_seq2seq.py
function s2s_test_data (line 28) | def s2s_test_data():
function test_S2SAbstractiveSummarizer (line 87) | def test_S2SAbstractiveSummarizer(s2s_test_data, tmp, model_name):
function test_S2SAbsSumProcessor (line 151) | def test_S2SAbsSumProcessor(s2s_test_data, tmp):
function test_S2SConfig (line 229) | def test_S2SConfig(tmp):
FILE: tests/unit/test_bert_common.py
function test_tokenize (line 9) | def test_tokenize(bert_english_tokenizer):
function test_tokenize_ner (line 19) | def test_tokenize_ner(ner_test_data, bert_english_tokenizer):
function test_create_data_loader (line 72) | def test_create_data_loader(ner_test_data):
FILE: tests/unit/test_bert_encoder.py
function data (line 10) | def data():
function test_encoder (line 13) | def test_encoder(tmp, data):
FILE: tests/unit/test_bert_sentence_encoding.py
function data (line 12) | def data():
function test_sentence_encoding (line 21) | def test_sentence_encoding(tmp, data):
FILE: tests/unit/test_common_pytorch_utils.py
function model (line 19) | def model():
function test_get_device_cpu (line 23) | def test_get_device_cpu():
function test_machine_is_gpu_machine (line 35) | def test_machine_is_gpu_machine():
function test_get_device_gpu (line 40) | def test_get_device_gpu():
function test_get_device_all_gpus (line 52) | def test_get_device_all_gpus():
function test_get_device_local_rank (line 60) | def test_get_device_local_rank():
function test_get_device_local_rank_cpu (line 68) | def test_get_device_local_rank_cpu():
function test_move_to_device_cpu (line 75) | def test_move_to_device_cpu(model):
function test_move_to_device_cpu_parallelized (line 82) | def test_move_to_device_cpu_parallelized(model):
function test_move_to_device_exception_not_torch_device (line 92) | def test_move_to_device_exception_not_torch_device(model):
function test_move_to_device_exception_wrong_type (line 98) | def test_move_to_device_exception_wrong_type(model):
function test_move_to_device_exception_gpu_model_on_cpu_machine (line 108) | def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
function test_parallelize_model_exception_cuda_zero_gpus (line 115) | def test_parallelize_model_exception_cuda_zero_gpus(model):
function test_parallelize_model (line 123) | def test_parallelize_model(model):
FILE: tests/unit/test_data_loaders.py
function csv_file (line 21) | def csv_file(tmpdir):
function json_file (line 39) | def json_file(tmpdir):
function test_dask_csv_rnd_loader (line 53) | def test_dask_csv_rnd_loader(csv_file):
function test_dask_csv_seq_loader (line 75) | def test_dask_csv_seq_loader(csv_file):
function test_dask_json_rnd_loader (line 95) | def test_dask_json_rnd_loader(json_file):
function test_dask_json_seq_loader (line 117) | def test_dask_json_seq_loader(json_file):
FILE: tests/unit/test_dataset.py
function ner_utils_test_data (line 23) | def ner_utils_test_data(scope="module"):
function test_maybe_download (line 78) | def test_maybe_download():
function test_msrpc (line 90) | def test_msrpc():
function test_wikigold (line 95) | def test_wikigold(tmp_path):
function test_ner_utils (line 114) | def test_ner_utils(ner_utils_test_data):
function test_xnli (line 119) | def test_xnli(tmp_path):
function test_snli (line 126) | def test_snli(tmp_path):
function test_squad (line 135) | def test_squad(tmp_path):
function test_CNNDMSummarizationDatasetOrg (line 157) | def test_CNNDMSummarizationDatasetOrg(tmp):
FILE: tests/unit/test_dataset_pytorch.py
function test_QADataset (line 4) | def test_QADataset(qa_test_df):
FILE: tests/unit/test_distributed_sampler.py
function test_sampler (line 8) | def test_sampler():
FILE: tests/unit/test_eval_classification.py
function test_compute (line 9) | def test_compute():
FILE: tests/unit/test_eval_compute_rouge.py
function rouge_test_data (line 29) | def rouge_test_data():
function test_compute_rouge_perl (line 128) | def test_compute_rouge_perl(rouge_test_data):
function test_compute_rouge_python (line 144) | def test_compute_rouge_python(rouge_test_data):
function test_compute_rouge_python_hi (line 160) | def test_compute_rouge_python_hi(rouge_test_data):
function test_compute_rouge_perl_file (line 176) | def test_compute_rouge_perl_file(rouge_test_data, tmp):
function test_compute_rouge_python_file (line 200) | def test_compute_rouge_python_file(rouge_test_data, tmp):
FILE: tests/unit/test_extractive_summarization.py
function source_data (line 18) | def source_data():
function target_data (line 27) | def target_data():
function data (line 38) | def data(tmp_module):
function test_bert_training (line 70) | def test_bert_training(data, tmp_module):
FILE: tests/unit/test_gensen_utils.py
function test_gensen_preprocess (line 12) | def test_gensen_preprocess(tmp_path):
function test_data_iterator (line 54) | def test_data_iterator():
FILE: tests/unit/test_interpreter.py
function fixed_length_Phi (line 18) | def fixed_length_Phi(x):
function variable_length_Phi (line 22) | def variable_length_Phi(function):
function fixed_length_interp (line 27) | def fixed_length_interp():
function variable_length_interp (line 34) | def variable_length_interp():
function test_fixed_length_regularization (line 43) | def test_fixed_length_regularization():
function test_variable_length_regularization (line 54) | def test_variable_length_regularization():
function test_initialize_interpreter (line 73) | def test_initialize_interpreter():
function test_train_fixed_length_interp (line 82) | def test_train_fixed_length_interp(fixed_length_interp):
function test_train_variable_length_interp (line 94) | def test_train_variable_length_interp(variable_length_interp):
function test_interpreter_get_simga (line 106) | def test_interpreter_get_simga(fixed_length_interp):
FILE: tests/unit/test_models_transformers_question_answering.py
function qa_test_data (line 23) | def qa_test_data(qa_test_df, tmp_module):
function test_QAProcessor (line 145) | def test_QAProcessor(qa_test_data, tmp_module):
function test_AnswerExtractor (line 195) | def test_AnswerExtractor(qa_test_data, tmp_module):
function test_postprocess_bert_answer (line 239) | def test_postprocess_bert_answer(qa_test_data, tmp_module):
function test_postprocess_xlnet_answer (line 274) | def test_postprocess_xlnet_answer(qa_test_data, tmp_module):
FILE: tests/unit/test_notebooks_cpu.py
function test_bert_encoder (line 12) | def test_bert_encoder(notebooks, tmp):
FILE: tests/unit/test_notebooks_gpu.py
function test_bert_encoder (line 13) | def test_bert_encoder(notebooks, tmp):
FILE: tests/unit/test_preprocess.py
function df_sentences (line 12) | def df_sentences():
function test_to_lowercase_all (line 31) | def test_to_lowercase_all(df_sentences):
function test_to_lowercase_subset (line 38) | def test_to_lowercase_subset(df_sentences):
function test_to_spacy_tokens (line 45) | def test_to_spacy_tokens(df_sentences):
function test_rm_spacy_stopwords (line 62) | def test_rm_spacy_stopwords(df_sentences):
function test_to_nltk_tokens (line 73) | def test_to_nltk_tokens(df_sentences):
function test_rm_nltk_stopwords (line 90) | def test_rm_nltk_stopwords(df_sentences):
function test_convert_to_unicode (line 101) | def test_convert_to_unicode():
FILE: tests/unit/test_timer.py
function t (line 14) | def t():
function test_no_time (line 18) | def test_no_time(t):
function test_stop_before_start (line 23) | def test_stop_before_start(t):
function test_interval_before_stop (line 28) | def test_interval_before_stop(t):
function test_timer (line 34) | def test_timer(t):
function test_timer_format (line 48) | def test_timer_format(t):
FILE: tests/unit/test_transformers_sequence_classification.py
function data (line 15) | def data():
function test_classifier (line 20) | def test_classifier(data, tmpdir):
function test_classifier_gpu_train_cpu_predict (line 37) | def test_classifier_gpu_train_cpu_predict(data, tmpdir):
FILE: tests/unit/test_transformers_token_classification.py
function test_token_classifier_fit_predict (line 15) | def test_token_classifier_fit_predict(tmpdir, ner_test_data):
FILE: tools/remove_pixelserver.py
function remove_pixelserver_from_notebook (line 15) | def remove_pixelserver_from_notebook(file_path):
function get_all_notebook_files (line 52) | def get_all_notebook_files():
function main (line 69) | def main():
FILE: utils_nlp/azureml/azureml_bert_util.py
function warmup_linear (line 35) | def warmup_linear(x, warmup=0.002):
function adjust_gradient_accumulation_steps (line 41) | def adjust_gradient_accumulation_steps(x, initial_steps, target_steps, w...
class DistributedCommunicator (line 45) | class DistributedCommunicator:
method __init__ (line 48) | def __init__(self, accumulation_step=1):
method register_model (line 67) | def register_model(self, model, fp16):
method _allreduce_tensor (line 91) | def _allreduce_tensor(self, p):
method _make_hook (line 103) | def _make_hook(self, p):
method synchronize (line 110) | def synchronize(self):
method set_accumulation_step (line 142) | def set_accumulation_step(self, accumulation_step):
FILE: utils_nlp/azureml/azureml_utils.py
function get_auth (line 16) | def get_auth():
function get_or_create_workspace (line 31) | def get_or_create_workspace(
function get_or_create_amlcompute (line 87) | def get_or_create_amlcompute(
function get_output_files (line 139) | def get_output_files(run, output_path, file_names=None):
FILE: utils_nlp/common/pytorch_utils.py
function get_device (line 11) | def get_device(num_gpus=None, gpu_ids=None, local_rank=-1):
function move_model_to_device (line 30) | def move_model_to_device(model, device):
function parallelize_model (line 44) | def parallelize_model(model, device, num_gpus=None, gpu_ids=None, local_...
function dataloader_from_dataset (line 103) | def dataloader_from_dataset(
function compute_training_steps (line 135) | def compute_training_steps(
function get_amp (line 167) | def get_amp(fp16):
FILE: utils_nlp/common/timer.py
class Timer (line 9) | class Timer(object):
method __init__ (line 29) | def __init__(self):
method __enter__ (line 34) | def __enter__(self):
method __exit__ (line 38) | def __exit__(self, *args):
method __str__ (line 41) | def __str__(self):
method start (line 44) | def start(self):
method stop (line 49) | def stop(self):
method interval (line 62) | def interval(self):
FILE: utils_nlp/dataset/__init__.py
class Split (line 11) | class Split(str, Enum):
FILE: utils_nlp/dataset/bbc_hindi.py
function load_pandas_df (line 27) | def load_pandas_df(local_cache_path=TemporaryDirectory().name):
function load_tc_dataset (line 59) | def load_tc_dataset(
function get_label_values (line 172) | def get_label_values(label_encoder, label_ids):
FILE: utils_nlp/dataset/cnndm.py
function _clean (line 46) | def _clean(x):
function _remove_ttags (line 52) | def _remove_ttags(line):
function _target_sentence_tokenization (line 60) | def _target_sentence_tokenization(line):
function join (line 64) | def join(sentences):
function CNNDMSummarizationDataset (line 68) | def CNNDMSummarizationDataset(*args, **kwargs):
class CNNDMBertSumProcessedData (line 147) | class CNNDMBertSumProcessedData:
method download (line 153) | def download(local_path=".data"):
function detokenize (line 168) | def detokenize(line):
function CNNDMSummarizationDatasetOrg (line 182) | def CNNDMSummarizationDatasetOrg(
FILE: utils_nlp/dataset/dac.py
function load_pandas_df (line 31) | def load_pandas_df(local_cache_path=None, num_rows=None):
function load_tc_dataset (line 50) | def load_tc_dataset(
function get_label_values (line 162) | def get_label_values(label_encoder, label_ids):
FILE: utils_nlp/dataset/data_loaders.py
class DaskCSVLoader (line 10) | class DaskCSVLoader:
method __init__ (line 15) | def __init__(self, file_path, sep=",", header="infer", block_size=10e6...
method get_random_batches (line 36) | def get_random_batches(self, num_batches, batch_size):
method get_sequential_batches (line 55) | def get_sequential_batches(self, batch_size):
class DaskJSONLoader (line 69) | class DaskJSONLoader:
method __init__ (line 74) | def __init__(self, file_path, block_size=10e6, random_seed=None, lines...
method get_random_batches (line 92) | def get_random_batches(self, num_batches, batch_size):
method get_sequential_batches (line 111) | def get_sequential_batches(self, batch_size, num_batches=None):
FILE: utils_nlp/dataset/msrpc.py
function download_msrpc (line 24) | def download_msrpc(download_dir):
function load_pandas_df (line 41) | def load_pandas_df(local_cache_path=None, dataset_type="train"):
FILE: utils_nlp/dataset/multinli.py
function download_file_and_extract (line 35) | def download_file_and_extract(
function download_tsv_files_and_extract (line 54) | def download_tsv_files_and_extract(local_cache_path: str = ".") -> None:
function load_pandas_df (line 79) | def load_pandas_df(local_cache_path=".", file_split="train"):
function get_generator (line 99) | def get_generator(
function load_tc_dataset (line 134) | def load_tc_dataset(
function get_label_values (line 260) | def get_label_values(label_encoder, label_ids):
FILE: utils_nlp/dataset/ner_utils.py
function preprocess_conll (line 7) | def preprocess_conll(text, sep="\t"):
function read_conll_file (line 48) | def read_conll_file(file_path, sep="\t", encoding=None):
FILE: utils_nlp/dataset/preprocess.py
function to_lowercase_all (line 15) | def to_lowercase_all(df):
function to_lowercase (line 28) | def to_lowercase(df, column_names=[]):
function to_spacy_tokens (line 47) | def to_spacy_tokens(
function rm_spacy_stopwords (line 74) | def rm_spacy_stopwords(
function to_nltk_tokens (line 108) | def to_nltk_tokens(
function rm_nltk_stopwords (line 132) | def rm_nltk_stopwords(
function convert_to_unicode (line 162) | def convert_to_unicode(input_text, encoding="utf-8"):
FILE: utils_nlp/dataset/sentence_selection.py
function _get_ngrams (line 11) | def _get_ngrams(n, text):
function _get_word_ngrams (line 27) | def _get_word_ngrams(n, sentences):
function cal_rouge (line 40) | def cal_rouge(evaluated_ngrams, reference_ngrams):
function combination_selection (line 61) | def combination_selection(doc_sent_list, abstract_sent_list, summary_size):
function greedy_selection (line 95) | def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
FILE: utils_nlp/dataset/snli.py
function load_pandas_df (line 28) | def load_pandas_df(local_cache_path=None, file_split=Split.TRAIN, file_t...
function _maybe_download_and_extract (line 58) | def _maybe_download_and_extract(zip_path, file_split, file_type):
function download_snli (line 90) | def download_snli(dest_path):
function extract_snli (line 104) | def extract_snli(zip_path, source_path, dest_path):
function clean_cols (line 118) | def clean_cols(df):
function clean_rows (line 151) | def clean_rows(df, label_col=LABEL_COL):
function clean_df (line 168) | def clean_df(df, label_col=LABEL_COL):
function load_azureml_df (line 175) | def load_azureml_df(local_cache_path=None, file_split=Split.TRAIN, file_...
FILE: utils_nlp/dataset/squad.py
function load_pandas_df (line 26) | def load_pandas_df(local_cache_path=".", squad_version="v1.1", file_spli...
FILE: utils_nlp/dataset/stsbenchmark.py
function load_pandas_df (line 21) | def load_pandas_df(data_path, file_split=DEFAULT_FILE_SPLIT):
function _maybe_download_and_extract (line 38) | def _maybe_download_and_extract(sts_file, base_data_path):
function _download_sts (line 47) | def _download_sts(dirpath):
function _extract_sts (line 63) | def _extract_sts(tarpath, target_dirpath=".", tmode="r"):
function _load_sts (line 84) | def _load_sts(src_file_path):
function clean_sts (line 121) | def clean_sts(df):
FILE: utils_nlp/dataset/url_utils.py
function maybe_download (line 21) | def maybe_download(url, filename=None, work_directory=".", expected_byte...
function maybe_download_googledrive (line 63) | def maybe_download_googledrive(
function extract_tar (line 94) | def extract_tar(file_path, dest_path="."):
function extract_zip (line 108) | def extract_zip(file_path, dest_path="."):
function download_path (line 123) | def download_path(path):
FILE: utils_nlp/dataset/wikigold.py
function load_train_test_dfs (line 31) | def load_train_test_dfs(local_cache_path="./", test_fraction=0.5, random...
function get_unique_labels (line 82) | def get_unique_labels():
function load_dataset (line 87) | def load_dataset(
FILE: utils_nlp/dataset/xnli.py
function load_pandas_df (line 21) | def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
FILE: utils_nlp/dataset/xnli_torch_dataset.py
function _load_pandas_df (line 27) | def _load_pandas_df(cache_dir, file_split, language, data_percent_used):
function _tokenize (line 34) | def _tokenize(tok_language, to_lowercase, cache_dir, df):
function _fit_train_labels (line 47) | def _fit_train_labels(df):
class XnliDataset (line 54) | class XnliDataset(data.Dataset):
method __init__ (line 55) | def __init__(
method __len__ (line 109) | def __len__(self):
method __getitem__ (line 113) | def __getitem__(self, index):
FILE: utils_nlp/eval/SentEval/senteval/binary.py
class BinaryClassifierEval (line 21) | class BinaryClassifierEval(object):
method __init__ (line 22) | def __init__(self, pos, neg, seed=1111):
method do_prepare (line 27) | def do_prepare(self, params, prepare):
method loadFile (line 33) | def loadFile(self, fpath):
method run (line 37) | def run(self, params, batcher):
class CREval (line 63) | class CREval(BinaryClassifierEval):
method __init__ (line 64) | def __init__(self, task_path, seed=1111):
class MREval (line 71) | class MREval(BinaryClassifierEval):
method __init__ (line 72) | def __init__(self, task_path, seed=1111):
class SUBJEval (line 79) | class SUBJEval(BinaryClassifierEval):
method __init__ (line 80) | def __init__(self, task_path, seed=1111):
class MPQAEval (line 87) | class MPQAEval(BinaryClassifierEval):
method __init__ (line 88) | def __init__(self, task_path, seed=1111):
FILE: utils_nlp/eval/SentEval/senteval/engine.py
class SE (line 26) | class SE(object):
method __init__ (line 27) | def __init__(self, params, batcher, prepare=None):
method eval (line 56) | def eval(self, name):
FILE: utils_nlp/eval/SentEval/senteval/mrpc.py
class MRPCEval (line 23) | class MRPCEval(object):
method __init__ (line 24) | def __init__(self, task_path, seed=1111):
method do_prepare (line 33) | def do_prepare(self, params, prepare):
method loadFile (line 40) | def loadFile(self, fpath):
method run (line 54) | def run(self, params, batcher):
FILE: utils_nlp/eval/SentEval/senteval/probing.py
class PROBINGEval (line 23) | class PROBINGEval(object):
method __init__ (line 24) | def __init__(self, task, task_path, seed=1111):
method do_prepare (line 36) | def do_prepare(self, params, prepare):
method loadFile (line 41) | def loadFile(self, fpath):
method run (line 57) | def run(self, params, batcher):
class LengthEval (line 104) | class LengthEval(PROBINGEval):
method __init__ (line 105) | def __init__(self, task_path, seed=1111):
class WordContentEval (line 110) | class WordContentEval(PROBINGEval):
method __init__ (line 111) | def __init__(self, task_path, seed=1111):
class DepthEval (line 119) | class DepthEval(PROBINGEval):
method __init__ (line 120) | def __init__(self, task_path, seed=1111):
class TopConstituentsEval (line 125) | class TopConstituentsEval(PROBINGEval):
method __init__ (line 126) | def __init__(self, task_path, seed=1111):
class BigramShiftEval (line 131) | class BigramShiftEval(PROBINGEval):
method __init__ (line 132) | def __init__(self, task_path, seed=1111):
class TenseEval (line 143) | class TenseEval(PROBINGEval):
method __init__ (line 144) | def __init__(self, task_path, seed=1111):
class SubjNumberEval (line 149) | class SubjNumberEval(PROBINGEval):
method __init__ (line 150) | def __init__(self, task_path, seed=1111):
class ObjNumberEval (line 155) | class ObjNumberEval(PROBINGEval):
method __init__ (line 156) | def __init__(self, task_path, seed=1111):
class OddManOutEval (line 161) | class OddManOutEval(PROBINGEval):
method __init__ (line 162) | def __init__(self, task_path, seed=1111):
class CoordinationInversionEval (line 167) | class CoordinationInversionEval(PROBINGEval):
method __init__ (line 168) | def __init__(self, task_path, seed=1111):
FILE: utils_nlp/eval/SentEval/senteval/rank.py
class ImageCaptionRetrievalEval (line 26) | class ImageCaptionRetrievalEval(object):
method __init__ (line 27) | def __init__(self, task_path, seed=1111):
method do_prepare (line 35) | def do_prepare(self, params, prepare):
method loadFile (line 41) | def loadFile(self, fpath):
method run (line 68) | def run(self, params, batcher):
FILE: utils_nlp/eval/SentEval/senteval/sick.py
class SICKRelatednessEval (line 25) | class SICKRelatednessEval(object):
method __init__ (line 26) | def __init__(self, task_path, seed=1111):
method do_prepare (line 34) | def do_prepare(self, params, prepare):
method loadFile (line 42) | def loadFile(self, fpath):
method run (line 58) | def run(self, params, batcher):
method encode_labels (line 123) | def encode_labels(self, labels, nclass=5):
class SICKEntailmentEval (line 137) | class SICKEntailmentEval(SICKRelatednessEval):
method __init__ (line 138) | def __init__(self, task_path, seed=1111):
method loadFile (line 146) | def loadFile(self, fpath):
method run (line 162) | def run(self, params, batcher):
FILE: utils_nlp/eval/SentEval/senteval/snli.py
class SNLIEval (line 23) | class SNLIEval(object):
method __init__ (line 24) | def __init__(self, taskpath, seed=1111):
method do_prepare (line 62) | def do_prepare(self, params, prepare):
method loadFile (line 65) | def loadFile(self, fpath):
method run (line 70) | def run(self, params, batcher):
FILE: utils_nlp/eval/SentEval/senteval/sst.py
class SSTEval (line 22) | class SSTEval(object):
method __init__ (line 23) | def __init__(self, task_path, nclasses=2, seed=1111):
method do_prepare (line 37) | def do_prepare(self, params, prepare):
method loadFile (line 42) | def loadFile(self, fpath):
method run (line 57) | def run(self, params, batcher):
FILE: utils_nlp/eval/SentEval/senteval/sts.py
class STSEval (line 26) | class STSEval(object):
method loadFile (line 27) | def loadFile(self, fpath):
method do_prepare (line 52) | def do_prepare(self, params, prepare):
method run (line 59) | def run(self, params, batcher):
class STS12Eval (line 107) | class STS12Eval(STSEval):
method __init__ (line 108) | def __init__(self, taskpath, seed=1111):
class STS13Eval (line 116) | class STS13Eval(STSEval):
method __init__ (line 118) | def __init__(self, taskpath, seed=1111):
class STS14Eval (line 125) | class STS14Eval(STSEval):
method __init__ (line 126) | def __init__(self, taskpath, seed=1111):
class STS15Eval (line 134) | class STS15Eval(STSEval):
method __init__ (line 135) | def __init__(self, taskpath, seed=1111):
class STS16Eval (line 143) | class STS16Eval(STSEval):
method __init__ (line 144) | def __init__(self, taskpath, seed=1111):
class STSBenchmarkEval (line 152) | class STSBenchmarkEval(SICKRelatednessEval):
method __init__ (line 153) | def __init__(self, task_path, seed=1111):
method loadFile (line 161) | def loadFile(self, fpath):
FILE: utils_nlp/eval/SentEval/senteval/tools/classifier.py
class PyTorchClassifier (line 24) | class PyTorchClassifier(object):
method __init__ (line 25) | def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1...
method prepare_split (line 38) | def prepare_split(self, X, y, validation_data=None, validation_split=N...
method fit (line 60) | def fit(self, X, y, validation_data=None, validation_split=None,
method trainepoch (line 85) | def trainepoch(self, X, y, epoch_size=1):
method score (line 111) | def score(self, devX, devy):
method predict (line 130) | def predict(self, devX):
method predict_proba (line 144) | def predict_proba(self, devX):
class MLP (line 162) | class MLP(PyTorchClassifier):
method __init__ (line 163) | def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
FILE: utils_nlp/eval/SentEval/senteval/tools/ranking.py
class COCOProjNet (line 23) | class COCOProjNet(nn.Module):
method __init__ (line 24) | def __init__(self, config):
method forward (line 36) | def forward(self, img, sent, imgc, sentc):
method proj_sentence (line 66) | def proj_sentence(self, sent):
method proj_image (line 71) | def proj_image(self, img):
class PairwiseRankingLoss (line 77) | class PairwiseRankingLoss(nn.Module):
method __init__ (line 81) | def __init__(self, margin):
method forward (line 85) | def forward(self, anchor1, anchor2, img_sentc, sent_imgc):
class ImageSentenceRankingPytorch (line 95) | class ImageSentenceRankingPytorch(object):
method __init__ (line 97) | def __init__(self, train, valid, test, config):
method prepare_data (line 126) | def prepare_data(self, trainTxt, trainImg, devTxt, devImg,
method run (line 137) | def run(self):
method trainepoch (line 226) | def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1):
method t2i (line 274) | def t2i(self, images, captions):
method i2t (line 314) | def i2t(self, images, captions):
FILE: utils_nlp/eval/SentEval/senteval/tools/relatedness.py
class RelatednessPytorch (line 23) | class RelatednessPytorch(object):
method __init__ (line 25) | def __init__(self, train, valid, test, devscores, config):
method prepare_data (line 59) | def prepare_data(self, trainX, trainy, devX, devy, testX, testy):
method run (line 70) | def run(self):
method trainepoch (line 103) | def trainepoch(self, X, y, nepoches=1):
method predict_proba (line 124) | def predict_proba(self, devX):
FILE: utils_nlp/eval/SentEval/senteval/tools/validation.py
function get_classif_name (line 28) | def get_classif_name(classifier_config, usepytorch):
class InnerKFoldClassifier (line 39) | class InnerKFoldClassifier(object):
method __init__ (line 43) | def __init__(self, X, y, config):
method run (line 57) | def run(self):
class KFoldClassifier (line 110) | class KFoldClassifier(object):
method __init__ (line 114) | def __init__(self, train, test, config):
method run (line 126) | def run(self):
class SplitClassifier (line 184) | class SplitClassifier(object):
method __init__ (line 188) | def __init__(self, X, y, config):
method run (line 202) | def run(self):
FILE: utils_nlp/eval/SentEval/senteval/trec.py
class TRECEval (line 22) | class TRECEval(object):
method __init__ (line 23) | def __init__(self, task_path, seed=1111):
method do_prepare (line 29) | def do_prepare(self, params, prepare):
method loadFile (line 33) | def loadFile(self, fpath):
method run (line 46) | def run(self, params, batcher):
FILE: utils_nlp/eval/SentEval/senteval/utils.py
function create_dictionary (line 16) | def create_dictionary(sentences):
function cosine (line 38) | def cosine(u, v):
class dotdict (line 42) | class dotdict(dict):
function get_optimizer (line 49) | def get_optimizer(s):
FILE: utils_nlp/eval/classification.py
function eval_classification (line 21) | def eval_classification(actual, predicted, round_decimals=4):
function compute_correlation_coefficients (line 38) | def compute_correlation_coefficients(x, y=None):
function plot_confusion_matrix (line 58) | def plot_confusion_matrix(
FILE: utils_nlp/eval/evaluate_squad.py
function normalize_answer (line 15) | def normalize_answer(s):
function f1_score (line 34) | def f1_score(prediction, ground_truth):
function exact_match_score (line 47) | def exact_match_score(prediction, ground_truth):
function metric_max_over_ground_truths (line 51) | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
function evaluate (line 59) | def evaluate(dataset, predictions):
FILE: utils_nlp/eval/evaluate_summarization.py
function get_rouge (line 10) | def get_rouge(predictions, targets, temp_dir, random_seed=42):
FILE: utils_nlp/eval/question_answering.py
function get_raw_scores (line 11) | def get_raw_scores(qa_ids, actuals, preds):
function find_best_thresh (line 100) | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans, unanswerab...
function find_all_best_thresh (line 176) | def find_all_best_thresh(
function evaluate_qa (line 210) | def evaluate_qa(
FILE: utils_nlp/eval/rouge/compute_rouge.py
function compute_rouge_perl (line 14) | def compute_rouge_perl(cand, ref, is_input_files=False, verbose=False):
function compute_rouge_python (line 81) | def compute_rouge_python(cand, ref, is_input_files=False, language="en"):
FILE: utils_nlp/eval/rouge/rouge_ext.py
class RougeExt (line 30) | class RougeExt(Rouge):
method __init__ (line 58) | def __init__(
method tokenize_text (line 190) | def tokenize_text(self, text):
method split_into_sentences (line 203) | def split_into_sentences(self, text):
method stem_tokens (line 217) | def stem_tokens(self, tokens):
method _split_into_words (line 232) | def _split_into_words(self, sentences):
method _get_word_ngrams_and_length (line 248) | def _get_word_ngrams_and_length(self, n, sentences):
method _get_unigrams (line 266) | def _get_unigrams(self, sentences):
method _compute_ngrams (line 284) | def _compute_ngrams(self, evaluated_sentences, reference_sentences, n):
method _compute_ngrams_lcs (line 322) | def _compute_ngrams_lcs(self, evaluated_sentences, reference_sentences...
method _preprocess_summary_as_a_whole (line 461) | def _preprocess_summary_as_a_whole(self, summary):
method _preprocess_summary_per_sentence (line 532) | def _preprocess_summary_per_sentence(self, summary):
FILE: utils_nlp/eval/senteval.py
class SentEvalConfig (line 7) | class SentEvalConfig:
method __init__ (line 16) | def __init__(self, model_params, senteval_params):
method model_params (line 27) | def model_params(self):
method model_params (line 31) | def model_params(self, model_params):
method append_senteval_params (line 34) | def append_senteval_params(self, params):
FILE: utils_nlp/interpreter/Interpreter.py
function calculate_regularization (line 14) | def calculate_regularization(sampled_x, Phi, reduced_axes=None, device=N...
class Interpreter (line 47) | class Interpreter(nn.Module):
method __init__ (line 64) | def __init__(self, x, Phi, scale=0.5, rate=0.1, regularization=None, w...
method forward (line 113) | def forward(self):
method optimize (line 136) | def optimize(self, iteration=5000, lr=0.01, show_progress=False):
method get_sigma (line 161) | def get_sigma(self):
method visualize (line 171) | def visualize(self):
FILE: utils_nlp/language_utils/hi/hindi_stemmer.py
function hi_stem (line 87) | def hi_stem(word):
FILE: utils_nlp/models/bert/common.py
class Language (line 32) | class Language(str, Enum):
class Tokenizer (line 45) | class Tokenizer:
method __init__ (line 46) | def __init__(self, language=Language.ENGLISH, to_lower=False, cache_di...
method tokenize (line 60) | def tokenize(self, text):
method _truncate_seq_pair (line 76) | def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
method preprocess_classification_tokens (line 103) | def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN):
method preprocess_encoder_tokens (line 159) | def preprocess_encoder_tokens(self, tokens, max_len=BERT_MAX_LEN):
method tokenize_ner (line 216) | def tokenize_ner(
function create_data_loader (line 369) | def create_data_loader(
class TextDataset (line 418) | class TextDataset(Dataset):
method __init__ (line 425) | def __init__(self, filename):
method __len__ (line 436) | def __len__(self):
method _cast (line 441) | def _cast(row):
method __getitem__ (line 444) | def __getitem__(self, index):
function get_dataset_multiple_files (line 470) | def get_dataset_multiple_files(files):
FILE: utils_nlp/models/bert/sequence_classification.py
class BERTSequenceClassifier (line 25) | class BERTSequenceClassifier:
method __init__ (line 28) | def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir=...
method cuda (line 53) | def cuda(self):
method fit (line 59) | def fit(
method predict (line 200) | def predict(
FILE: utils_nlp/models/bert/sequence_classification_distributed.py
class BERTSequenceClassifier (line 30) | class BERTSequenceClassifier:
method __init__ (line 33) | def __init__(
method create_optimizer (line 89) | def create_optimizer(
method create_data_loader (line 139) | def create_data_loader(self, dataset, batch_size=32, mode="train", **k...
method save_model (line 168) | def save_model(self):
method fit (line 191) | def fit(
method predict (line 289) | def predict(self, test_loader, num_gpus=None, probabilities=False):
FILE: utils_nlp/models/bert/sequence_encoding.py
class PoolingStrategy (line 25) | class PoolingStrategy(str, Enum):
class BERTSentenceEncoder (line 33) | class BERTSentenceEncoder:
method __init__ (line 36) | def __init__(
method layer_index (line 84) | def layer_index(self):
method layer_index (line 88) | def layer_index(self, layer_index):
method cuda (line 95) | def cuda(self):
method pooling_strategy (line 102) | def pooling_strategy(self):
method pooling_strategy (line 106) | def pooling_strategy(self, pooling_strategy):
method get_hidden_states (line 109) | def get_hidden_states(self, text, batch_size=32):
method pool (line 184) | def pool(self, df):
method encode (line 241) | def encode(self, text, batch_size=32, as_numpy=False):
FILE: utils_nlp/models/bert/token_classification.py
class BERTTokenClassifier (line 24) | class BERTTokenClassifier:
method __init__ (line 27) | def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir=...
method cuda (line 62) | def cuda(self):
method _get_optimizer (line 68) | def _get_optimizer(self, learning_rate, num_train_optimization_steps, ...
method fit (line 101) | def fit(
method predict (line 192) | def predict(
function create_label_map (line 279) | def create_label_map(label_list, trailing_piece_tag="X"):
function postprocess_token_labels (line 288) | def postprocess_token_labels(
FILE: utils_nlp/models/gensen/create_gensen_model.py
function create_multiseq2seq_model (line 13) | def create_multiseq2seq_model(
FILE: utils_nlp/models/gensen/gensen.py
class Encoder (line 18) | class Encoder(nn.Module):
method __init__ (line 24) | def __init__(
method set_pretrained_embeddings (line 51) | def set_pretrained_embeddings(self, embedding_matrix):
method forward (line 89) | def forward(self, input, lengths, return_all=False, pool="last"):
class GenSen (line 126) | class GenSen(nn.Module):
method __init__ (line 132) | def __init__(self, *args, **kwargs):
method vocab_expansion (line 136) | def vocab_expansion(self, task_vocab):
method get_representation (line 146) | def get_representation(
class GenSenSingle (line 190) | class GenSenSingle(nn.Module):
method __init__ (line 196) | def __init__(
method _load_params (line 223) | def _load_params(self):
method first_expansion (line 278) | def first_expansion(self):
method vocab_expansion (line 312) | def vocab_expansion(self, task_vocab):
method get_minibatch (line 371) | def get_minibatch(self, sentences, tokenize=False, add_start_end=True):
method get_representation (line 425) | def get_representation(
FILE: utils_nlp/models/gensen/multi_task_model.py
class MultitaskModel (line 13) | class MultitaskModel(nn.Module):
method __init__ (line 21) | def __init__(
method init_weights (line 103) | def init_weights(self):
method set_pretrained_embeddings (line 112) | def set_pretrained_embeddings(self, embedding_matrix):
method forward (line 134) | def forward(
method decode (line 265) | def decode(self, logits):
method get_hidden (line 274) | def get_hidden(self, input_src, src_lengths, strategy="last"):
FILE: utils_nlp/models/gensen/preprocess_utils.py
function _preprocess (line 9) | def _preprocess(split_map, data_path, column_names):
function _split_and_cleanup (line 66) | def _split_and_cleanup(split_map, data_path):
function gensen_preprocess (line 107) | def gensen_preprocess(train_tok, dev_tok, test_tok, data_path):
FILE: utils_nlp/models/gensen/utils.py
class DataIterator (line 21) | class DataIterator(object):
method _trim_vocab (line 25) | def _trim_vocab(vocab, vocab_size):
method construct_vocab (line 66) | def construct_vocab(
class BufferedDataIterator (line 97) | class BufferedDataIterator(DataIterator):
method __init__ (line 100) | def __init__(
method _reset_filepointer (line 160) | def _reset_filepointer(self, idx):
method fetch_buffer (line 170) | def fetch_buffer(self, idx, reset=True):
method build_vocab (line 214) | def build_vocab(self):
method shuffle_dataset (line 268) | def shuffle_dataset(self, idx):
method get_parallel_minibatch (line 276) | def get_parallel_minibatch(
class NLIIterator (line 376) | class NLIIterator(DataIterator):
method __init__ (line 379) | def __init__(
method shuffle_dataset (line 434) | def shuffle_dataset(self):
method get_parallel_minibatch (line 438) | def get_parallel_minibatch(self, index, batch_size, sent_type="train"):
function get_validation_minibatch (line 541) | def get_validation_minibatch(
function compute_validation_loss (line 622) | def compute_validation_loss(
FILE: utils_nlp/models/glove/src/cooccur.c
type real (line 37) | typedef double real;
type CREC (line 39) | typedef struct cooccur_rec {
type CRECID (line 45) | typedef struct cooccur_rec_id {
type HASHREC (line 52) | typedef struct hashrec {
function scmp (line 68) | int scmp( char *s1, char *s2 ) {
function bitwisehash (line 76) | unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
function HASHREC (line 85) | HASHREC ** inithashtable() {
function HASHREC (line 94) | HASHREC *hashsearch(HASHREC **ht, char *w) {
function hashinsert (line 107) | void hashinsert(HASHREC **ht, char *w, long long id) {
function get_word (line 134) | int get_word(char *word, FILE *fin) {
function write_chunk (line 167) | int write_chunk(CREC *cr, long long length, FILE *fout) {
function compare_crec (line 186) | int compare_crec(const void *a, const void *b) {
function compare_crecid (line 194) | int compare_crecid(CRECID a, CRECID b) {
function swap_entry (line 201) | void swap_entry(CRECID *pq, int i, int j) {
function insert (line 208) | void insert(CRECID *pq, CRECID new, int size) {
function delete (line 218) | void delete(CRECID *pq, int size) {
function merge_write (line 240) | int merge_write(CRECID new, CRECID *old, FILE *fout) {
function merge_files (line 251) | int merge_files(int num) {
function get_cooccurrence (line 308) | int get_cooccurrence() {
function find_arg (line 451) | int find_arg(char *str, int argc, char **argv) {
function main (line 465) | int main(int argc, char **argv) {
FILE: utils_nlp/models/glove/src/glove.c
type real (line 36) | typedef double real;
type CREC (line 38) | typedef struct cooccur_rec {
function scmp (line 61) | int scmp( char *s1, char *s2 ) {
function initialize_parameters (line 66) | void initialize_parameters() {
function real (line 94) | inline real check_nan(real update) {
function save_params (line 178) | int save_params(int nb_iter) {
function train_glove (line 293) | int train_glove() {
function find_arg (line 353) | int find_arg(char *str, int argc, char **argv) {
function main (line 367) | int main(int argc, char **argv) {
FILE: utils_nlp/models/glove/src/shuffle.c
type real (line 31) | typedef double real;
type CREC (line 33) | typedef struct cooccur_rec {
function scmp (line 45) | int scmp( char *s1, char *s2 ) {
function rand_long (line 52) | static long rand_long(long n) {
function write_chunk (line 62) | int write_chunk(CREC *array, long size, FILE *fout) {
function shuffle (line 69) | void shuffle(CREC *array, long n) {
function shuffle_merge (line 81) | int shuffle_merge(int num) {
function shuffle_by_chunks (line 129) | int shuffle_by_chunks() {
function find_arg (line 177) | int find_arg(char *str, int argc, char **argv) {
function main (line 191) | int main(int argc, char **argv) {
FILE: utils_nlp/models/glove/src/vocab_count.c
type VOCAB (line 37) | typedef struct vocabulary {
type HASHREC (line 42) | typedef struct hashrec {
function scmp (line 54) | int scmp( char *s1, char *s2 ) {
function CompareVocabTie (line 61) | int CompareVocabTie(const void *a, const void *b) {
function CompareVocab (line 69) | int CompareVocab(const void *a, const void *b) {
function bitwisehash (line 78) | unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
function HASHREC (line 87) | HASHREC ** inithashtable() {
function hashinsert (line 96) | void hashinsert(HASHREC **ht, char *w) {
function get_word (line 135) | int get_word(char *word, FILE *fin) {
function get_counts (line 167) | int get_counts() {
function find_arg (line 226) | int find_arg(char *str, int argc, char **argv) {
function main (line 240) | int main(int argc, char **argv) {
FILE: utils_nlp/models/pretrained_embeddings/fasttext.py
function _extract_fasttext_vectors (line 15) | def _extract_fasttext_vectors(zip_path, dest_path="."):
function _download_fasttext_vectors (line 37) | def _download_fasttext_vectors(download_dir, file_name="wiki.simple.zip"):
function _maybe_download_and_extract (line 59) | def _maybe_download_and_extract(dest_path, file_name):
function load_pretrained_vectors (line 84) | def load_pretrained_vectors(dest_path, file_name="wiki.simple.bin"):
FILE: utils_nlp/models/pretrained_embeddings/glove.py
function _extract_glove_vectors (line 17) | def _extract_glove_vectors(zip_path, dest_path="."):
function _download_glove_vectors (line 39) | def _download_glove_vectors(download_dir, file_name="glove.840B.300d.zip"):
function _maybe_download_and_extract (line 57) | def _maybe_download_and_extract(dest_path, file_name):
function download_and_extract (line 82) | def download_and_extract(dir_path, file_name="glove.840B.300d.txt"):
function load_pretrained_vectors (line 96) | def load_pretrained_vectors(
FILE: utils_nlp/models/pretrained_embeddings/word2vec.py
function _extract_word2vec_vectors (line 15) | def _extract_word2vec_vectors(zip_path, dest_filepath):
function _download_word2vec_vectors (line 34) | def _download_word2vec_vectors(
function _maybe_download_and_extract (line 54) | def _maybe_download_and_extract(dest_path, file_name):
function load_pretrained_vectors (line 79) | def load_pretrained_vectors(
FILE: utils_nlp/models/pytorch_modules/conditional_gru.py
class ConditionalGRU (line 11) | class ConditionalGRU(nn.Module):
method __init__ (line 14) | def __init__(self, input_dim, hidden_dim, dropout=0.0):
method reset_parameters (line 33) | def reset_parameters(self):
method forward (line 39) | def forward(self, input, hidden, ctx):
FILE: utils_nlp/models/transformers/abstractive_summarization_bertsum.py
function fit_to_block_size (line 37) | def fit_to_block_size(sequence, block_size, pad_token_id):
function build_mask (line 56) | def build_mask(sequence, pad_token_id):
function compute_token_type_ids (line 74) | def compute_token_type_ids(batch, separator_token_id):
class BertSumAbsProcessor (line 103) | class BertSumAbsProcessor:
method __init__ (line 107) | def __init__(
method list_supported_models (line 156) | def list_supported_models():
method model_name (line 160) | def model_name(self):
method model_name (line 164) | def model_name(self, value):
method get_inputs (line 175) | def get_inputs(batch, device, model_name, train_mode=True):
method collate (line 215) | def collate(self, data, block_size, device, train_mode=True):
method preprocess (line 301) | def preprocess(self, story_lines, summary_lines=None):
function validate (line 349) | def validate(summarizer, validate_dataset):
class BertSumAbs (line 380) | class BertSumAbs(Transformer):
method __init__ (line 384) | def __init__(
method list_supported_models (line 444) | def list_supported_models():
method fit (line 447) | def fit(
method predict (line 653) | def predict(
method save_model (line 799) | def save_model(self, global_step=None, full_name=None):
FILE: utils_nlp/models/transformers/abstractive_summarization_seq2seq.py
function _get_model_type (line 77) | def _get_model_type(model_name):
function detokenize (line 88) | def detokenize(tk_list):
class S2SAbsSumDataset (line 98) | class S2SAbsSumDataset(Dataset):
method __init__ (line 104) | def __init__(self, features):
method __getitem__ (line 107) | def __getitem__(self, idx):
method __len__ (line 110) | def __len__(self):
class S2SAbsSumProcessor (line 114) | class S2SAbsSumProcessor:
method __init__ (line 130) | def __init__(
method list_supported_models (line 142) | def list_supported_models():
method get_inputs (line 146) | def get_inputs(cls, batch, device, model_name):
method create_s2s_dataset (line 162) | def create_s2s_dataset(
method s2s_dataset_from_iterable_sum_ds (line 245) | def s2s_dataset_from_iterable_sum_ds(
method s2s_dataset_from_sum_ds (line 290) | def s2s_dataset_from_sum_ds(
method s2s_dataset_from_json_or_file (line 331) | def s2s_dataset_from_json_or_file(
class S2SConfig (line 379) | class S2SConfig:
method __init__ (line 412) | def __init__(
method save_to_json (line 439) | def save_to_json(self, json_file):
method load_from_json (line 444) | def load_from_json(cls, json_file):
class S2SAbstractiveSummarizer (line 451) | class S2SAbstractiveSummarizer(Transformer):
method __init__ (line 452) | def __init__(
method list_supported_models (line 580) | def list_supported_models():
method fit (line 583) | def fit(
method predict (line 778) | def predict(
method save_model (line 1055) | def save_model(self, output_dir, global_step, fp16):
function load_and_cache_examples (line 1074) | def load_and_cache_examples(
FILE: utils_nlp/models/transformers/bertsum/adam.py
class Adam (line 11) | class Adam(Optimizer):
method __init__ (line 35) | def __init__(
method __setstate__ (line 57) | def __setstate__(self, state):
method step (line 62) | def step(self, closure=None):
FILE: utils_nlp/models/transformers/bertsum/beam.py
class Beam (line 11) | class Beam(object):
method __init__ (line 25) | def __init__(
method get_current_state (line 77) | def get_current_state(self):
method get_current_origin (line 81) | def get_current_origin(self):
method advance (line 85) | def advance(self, word_probs, attn_out):
method done (line 160) | def done(self):
method sort_finished (line 163) | def sort_finished(self, minimum=None):
method get_hyp (line 178) | def get_hyp(self, timestep, k):
class GNMTGlobalScorer (line 190) | class GNMTGlobalScorer(object):
method __init__ (line 200) | def __init__(self, alpha, length_penalty):
method score (line 207) | def score(self, beam, logprobs):
FILE: utils_nlp/models/transformers/bertsum/data_loader.py
class IterableDistributedSampler (line 10) | class IterableDistributedSampler(object):
method __init__ (line 21) | def __init__(self, world_size=1, rank=0, local_rank=-1):
method iter (line 26) | def iter(self, iterable):
class ChunkDataLoader (line 35) | class ChunkDataLoader(object):
method __init__ (line 47) | def __init__(self, datasets, batch_size, shuffle, is_labeled, sampler):
method eachiter (line 56) | def eachiter(self):
method __iter__ (line 63) | def __iter__(self):
method _next_dataset_iterator (line 66) | def _next_dataset_iterator(self, dataset_iter):
class Batch (line 87) | class Batch(object):
method _pad (line 88) | def _pad(self, data, pad_id, width=-1):
method __init__ (line 94) | def __init__(self, data=None, is_labeled=False):
method to (line 131) | def to(self, device):
method __len__ (line 149) | def __len__(self):
function create_batch_with_size (line 153) | def create_batch_with_size(data, batch_size):
function simple_batch_size_fn (line 169) | def simple_batch_size_fn(new, count):
class DataIterator (line 182) | class DataIterator(object):
method __init__ (line 183) | def __init__(self, dataset, batch_size, is_labeled=False, shuffle=True...
method data (line 195) | def data(self):
method preprocess (line 201) | def preprocess(self, ex, is_labeled):
method batch_buffer (line 220) | def batch_buffer(self, data, batch_size):
method create_batches (line 239) | def create_batches(self):
method __iter__ (line 256) | def __iter__(self):
FILE: utils_nlp/models/transformers/bertsum/dataset.py
function get_dataset (line 9) | def get_dataset(file):
class ExtSumProcessedIterableDataset (line 13) | class ExtSumProcessedIterableDataset(IterableDataset):
method __init__ (line 17) | def __init__(self, file_list, is_shuffle=False):
method get_stream (line 30) | def get_stream(self):
method __iter__ (line 42) | def __iter__(self):
class ExtSumProcessedDataset (line 46) | class ExtSumProcessedDataset(Dataset):
method __init__ (line 50) | def __init__(self, file_list, is_shuffle=False):
method __len__ (line 66) | def __len__(self):
method __getitem__ (line 69) | def __getitem__(self, idx):
FILE: utils_nlp/models/transformers/bertsum/decoder.py
class TransformerDecoderLayer (line 18) | class TransformerDecoderLayer(nn.Module):
method __init__ (line 30) | def __init__(self, d_model, heads, d_ff, dropout):
method forward (line 45) | def forward(
method _get_attn_subsequent_mask (line 105) | def _get_attn_subsequent_mask(self, size):
class TransformerDecoder (line 123) | class TransformerDecoder(nn.Module):
method __init__ (line 153) | def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddin...
method forward (line 172) | def forward(
method init_decoder_state (line 252) | def init_decoder_state(self, src, memory_bank, with_cache=False):
class TransformerDecoderState (line 260) | class TransformerDecoderState(DecoderState):
method __init__ (line 263) | def __init__(self, src):
method _all (line 275) | def _all(self):
method detach (line 284) | def detach(self):
method update_state (line 291) | def update_state(self, new_input, previous_layer_inputs):
method _init_cache (line 297) | def _init_cache(self, memory_bank, num_layers):
method repeat_beam_size_times (line 306) | def repeat_beam_size_times(self, beam_size):
method map_batch_fn (line 310) | def map_batch_fn(self, fn):
FILE: utils_nlp/models/transformers/bertsum/encoder.py
class Classifier (line 13) | class Classifier(nn.Module):
method __init__ (line 14) | def __init__(self, hidden_size):
method forward (line 19) | def forward(self, x, mask_cls):
class PositionalEncoding (line 25) | class PositionalEncoding(nn.Module):
method __init__ (line 26) | def __init__(self, dropout, dim, max_len=5000):
method forward (line 40) | def forward(self, emb, step=None):
method get_emb (line 50) | def get_emb(self, emb):
class TransformerEncoderLayer (line 54) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 55) | def __init__(self, d_model, heads, d_ff, dropout):
method forward (line 63) | def forward(self, iter, query, inputs, mask):
class ExtTransformerEncoder (line 75) | class ExtTransformerEncoder(nn.Module):
method __init__ (line 76) | def __init__(self, d_model, d_ff, heads, dropout, num_inter_layers=0):
method forward (line 92) | def forward(self, top_vecs, mask):
class RNNEncoder (line 111) | class RNNEncoder(nn.Module):
method __init__ (line 113) | def __init__(self, bidirectional, num_layers, input_size,
method forward (line 130) | def forward(self, x, mask):
FILE: utils_nlp/models/transformers/bertsum/loss.py
function abs_loss (line 21) | def abs_loss(generator, symbols, vocab_size, train=True, label_smoothing...
class LossComputeBase (line 32) | class LossComputeBase(nn.Module):
method __init__ (line 52) | def __init__(self, generator, pad_id):
method _make_shard_state (line 57) | def _make_shard_state(self, batch, output, attns=None):
method _compute_loss (line 71) | def _compute_loss(self, batch, output, target, **kwargs):
method monolithic_compute_loss (line 84) | def monolithic_compute_loss(self, output, target, number_tokens):
method sharded_compute_loss (line 104) | def sharded_compute_loss(self, batch, output, shard_size, normalization):
method _stats (line 142) | def _stats(self, loss, scores, target):
method _bottle (line 159) | def _bottle(self, _v):
method _unbottle (line 162) | def _unbottle(self, _v, batch_size):
class LabelSmoothingLoss (line 166) | class LabelSmoothingLoss(nn.Module):
method __init__ (line 173) | def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100):
method forward (line 184) | def forward(self, output, target):
class NMTLossCompute (line 196) | class NMTLossCompute(LossComputeBase):
method __init__ (line 201) | def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0):
method _make_shard_state (line 211) | def _make_shard_state(self, target, tgt_num_tokens, output):
method _compute_loss (line 218) | def _compute_loss(self, output, target, **kwargs):
function filter_shard_state (line 230) | def filter_shard_state(state, shard_size=None):
function shards (line 246) | def shards(state, shard_size, eval_only=False):
FILE: utils_nlp/models/transformers/bertsum/model_builder.py
function load_optimizer_checkpoint (line 23) | def load_optimizer_checkpoint(optimizer, checkpoint):
function build_optim (line 34) | def build_optim(
function build_optim_bert (line 60) | def build_optim_bert(
function build_optim_dec (line 90) | def build_optim_dec(
function get_generator (line 119) | def get_generator(vocab_size, dec_hidden_size):
class Transformer (line 126) | class Transformer(nn.Module):
method __init__ (line 127) | def __init__(self, temp_dir, model_class, pretrained_model_name, pretr...
method forward (line 136) | def forward(self, x, segs, mask):
class BertSumExt (line 147) | class BertSumExt(nn.Module):
method __init__ (line 148) | def __init__(self, encoder, args, model_class, pretrained_model_name, ...
method load_cp (line 184) | def load_cp(self, pt):
method forward (line 187) | def forward(self, x, segs, clss, mask, mask_cls, labels=None, sentence...
class Bert (line 205) | class Bert(nn.Module):
method __init__ (line 206) | def __init__(self, large, temp_dir, finetune=False):
method forward (line 219) | def forward(self, x, segs, mask):
class AbsSummarizer (line 230) | class AbsSummarizer(nn.Module):
method __init__ (line 231) | def __init__(
method load_checkpoint (line 353) | def load_checkpoint(self, checkpoint):
method forward (line 372) | def forward(
FILE: utils_nlp/models/transformers/bertsum/neural.py
function aeq (line 11) | def aeq(*args):
function sequence_mask (line 22) | def sequence_mask(lengths, max_len=None):
function gelu (line 36) | def gelu(x):
class GlobalAttention (line 50) | class GlobalAttention(nn.Module):
method __init__ (line 105) | def __init__(self, dim, attn_type="dot"):
method score (line 126) | def score(self, h_t, h_s):
method forward (line 166) | def forward(self, source, memory_bank, memory_lengths=None, memory_mas...
class PositionwiseFeedForward (line 228) | class PositionwiseFeedForward(nn.Module):
method __init__ (line 238) | def __init__(self, d_model, d_ff, dropout=0.1):
method forward (line 247) | def forward(self, x):
class MultiHeadedAttention (line 253) | class MultiHeadedAttention(nn.Module):
method __init__ (line 295) | def __init__(self, head_count, model_dim, dropout=0.1, use_final_linea...
method forward (line 312) | def forward(
class DecoderState (line 468) | class DecoderState(object):
method detach (line 477) | def detach(self):
method beam_update (line 482) | def beam_update(self, idx, positions, beam_size):
method map_batch_fn (line 498) | def map_batch_fn(self, fn):
FILE: utils_nlp/models/transformers/bertsum/optimizers.py
function use_gpu (line 16) | def use_gpu(opt):
function build_optim (line 25) | def build_optim(model, opt, checkpoint):
class MultipleOptimizer (line 72) | class MultipleOptimizer(object):
method __init__ (line 75) | def __init__(self, op):
method zero_grad (line 79) | def zero_grad(self):
method step (line 84) | def step(self):
method state (line 90) | def state(self):
method state_dict (line 94) | def state_dict(self):
method load_state_dict (line 98) | def load_state_dict(self, state_dicts):
class Optimizer (line 105) | class Optimizer(object):
method __init__ (line 136) | def __init__(
method set_parameters (line 167) | def set_parameters(self, params):
method _set_rate (line 198) | def _set_rate(self, learning_rate):
method step (line 206) | def step(self):
method add_param_group (line 237) | def add_param_group(self, param_group):
method load_state_dict (line 287) | def load_state_dict(self, state_dict):
method state_dict (line 290) | def state_dict(self):
method zero_grad (line 294) | def zero_grad(self):
FILE: utils_nlp/models/transformers/bertsum/penalties.py
class PenaltyBuilder (line 10) | class PenaltyBuilder(object):
method __init__ (line 19) | def __init__(self, length_pen):
method length_penalty (line 22) | def length_penalty(self):
method length_wu (line 34) | def length_wu(self, beam, logprobs, alpha=0.0):
method length_average (line 43) | def length_average(self, beam, logprobs, alpha=0.0):
method length_none (line 49) | def length_none(self, beam, logprobs, alpha=0.0, beta=0.0):
FILE: utils_nlp/models/transformers/bertsum/predictor.py
function build_predictor (line 19) | def build_predictor(
function tile (line 44) | def tile(x, count, dim=0):
class Translator (line 68) | class Translator(nn.Module):
method __init__ (line 87) | def __init__(
method forward (line 141) | def forward(self, src, segs, mask_src):
method _fast_translate_batch (line 161) | def _fast_translate_batch(self, src, segs, mask_src, max_length, min_l...
FILE: utils_nlp/models/transformers/common.py
class Transformer (line 30) | class Transformer:
method __init__ (line 31) | def __init__(self, model_name, model, cache_dir):
method model_name (line 38) | def model_name(self):
method model_type (line 42) | def model_type(self):
method set_seed (line 46) | def set_seed(seed, cuda=True):
method get_default_optimizer (line 54) | def get_default_optimizer(model, weight_decay, learning_rate, adam_eps...
method get_default_scheduler (line 80) | def get_default_scheduler(optimizer, warmup_steps, num_training_steps):
method prepare_model_and_optimizer (line 88) | def prepare_model_and_optimizer(
method fine_tune (line 151) | def fine_tune(
method predict (line 294) | def predict(self, eval_dataloader, get_inputs, num_gpus, gpu_ids, verb...
method save_model (line 319) | def save_model(self, file_name=None):
method load_model (line 350) | def load_model(self, file_name):
FILE: utils_nlp/models/transformers/datasets.py
class SCDataSet (line 14) | class SCDataSet(Dataset):
method __init__ (line 17) | def __init__(self, df, text_col, label_col, transform, **transform_args):
method __getitem__ (line 39) | def __getitem__(self, idx):
method __len__ (line 61) | def __len__(self):
class SPCDataSet (line 65) | class SPCDataSet(Dataset):
method __init__ (line 68) | def __init__(
method __getitem__ (line 99) | def __getitem__(self, idx):
method __len__ (line 125) | def __len__(self):
class QADataset (line 154) | class QADataset(Dataset):
method __init__ (line 155) | def __init__(
method __getitem__ (line 210) | def __getitem__(self, idx):
method __len__ (line 233) | def __len__(self):
function _line_iter (line 237) | def _line_iter(file_path):
function _preprocess (line 243) | def _preprocess(sentences, preprocess_pipeline, word_tokenize=None):
function _create_data_from_iterator (line 266) | def _create_data_from_iterator(iterator, preprocessing, word_tokenize):
class IterableSummarizationDataset (line 275) | class IterableSummarizationDataset(IterableDataset):
method __init__ (line 276) | def __init__(
method __iter__ (line 329) | def __iter__(self):
method get_source (line 333) | def get_source(self):
method get_target (line 336) | def get_target(self):
class SummarizationDataset (line 340) | class SummarizationDataset(Dataset):
method __init__ (line 341) | def __init__(
method shorten (line 439) | def shorten(self, top_n=None):
method __getitem__ (line 453) | def __getitem__(self, idx):
method __len__ (line 465) | def __len__(self):
method get_source (line 468) | def get_source(self):
method get_source_txt (line 471) | def get_source_txt(self):
method get_target_txt (line 474) | def get_target_txt(self):
method get_target (line 477) | def get_target(self):
method save_to_jsonl (line 480) | def save_to_jsonl(self, output_file):
function parallel_preprocess (line 490) | def parallel_preprocess(
FILE: utils_nlp/models/transformers/extractive_summarization.py
class Bunch (line 51) | class Bunch(object):
method __init__ (line 54) | def __init__(self, adict):
function get_dataloader (line 58) | def get_dataloader(
function get_pred (line 88) | def get_pred(
class ExtSumProcessedData (line 168) | class ExtSumProcessedData:
method save_data (line 173) | def save_data(data_iter, is_test=False, save_path="./", chunk_size=None):
method _get_files (line 209) | def _get_files(self, root):
method splits (line 225) | def splits(self, root, train_iterable=False):
function preprocess_single_add_oracleids (line 248) | def preprocess_single_add_oracleids(input_data, oracle_mode="greedy", se...
function parallel_preprocess (line 277) | def parallel_preprocess(input_data, preprocess, num_pool=-1):
class ExtSumProcessor (line 309) | class ExtSumProcessor:
method __init__ (line 312) | def __init__(
method list_supported_models (line 362) | def list_supported_models():
method model_name (line 366) | def model_name(self):
method model_name (line 370) | def model_name(self, value):
method get_inputs (line 381) | def get_inputs(batch, device, model_name, train_mode=True):
method preprocess (line 435) | def preprocess(self, input_data_list, oracle_mode="greedy", selections...
method collate (line 455) | def collate(self, data, block_size, device, train_mode=True):
method encode_single (line 487) | def encode_single(self, d, block_size, train_mode=True):
class ExtractiveSummarizer (line 557) | class ExtractiveSummarizer(Transformer):
method __init__ (line 560) | def __init__(
method list_supported_models (line 620) | def list_supported_models():
method fit (line 623) | def fit(
method predict (line 780) | def predict(
method predict_scores (line 881) | def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, ve...
method save_model (line 912) | def save_model(self, full_name=None):
FILE: utils_nlp/models/transformers/named_entity_recognition.py
class TokenClassificationProcessor (line 27) | class TokenClassificationProcessor:
method __init__ (line 40) | def __init__(self, model_name="bert-base-cased", to_lower=False, cache...
method get_inputs (line 52) | def get_inputs(batch, device, model_name, train_mode=True):
method create_label_map (line 88) | def create_label_map(label_lists, trailing_piece_tag="X"):
method preprocess (line 110) | def preprocess(
class TokenClassifier (line 272) | class TokenClassifier(Transformer):
method __init__ (line 285) | def __init__(self, model_name="bert-base-cased", num_labels=2, cache_d...
method list_supported_models (line 295) | def list_supported_models():
method fit (line 298) | def fit(
method predict (line 410) | def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbos...
method get_predicted_token_labels (line 442) | def get_predicted_token_labels(self, predictions, label_map, dataset):
method get_true_test_labels (line 492) | def get_true_test_labels(self, label_map, dataset):
FILE: utils_nlp/models/transformers/question_answering.py
function _list_supported_models (line 83) | def _list_supported_models():
class QAProcessor (line 87) | class QAProcessor:
method __init__ (line 109) | def __init__(
method model_name (line 127) | def model_name(self):
method model_name (line 131) | def model_name(self, value):
method model_type (line 143) | def model_type(self):
method get_inputs (line 147) | def get_inputs(batch, device, model_name, train_mode=True):
method list_supported_models (line 183) | def list_supported_models():
method preprocess (line 186) | def preprocess(
method postprocess (line 331) | def postprocess(
class QAResult (line 450) | class QAResult(QAResult_):
class QAResultExtended (line 483) | class QAResultExtended(QAResultExtended_):
class AnswerExtractor (line 509) | class AnswerExtractor(Transformer):
method __init__ (line 528) | def __init__(
method list_supported_models (line 539) | def list_supported_models():
method fit (line 542) | def fit(
method predict (line 663) | def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbos...
function postprocess_bert_answer (line 733) | def postprocess_bert_answer(
function postprocess_xlnet_answer (line 1029) | def postprocess_xlnet_answer(
function _is_iterable_but_not_string (line 1280) | def _is_iterable_but_not_string(obj):
function _create_qa_example (line 1285) | def _create_qa_example(qa_input, is_training):
function _create_qa_features (line 1392) | def _create_qa_features(
function _get_final_text (line 1786) | def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging...
function _get_best_indexes (line 1884) | def _get_best_indexes(logits, n_best_size):
function _compute_softmax (line 1896) | def _compute_softmax(scores):
FILE: utils_nlp/models/transformers/sequence_classification.py
class Processor (line 23) | class Processor:
method __init__ (line 38) | def __init__(self, model_name="bert-base-cased", to_lower=False, cache...
method get_inputs (line 50) | def get_inputs(batch, device, model_name, train_mode=True):
method text_transform (line 86) | def text_transform(text, tokenizer, max_len=MAX_SEQ_LEN):
method text_pair_transform (line 118) | def text_pair_transform(text_1, text_2, tokenizer, max_len=MAX_SEQ_LEN):
method dataset_from_dataframe (line 186) | def dataset_from_dataframe(
class SequenceClassifier (line 210) | class SequenceClassifier(Transformer):
method __init__ (line 211) | def __init__(self, model_name="bert-base-cased", num_labels=2, cache_d...
method list_supported_models (line 221) | def list_supported_models():
method fit (line 224) | def fit(
method predict (line 336) | def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbos...
FILE: utils_nlp/models/xlnet/common.py
class Language (line 12) | class Language(Enum):
class Tokenizer (line 20) | class Tokenizer:
method __init__ (line 21) | def __init__(
method preprocess_classification_tokens (line 33) | def preprocess_classification_tokens(self, examples, max_seq_length):
function log_xlnet_params (line 114) | def log_xlnet_params(local_dict):
FILE: utils_nlp/models/xlnet/sequence_classification.py
class XLNetSequenceClassifier (line 21) | class XLNetSequenceClassifier:
method __init__ (line 24) | def __init__(
method fit (line 82) | def fit(
method predict (line 274) | def predict(
Condensed preview — 249 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,368K chars).
[
{
"path": ".amlignore",
"chars": 16,
"preview": "data/\nexamples/\n"
},
{
"path": ".bumpversion.cfg",
"chars": 348,
"preview": "[bumpversion]\ncurrent_version = 1.0.0\ncommit = True\ntag = True\nmessage = \"Bump version: {current_version} -> {new_versio"
},
{
"path": ".flake8",
"chars": 807,
"preview": "[flake8]\n# Intial set of rules\n# Feel Free to add any new rule here with description of what it does.\n\n# E203\tWhitespace"
},
{
"path": ".github/ISSUE_TEMPLATE/bug_report.md",
"chars": 542,
"preview": "---\nname: Bug report\nabout: Create a report to help us improve\ntitle: \"[BUG] \"\nlabels: 'bug'\nassignees: ''\n\n---\n\n### Des"
},
{
"path": ".github/ISSUE_TEMPLATE/feature_request.md",
"chars": 386,
"preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: \"[FEATURE] \"\nlabels: 'enhancement'\nassignees: '"
},
{
"path": ".github/ISSUE_TEMPLATE/general-ask.md",
"chars": 203,
"preview": "---\nname: General ask\nabout: Technical/non-technical asks about the repo\ntitle: \"[ASK] \"\nlabels: ''\nassignees: ''\n\n---\n\n"
},
{
"path": ".github/ISSUE_TEMPLATE.md",
"chars": 677,
"preview": "### Description\n<!--- Describe your issue/bug/request in detail -->\n\n\n### In which platform does it happen?\n<!--- Descri"
},
{
"path": ".github/PULL_REQUEST_TEMPLATE.md",
"chars": 660,
"preview": "### Description\n<!--- Describe your changes in detail -->\n<!--- Why is this change required? What problem does it solve?"
},
{
"path": ".gitignore",
"chars": 1710,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": ".pre-commit-config.yaml",
"chars": 225,
"preview": "repos:\n- repo: https://github.com/ambv/black\n rev: stable\n hooks:\n - id: black\n language_version: python"
},
{
"path": "CONTRIBUTING.md",
"chars": 5780,
"preview": "# Contribution Guidelines\n\nContribution are welcome! Here's a few things to know:\n\n- [Contribution Guidelines](#contribu"
},
{
"path": "DatasetReferences.md",
"chars": 11471,
"preview": "MICROSOFT PROVIDES THE DATASETS ON AN \"AS IS\" BASIS. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CO"
},
{
"path": "LICENSE",
"chars": 1183,
"preview": " MIT License\r\n\r\n Copyright (c) Microsoft Corporation. All rights reserved.\r\n\r\n Permission is hereby granted, fr"
},
{
"path": "MANIFEST.in",
"chars": 128,
"preview": "graft utils_nlp\n\nglobal-exclude *.py[cod] __pycache__ *.so *.dylib\n\nexclude README.md\nexclude SETUP.md\nexclude CONTRIBUT"
},
{
"path": "NOTICE.txt",
"chars": 49207,
"preview": "NOTICES AND INFORMATION\nDo Not Translate or Localize\n\nThis software incorporates material from third parties. Microsoft "
},
{
"path": "README.md",
"chars": 14156,
"preview": "<img src=\"NLP-Logo.png\" align=\"right\" alt=\"\" width=\"300\"/>\n\n\n# NLP Best Practices\n\nIn recent years, natural language pro"
},
{
"path": "SETUP.md",
"chars": 12018,
"preview": "# Setup Guide\n\nThis document describes how to setup all the dependencies to run the notebooks in this repository.\n\nThe r"
},
{
"path": "VERSIONING.md",
"chars": 1827,
"preview": "# Semantic Versioning\n> NOTE: Support for `setuptools_scm` is currently removed due to a known [issue](https://github.co"
},
{
"path": "_config.yml",
"chars": 26,
"preview": "theme: jekyll-theme-cayman"
},
{
"path": "cgmanifest.json",
"chars": 1324,
"preview": "{\n \"Registrations\": [\n {\n \"component\": {\n \"type\": \"git\",\n \"git\": {\n "
},
{
"path": "docker/Dockerfile",
"chars": 1548,
"preview": "FROM nvidia/cuda\n\n# Install Anaconda\n# Non interactive installation instructions can be found \n# https://hub.docker.com/"
},
{
"path": "docs/Makefile",
"chars": 632,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# You can set these varia"
},
{
"path": "docs/README.md",
"chars": 300,
"preview": "# Documentation\n\nTo setup the documentation, first you need to install the dependencies of the cpu environment. For it p"
},
{
"path": "docs/_config.yml",
"chars": 26,
"preview": "theme: jekyll-theme-cayman"
},
{
"path": "docs/source/azureml.rst",
"chars": 332,
"preview": ".. _azureml:\n\nAzureML module\n**************************\n\nAzureML module from NLP utilities.\n\nAzureML utils\n============="
},
{
"path": "docs/source/conf.py",
"chars": 8952,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# -*- coding: utf-8 -*-\n#"
},
{
"path": "docs/source/index.rst",
"chars": 712,
"preview": "\nNLP Utilities\n===================================================\n\nThe `NLP repository <https://github.com/microsoft/nl"
},
{
"path": "examples/README.md",
"chars": 1605,
"preview": "# Examples\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building Natural Languag"
},
{
"path": "examples/annotation/Doccano.md",
"chars": 11439,
"preview": "# Doccano: Text Annotation Tool\n\n## What is Doccano?\n\n[Doccano](https://github.com/chakki-works/doccano) is one of the b"
},
{
"path": "examples/annotation/README.md",
"chars": 402,
"preview": "# Text Annotation\n\nThis folder contains a tutorial that walks through how to deploy text annotation tool on Azure and ho"
},
{
"path": "examples/embeddings/README.md",
"chars": 1799,
"preview": "# Word Embedding\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for training word embe"
},
{
"path": "examples/embeddings/embedding_trainer.ipynb",
"chars": 37402,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Developing Word Embeddings\"\n ]\n"
},
{
"path": "examples/entailment/README.md",
"chars": 1969,
"preview": "# Natural Language Inference (NLI) \n\nThis folder provides end-to-end examples of building Natural Language Inference (N"
},
{
"path": "examples/entailment/entailment_multinli_transformers.ipynb",
"chars": 13843,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/entailment/entailment_xnli_bert_azureml.ipynb",
"chars": 19063,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Natural Language Inference on XNL"
},
{
"path": "examples/model_explainability/README.md",
"chars": 572,
"preview": "# Model Explainability\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for explaining a"
},
{
"path": "examples/model_explainability/interpret_dnn_layers.ipynb",
"chars": 48521,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/named_entity_recognition/README.md",
"chars": 1939,
"preview": "# Named Entity Recognition (NER)\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for bu"
},
{
"path": "examples/named_entity_recognition/ner_wikigold_transformer.ipynb",
"chars": 31496,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/question_answering/README.md",
"chars": 1895,
"preview": "# Question Answering (QA)\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building\n"
},
{
"path": "examples/question_answering/bert_run_squad_azureml.py",
"chars": 43621,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.\n#\n# Licensed under the"
},
{
"path": "examples/question_answering/bidaf_aml_deep_dive.ipynb",
"chars": 37952,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/question_answering/bidaf_config.json",
"chars": 2616,
"preview": "{\r\n \"dataset_reader\": {\r\n \"type\": \"squad\",\r\n \"token_indexers\": {\r\n \"tokens\": {\r\n \"type\": \"single_id\","
},
{
"path": "examples/question_answering/pretrained-BERT-SQuAD-deep-dive-aml.ipynb",
"chars": 43509,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/question_answering/question_answering_squad_transformers.ipynb",
"chars": 43490,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/question_answering/question_answering_system_bidaf_quickstart.ipynb",
"chars": 24426,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Create a Question Answering (QA) "
},
{
"path": "examples/sentence_similarity/README.md",
"chars": 5085,
"preview": "# Sentence Similarity\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for building\nsent"
},
{
"path": "examples/sentence_similarity/automl_local_deployment_aci.ipynb",
"chars": 47720,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<i>Copyright (c) Microsoft Corporat"
},
{
"path": "examples/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb",
"chars": 76341,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<i>Copyright (c) Microsoft Corporat"
},
{
"path": "examples/sentence_similarity/baseline_deep_dive.ipynb",
"chars": 89308,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<i>Copyright (c) Microsoft Corporat"
},
{
"path": "examples/sentence_similarity/bert_encoder.ipynb",
"chars": 6592,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Sentence Similarity with Pretrain"
},
{
"path": "examples/sentence_similarity/bert_senteval.ipynb",
"chars": 50459,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Parallel Experimentation with BER"
},
{
"path": "examples/sentence_similarity/gensen_aml_deep_dive.ipynb",
"chars": 51488,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/sentence_similarity/gensen_config.json",
"chars": 1137,
"preview": "{\n \"training\": {\n \"optimizer\": \"adam\",\n \"clip_c\": 1,\n \"lrate\": 0.0001,\n \"batch_size\": 48,\n \"n_gpus\": 1,\n"
},
{
"path": "examples/sentence_similarity/gensen_local.ipynb",
"chars": 30071,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"\\n\",\n \"Copyright (c) Microsoft C"
},
{
"path": "examples/sentence_similarity/gensen_train.py",
"chars": 23768,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed und"
},
{
"path": "examples/sentence_similarity/gensen_wrapper.py",
"chars": 4972,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\nimport json\nimport os\n\nfro"
},
{
"path": "examples/sentiment_analysis/absa/README.md",
"chars": 1572,
"preview": "# Aspect Based Sentiment Analysis\n\nThis folder contains examples and best practices, written in Jupyter notebooks, for t"
},
{
"path": "examples/sentiment_analysis/absa/absa.ipynb",
"chars": 24921,
"preview": "{\"cells\":[{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Copyright (c) Microsoft Corporation. All rights "
},
{
"path": "examples/sentiment_analysis/absa/absa_azureml.ipynb",
"chars": 25011,
"preview": "{\"cells\":[{\"cell_type\":\"markdown\",\"metadata\":{},\"outputs\":[],\"source\":[\"Copyright (c) Microsoft Corporation. All rights "
},
{
"path": "examples/sentiment_analysis/absa/dataset/data.md",
"chars": 363,
"preview": "# About the Dataset\n\nReview data for this demo is sourced from the text reviews of [Women's E-Commerce Clothing Review]("
},
{
"path": "examples/text_classification/README.md",
"chars": 2248,
"preview": "# Text Classification\nThis folder contains examples and best practices, written in Jupyter notebooks, for building text "
},
{
"path": "examples/text_classification/tc_bert_azureml.ipynb",
"chars": 35965,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/text_classification/tc_mnli_mtdnn.ipynb",
"chars": 71075,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/text_classification/tc_mnli_transformers.ipynb",
"chars": 25453,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/text_classification/tc_multi_languages_transformers.ipynb",
"chars": 22579,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"*Copyright (c) Microsoft Corporatio"
},
{
"path": "examples/text_summarization/abstractive_summarization_bertsum_cnndm_distributed_train.py",
"chars": 8913,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport argparse\nimport os"
},
{
"path": "examples/text_summarization/abstractive_summarization_bertsumabs_cnndm.ipynb",
"chars": 16925,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/text_summarization/abstractive_summarization_minilm_cnndm.ipynb",
"chars": 17945,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb",
"chars": 17579,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/text_summarization/abstractive_summarization_unilm_cnndm.py",
"chars": 3148,
"preview": "import datetime\nimport argparse\nimport jsonlines\n\nimport torch\n\nfrom utils_nlp.models.transformers.abstractive_summariza"
},
{
"path": "examples/text_summarization/extractive_summarization_cnndm_aml_distributed.ipynb",
"chars": 17420,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/text_summarization/extractive_summarization_cnndm_distributed_train.py",
"chars": 8080,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport argparse\nimport os\nimport sys\nimport ti"
},
{
"path": "examples/text_summarization/extractive_summarization_cnndm_transformer.ipynb",
"chars": 24077,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "examples/text_summarization/summarization_evaluation.ipynb",
"chars": 8680,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"Copyright (c) Microsoft Corporation"
},
{
"path": "pyproject.toml",
"chars": 30,
"preview": "[tool.black]\nline-length = 88\n"
},
{
"path": "setup.py",
"chars": 2711,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n#!/usr/bin/env python\n# -"
},
{
"path": "tests/README.md",
"chars": 4113,
"preview": "# Tests\n\nThis project uses unit, smoke and integration tests with Python files and notebooks.\n\n * In the unit tests we j"
},
{
"path": "tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/ci/azureml_integration_tests.yml",
"chars": 2459,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# More info on scheduling"
},
{
"path": "tests/ci/component_governance.yml",
"chars": 655,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull request against th"
},
{
"path": "tests/ci/cpu_integration_tests_linux.yml",
"chars": 1817,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# More info on schedulin"
},
{
"path": "tests/ci/cpu_unit_tests_linux.yml",
"chars": 1560,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull request against th"
},
{
"path": "tests/ci/gpu_integration_tests_linux.yml",
"chars": 1815,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# More info on schedulin"
},
{
"path": "tests/ci/gpu_unit_tests_linux.yml",
"chars": 1321,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull request against th"
},
{
"path": "tests/ci/notebooks_cpu_unit_tests_linux.yml",
"chars": 1324,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull request against th"
},
{
"path": "tests/ci/notebooks_gpu_unit_tests_linux.yml",
"chars": 1326,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Pull request against th"
},
{
"path": "tests/conftest.py",
"chars": 12482,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# NOTE: This file is used"
},
{
"path": "tests/integration/test_ddp_summarization.py",
"chars": 2183,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\ni"
},
{
"path": "tests/integration/test_gpu_utils.py",
"chars": 246,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport torc"
},
{
"path": "tests/integration/test_notebooks_abstractive_summarization_bertsumabs.py",
"chars": 987,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport papermill as pm\nim"
},
{
"path": "tests/integration/test_notebooks_embeddings.py",
"chars": 544,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\r\n# Licensed under the MIT License.\r\n\r\nimport pytest\r\nimport "
},
{
"path": "tests/integration/test_notebooks_entailment.py",
"chars": 2207,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pape"
},
{
"path": "tests/integration/test_notebooks_extractive_summarization.py",
"chars": 1947,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport papermill as pm\nim"
},
{
"path": "tests/integration/test_notebooks_interpretability.py",
"chars": 1043,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport nump"
},
{
"path": "tests/integration/test_notebooks_minilm_abstractive_summarization.py",
"chars": 1974,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport papermill as pm\nimport pytest\nimport sc"
},
{
"path": "tests/integration/test_notebooks_named_entity_recognition.py",
"chars": 831,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pape"
},
{
"path": "tests/integration/test_notebooks_question_answering.py",
"chars": 4184,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pape"
},
{
"path": "tests/integration/test_notebooks_sentence_similarity.py",
"chars": 6554,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pape"
},
{
"path": "tests/integration/test_notebooks_text_classification.py",
"chars": 2751,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport json\nimp"
},
{
"path": "tests/integration/test_notebooks_unilm_abstractive_summarization.py",
"chars": 2012,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport papermill as pm\nimport pytest\nimport sc"
},
{
"path": "tests/notebooks_common.py",
"chars": 466,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\n# Unless manua"
},
{
"path": "tests/smoke/test_dataset.py",
"chars": 774,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\n\n"
},
{
"path": "tests/smoke/test_gpu_utils.py",
"chars": 240,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport torc"
},
{
"path": "tests/smoke/test_word_embeddings.py",
"chars": 1684,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\nimport pytest\n"
},
{
"path": "tests/unit/test_abstractive_summarization_bertsum.py",
"chars": 4997,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport argparse\nimport nl"
},
{
"path": "tests/unit/test_abstractive_summarization_seq2seq.py",
"chars": 8341,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\n\nfrom utils_nlp.models"
},
{
"path": "tests/unit/test_bert_common.py",
"chars": 3130,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils"
},
{
"path": "tests/unit/test_bert_encoder.py",
"chars": 611,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils"
},
{
"path": "tests/unit/test_bert_sentence_encoding.py",
"chars": 957,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils"
},
{
"path": "tests/unit/test_common_pytorch_utils.py",
"chars": 8156,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"PyTorch utils tests.\"\""
},
{
"path": "tests/unit/test_data_loaders.py",
"chars": 3608,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport random\n\nimport num"
},
{
"path": "tests/unit/test_dataset.py",
"chars": 6280,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\n\n"
},
{
"path": "tests/unit/test_dataset_pytorch.py",
"chars": 1737,
"preview": "from utils_nlp.models.transformers.datasets import QADataset\n\n\ndef test_QADataset(qa_test_df):\n dataset = QADataset(\n"
},
{
"path": "tests/unit/test_distributed_sampler.py",
"chars": 1045,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nfrom utils_"
},
{
"path": "tests/unit/test_eval_classification.py",
"chars": 428,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport numpy as np\n\nfrom "
},
{
"path": "tests/unit/test_eval_compute_rouge.py",
"chars": 8111,
"preview": "import os\nimport pytest\nfrom utils_nlp.eval import compute_rouge_perl, compute_rouge_python\n\nABS_TOL = 0.00001\n\nR1R = 0."
},
{
"path": "tests/unit/test_extractive_summarization.py",
"chars": 2615,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport nltk\nimport pytest"
},
{
"path": "tests/unit/test_gensen_utils.py",
"chars": 1856,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\nimport pandas "
},
{
"path": "tests/unit/test_interpreter.py",
"chars": 3453,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License\n\nimport random\n\nimport pyte"
},
{
"path": "tests/unit/test_models_transformers_question_answering.py",
"chars": 11098,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\n\nimport pytest\n"
},
{
"path": "tests/unit/test_notebooks_cpu.py",
"chars": 614,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\nf"
},
{
"path": "tests/unit/test_notebooks_gpu.py",
"chars": 631,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport os\nimport pytest\nf"
},
{
"path": "tests/unit/test_preprocess.py",
"chars": 3297,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pand"
},
{
"path": "tests/unit/test_timer.py",
"chars": 942,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\nimport pytest\nimport tim"
},
{
"path": "tests/unit/test_transformers_sequence_classification.py",
"chars": 2095,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\nimport pand"
},
{
"path": "tests/unit/test_transformers_token_classification.py",
"chars": 1248,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport pytest\n\nfrom utils"
},
{
"path": "tools/README.md",
"chars": 274,
"preview": "# Tools\n\nThis submodule includes:\n1. A [script](generate_conda_file.py) to generate the Conda environment file for runn"
},
{
"path": "tools/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tools/generate_conda_file.py",
"chars": 6406,
"preview": "#!/usr/bin/python\n\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This"
},
{
"path": "tools/generate_requirements_txt.py",
"chars": 1247,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This file outputs a req"
},
{
"path": "tools/remove_pixelserver.py",
"chars": 2200,
"preview": "#!/usr/bin/python\n\n# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nimport"
},
{
"path": "utils_nlp/README.md",
"chars": 3418,
"preview": "# NLP Utilities\n\nModern NLP research and development can involve tedious tasks ranging from data loading, dataset unders"
},
{
"path": "utils_nlp/__init__.py",
"chars": 383,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n__title__ = \"Microsoft NL"
},
{
"path": "utils_nlp/azureml/README.md",
"chars": 587,
"preview": "## [AzureML](.)\n\nThe AzureML submodule contains utilities to connect to a\n[workspace](https://docs.microsoft.com/en-us/a"
},
{
"path": "utils_nlp/azureml/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils_nlp/azureml/azureml_bert_util.py",
"chars": 5235,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# Original source:\n# http"
},
{
"path": "utils_nlp/azureml/azureml_utils.py",
"chars": 5252,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Helper functions for i"
},
{
"path": "utils_nlp/common/README.md",
"chars": 448,
"preview": "## [Common](.)\n\nThis submodule contains high-level common utilities used across multiple algorithms and \nframeworks as w"
},
{
"path": "utils_nlp/common/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils_nlp/common/pytorch_utils.py",
"chars": 6808,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Common PyTorch utiliti"
},
{
"path": "utils_nlp/common/timer.py",
"chars": 1694,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Timer utilities for be"
},
{
"path": "utils_nlp/dataset/README.md",
"chars": 1961,
"preview": "## [Dataset](.)\nThis submodule includes helper functions for downloading datasets and formatting them appropriately as w"
},
{
"path": "utils_nlp/dataset/__init__.py",
"chars": 299,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\nfrom enum import Enum\nimp"
},
{
"path": "utils_nlp/dataset/bbc_hindi.py",
"chars": 7236,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/cnndm.py",
"chars": 10314,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\n# This script reuses some code from https://gi"
},
{
"path": "utils_nlp/dataset/dac.py",
"chars": 7078,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Dataset for Arabic Cla"
},
{
"path": "utils_nlp/dataset/data_loaders.py",
"chars": 5181,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Data loaders for sampl"
},
{
"path": "utils_nlp/dataset/msrpc.py",
"chars": 2574,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/multinli.py",
"chars": 10451,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/ner_utils.py",
"chars": 2009,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Common helper function"
},
{
"path": "utils_nlp/dataset/preprocess.py",
"chars": 5614,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/sentence_selection.py",
"chars": 4527,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script reuses some"
},
{
"path": "utils_nlp/dataset/snli.py",
"chars": 5942,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/squad.py",
"chars": 4080,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\r\n# Licensed under the MIT License.\r\n\r\nimport os\r\nimport json"
},
{
"path": "utils_nlp/dataset/stsbenchmark.py",
"chars": 3839,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/url_utils.py",
"chars": 4394,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Common utilities for d"
},
{
"path": "utils_nlp/dataset/wikigold.py",
"chars": 8793,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/xnli.py",
"chars": 3541,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"\n Utility functions"
},
{
"path": "utils_nlp/dataset/xnli_torch_dataset.py",
"chars": 4482,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utility functions for "
},
{
"path": "utils_nlp/eval/README.md",
"chars": 372,
"preview": "## [Evaluation](.)\nThe evaluation (eval) submodule includes functionalities for computing metrics for evaluating NLP mod"
},
{
"path": "utils_nlp/eval/SentEval/.gitignore",
"chars": 136,
"preview": "# SentEval data and .pyc files\n\n\n\n# python\n__pycache__/\n*.py[cod]\n*$py.class\n\n# log files\n*.log\n*.txt\n\n# data files\ndata"
},
{
"path": "utils_nlp/eval/SentEval/LICENSE",
"chars": 1529,
"preview": "BSD License\n\nFor SentEval software\n\nCopyright (c) 2017-present, Facebook, Inc. All rights reserved.\n\nRedistribution and "
},
{
"path": "utils_nlp/eval/SentEval/README.md",
"chars": 14470,
"preview": "# SentEval: evaluation toolkit for sentence embeddings\n\nSentEval is a library for evaluating the quality of sentence emb"
},
{
"path": "utils_nlp/eval/SentEval/senteval/__init__.py",
"chars": 264,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/binary.py",
"chars": 3712,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/engine.py",
"chars": 5966,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/mrpc.py",
"chars": 4202,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/probing.py",
"chars": 6786,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/rank.py",
"chars": 4643,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/sick.py",
"chars": 9261,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/snli.py",
"chars": 4577,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/sst.py",
"chars": 3946,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/sts.py",
"chars": 6848,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/tools/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils_nlp/eval/SentEval/senteval/tools/classifier.py",
"chars": 7737,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/tools/ranking.py",
"chars": 15275,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/tools/relatedness.py",
"chars": 4540,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/tools/validation.py",
"chars": 10358,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/trec.py",
"chars": 3565,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/senteval/utils.py",
"chars": 2717,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/SentEval/setup.py",
"chars": 568,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "utils_nlp/eval/__init__.py",
"chars": 74,
"preview": "from .rouge.compute_rouge import compute_rouge_perl, compute_rouge_python\n"
},
{
"path": "utils_nlp/eval/classification.py",
"chars": 3133,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities functions fo"
},
{
"path": "utils_nlp/eval/evaluate_squad.py",
"chars": 3561,
"preview": "\"\"\" Official evaluation script for v1.1 of the SQuAD dataset. \"\"\"\n\n# Original source:\n# https://github.com/allenai/bi-at"
},
{
"path": "utils_nlp/eval/evaluate_summarization.py",
"chars": 1324,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nfrom random import random, seed\n\nfro"
},
{
"path": "utils_nlp/eval/question_answering.py",
"chars": 12892,
"preview": "\"\"\" Official evaluation script for SQuAD version 2.0.\n Modified by XLNet authors to update `find_best_threshold` scri"
},
{
"path": "utils_nlp/eval/rouge/compute_rouge.py",
"chars": 5046,
"preview": "# Copyright (c) Microsoft Corporation.\n# Licensed under the MIT License.\n\nimport os\nimport shutil\nimport time\nimport tem"
},
{
"path": "utils_nlp/eval/rouge/rouge_ext.py",
"chars": 24565,
"preview": "# This script is adopted from https://github.com/Diego999/py-rouge/blob/master/rouge/rouge.py\n# to compute ROUGE scores "
},
{
"path": "utils_nlp/eval/senteval.py",
"chars": 2232,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities for evaluati"
},
{
"path": "utils_nlp/interpreter/Interpreter.py",
"chars": 6796,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\"\"\"Utilities that enables"
},
{
"path": "utils_nlp/interpreter/README.md",
"chars": 3298,
"preview": "# Towards a Deep and Unified Understanding of Deep Neural Models in NLP\n\nThis submodule contains a tool for explaining h"
},
{
"path": "utils_nlp/interpreter/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils_nlp/language_utils/hi/hindi_stemmer.py",
"chars": 2095,
"preview": "#! /usr/bin/env python3.1\n# Script was downloaded from https://research.variancia.com/hindi_stemmer/\n\"\"\" Lightweight Hin"
},
{
"path": "utils_nlp/models/README.md",
"chars": 1307,
"preview": "# Models\nThe models submodule contains implementations of various algorithms that can be used in addition to external pa"
},
{
"path": "utils_nlp/models/bert/README.md",
"chars": 1515,
"preview": "# BERT-based Classes\n\nThis folder contains utility functions and classes based on the implementation of [Transformers](h"
},
{
"path": "utils_nlp/models/bert/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils_nlp/models/bert/common.py",
"chars": 19171,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\n# This script reuses som"
},
{
"path": "utils_nlp/models/bert/sequence_classification.py",
"chars": 10391,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n\nfrom collections import "
},
{
"path": "utils_nlp/models/bert/sequence_classification_distributed.py",
"chars": 12153,
"preview": "# Copyright (c) Microsoft Corporation. All rights reserved.\n# Licensed under the MIT License.\n\n# This script reuses some"
}
]
// ... and 49 more files (download for full content)
About this extraction
This page contains the full source code of the microsoft/nlp-recipes GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 249 files (2.1 MB), approximately 568.5k tokens, and a symbol index with 963 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.