Repository: allenai/ir_datasets Branch: master Commit: ae24b5302c56 Files: 269 Total size: 3.1 MB Directory structure: gitextract_2j6ggfs5/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ ├── dataset-addition.md │ │ ├── documentation.md │ │ └── feature_request.md │ └── workflows/ │ ├── deploy.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples/ │ ├── adding_datasets.ipynb │ ├── clirmatrix_example.py │ ├── ir_datasets.ipynb │ └── ir_datasets_cli.ipynb ├── ir_datasets/ │ ├── __init__.py │ ├── __main__.py │ ├── commands/ │ │ ├── __init__.py │ │ ├── build_c4_checkpoints.py │ │ ├── build_clueweb_warc_indexes.py │ │ ├── build_download_cache.py │ │ ├── clean.py │ │ ├── doc_fifos.py │ │ ├── export.py │ │ ├── generate_metadata.py │ │ ├── list.py │ │ └── lookup.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── antique.py │ │ ├── aol_ia.py │ │ ├── aquaint.py │ │ ├── argsme.py │ │ ├── base.py │ │ ├── beir.py │ │ ├── c4.py │ │ ├── car.py │ │ ├── clinicaltrials.py │ │ ├── clirmatrix.py │ │ ├── clueweb09.py │ │ ├── clueweb12.py │ │ ├── codec.py │ │ ├── codesearchnet.py │ │ ├── cord19.py │ │ ├── cranfield.py │ │ ├── csl.py │ │ ├── disks45.py │ │ ├── dpr_w100.py │ │ ├── gov.py │ │ ├── gov2.py │ │ ├── hc4.py │ │ ├── highwire.py │ │ ├── istella22.py │ │ ├── kilt.py │ │ ├── lotte.py │ │ ├── medline.py │ │ ├── miracl.py │ │ ├── mmarco.py │ │ ├── mr_tydi.py │ │ ├── msmarco_document.py │ │ ├── msmarco_document_v2.py │ │ ├── msmarco_passage.py │ │ ├── msmarco_passage_v2.py │ │ ├── msmarco_qna.py │ │ ├── nano_beir.py │ │ ├── natural_questions.py │ │ ├── neuclir.py │ │ ├── neumarco.py │ │ ├── nfcorpus.py │ │ ├── nyt.py │ │ ├── pmc.py │ │ ├── sara.py │ │ ├── touche.py │ │ ├── touche_image.py │ │ ├── trec_arabic.py │ │ ├── trec_cast.py │ │ ├── trec_fair.py │ │ ├── trec_mandarin.py │ │ ├── trec_robust04.py │ │ ├── trec_spanish.py │ │ ├── trec_tot.py │ │ ├── trec_tot_2025.py │ │ ├── tripclick.py │ │ ├── tweets2013_ia.py │ │ ├── vaswani.py │ │ ├── 
wapo.py │ │ ├── wikiclir.py │ │ └── wikir.py │ ├── docs/ │ │ ├── antique.yaml │ │ ├── aol-ia.yaml │ │ ├── aquaint.yaml │ │ ├── argsme.yaml │ │ ├── beir.yaml │ │ ├── bibliography.bib │ │ ├── c4.yaml │ │ ├── car.yaml │ │ ├── clinicaltrials.yaml │ │ ├── clirmatrix.yaml │ │ ├── clueweb09.yaml │ │ ├── clueweb12.yaml │ │ ├── codec.yaml │ │ ├── codesearchnet.yaml │ │ ├── cord19.yaml │ │ ├── cranfield.yaml │ │ ├── csl.yaml │ │ ├── disks45.yaml │ │ ├── dpr-w100.yaml │ │ ├── gov.yaml │ │ ├── gov2.yaml │ │ ├── hc4.yaml │ │ ├── highwire.yaml │ │ ├── istella22.yaml │ │ ├── kilt.yaml │ │ ├── lotte.yaml │ │ ├── medline.yaml │ │ ├── miracl.yaml │ │ ├── mmarco.yaml │ │ ├── mr-tydi.yaml │ │ ├── msmarco-document-v2.yaml │ │ ├── msmarco-document.yaml │ │ ├── msmarco-passage-v2.yaml │ │ ├── msmarco-passage.yaml │ │ ├── msmarco-qna.yaml │ │ ├── nano-beir.yaml │ │ ├── natural-questions.yaml │ │ ├── neuclir.yaml │ │ ├── neumarco.yaml │ │ ├── nfcorpus.yaml │ │ ├── nyt.yaml │ │ ├── pmc.yaml │ │ ├── sara.yaml │ │ ├── touche-image.yaml │ │ ├── touche.yaml │ │ ├── trec-arabic.yaml │ │ ├── trec-cast.yaml │ │ ├── trec-fair.yaml │ │ ├── trec-mandarin.yaml │ │ ├── trec-robust04.yaml │ │ ├── trec-spanish.yaml │ │ ├── trec-tot-2025.yaml │ │ ├── trec-tot.yaml │ │ ├── tripclick.yaml │ │ ├── tweets2013-ia.yaml │ │ ├── vaswani.yaml │ │ ├── wapo.yaml │ │ ├── wikiclir.yaml │ │ └── wikir.yaml │ ├── etc/ │ │ ├── downloads.json │ │ └── metadata.json │ ├── formats/ │ │ ├── __init__.py │ │ ├── argsme.py │ │ ├── base.py │ │ ├── clirmatrix.py │ │ ├── csv_fmt.py │ │ ├── extracted_cc.py │ │ ├── jsonl.py │ │ ├── ntcir.py │ │ ├── touche.py │ │ ├── touche_image.py │ │ ├── trec.py │ │ ├── tsv.py │ │ └── webarc.py │ ├── indices/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cache_docstore.py │ │ ├── clueweb_warc.py │ │ ├── indexed_tsv_docstore.py │ │ ├── lz4_pickle.py │ │ ├── numpy_sorted_index.py │ │ └── zpickle_docstore.py │ ├── lazy_libs.py │ ├── log.py │ ├── util/ │ │ ├── __init__.py │ │ ├── docs/ │ │ │ ├── 
__init__.py │ │ │ ├── lazy.py │ │ │ ├── multiple.py │ │ │ └── subset.py │ │ ├── download.py │ │ ├── fileio.py │ │ ├── hash.py │ │ ├── html_parsing.py │ │ ├── metadata.py │ │ └── registry.py │ └── wrappers/ │ ├── __init__.py │ └── html_extractor.py ├── pyproject.toml ├── requirements-test.txt ├── requirements.txt └── test/ ├── __init__.py ├── downloads.py ├── dummy/ │ ├── docs.tsv │ ├── qrels │ └── queries.tsv ├── formats/ │ ├── __init__.py │ ├── test_trec.py │ └── test_tsv.py ├── indices/ │ ├── __init__.py │ ├── lz4_pickle.py │ └── numpy_sorted.py ├── integration/ │ ├── __init__.py │ ├── antique.py │ ├── aol_ia.py │ ├── aquaint.py │ ├── argsme.py │ ├── base.py │ ├── beir.py │ ├── c4.py │ ├── car.py │ ├── clinicaltrials.py │ ├── clirmatrix.py │ ├── clueweb09.py │ ├── clueweb12.py │ ├── codec.py │ ├── codesearchnet.py │ ├── cord19.py │ ├── cranfield.py │ ├── csl.py │ ├── disks45.py │ ├── dpr_w100.py │ ├── dummy.py │ ├── gov.py │ ├── gov2.py │ ├── hc4.py │ ├── highwire.py │ ├── istella22.py │ ├── kilt.py │ ├── lotte.py │ ├── medline.py │ ├── miracl.py │ ├── mmarco.py │ ├── mr_tydi.py │ ├── msmarco_document.py │ ├── msmarco_document_v2.py │ ├── msmarco_passage.py │ ├── msmarco_passage_v2.py │ ├── msmarco_qna.py │ ├── nano_beir.py │ ├── natural_questions.py │ ├── neuclir.py │ ├── neumarco.py │ ├── nfcorpus.py │ ├── nyt.py │ ├── pmc.py │ ├── sara.py │ ├── touche.py │ ├── touche_image.py │ ├── trec_arabic.py │ ├── trec_cast.py │ ├── trec_fair.py │ ├── trec_mandarin.py │ ├── trec_robust04.py │ ├── trec_spanish.py │ ├── trec_tot.py │ ├── trec_tot_2024.py │ ├── trec_tot_2025/ │ │ ├── test_docs_iter.py │ │ ├── test_docs_store.py │ │ ├── test_qrel_iter.py │ │ └── test_queries_iter.py │ ├── tripclick.py │ ├── tweets2013_ia.py │ ├── vaswani.py │ ├── wapo.py │ ├── wikiclir.py │ └── wikir.py ├── metadata.py ├── test_defaulttext.py ├── util/ │ └── docs/ │ ├── __init__.py │ ├── data.py │ ├── test_multiple.py │ └── test_subset.py └── util.py 
================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Errors in behavior or functionality title: '' labels: bug assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **Affected dataset(s)** **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** A clear and concise description of what you expected to happen. **Additional context** Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/dataset-addition.md ================================================ --- name: Dataset Addition about: Propose adding a new dataset, collection of related datasets, or feature to existing dataset title: '' labels: add-dataset assignees: '' --- **Dataset Information:** **Links to Resources:** **Dataset ID(s) & supported entities:** - **Checklist** Mark each task once completed. All should be checked prior to merging a new dataset. - [ ] Dataset definition (in `ir_datasets/datasets/[topid].py`) - [ ] Tests (in `test/integration/[topid].py`) - [ ] Metadata generated (using `ir_datasets generate_metadata` command, should appear in `ir_datasets/etc/metadata.json`) - [ ] Documentation (in `ir_datasets/docs/[topid].yaml`) - [ ] Documentation generated in https://github.com/seanmacavaney/ir-datasets.com/ - [ ] Downloadable content (in `ir_datasets/etc/downloads.json`) - [ ] Download verification action (in `.github/workflows/verify_downloads.yml`). Only one needed per `topid`. - [ ] Any small public files from NIST (or other potentially troublesome files) mirrored in https://github.com/seanmacavaney/irds-mirror/. Mirrored status properly reflected in `downloads.json`. 
**Additional comments/concerns/ideas/etc.** ================================================ FILE: .github/ISSUE_TEMPLATE/documentation.md ================================================ --- name: Documentation about: Additions to or improvements to the documentation title: '' labels: documentation assignees: '' --- **Dataset(s)** **Describe the proposed change** ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: enhancement assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. 
================================================ FILE: .github/workflows/deploy.yml ================================================ name: deploy on: release: types: [created] jobs: pypi: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: '3.x' - name: install-deps run: | python -m pip install --upgrade pip pip install build setuptools wheel twine - name: build run: | python -m build - name: upload env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | twine upload dist/* ================================================ FILE: .github/workflows/test.yml ================================================ name: test on: push: {branches: [master]} # pushes to master pull_request: {} # all PRs jobs: pytest: strategy: matrix: python-version: ['3.10', '3.12'] os: ['ubuntu-latest', 'windows-latest', 'macos-latest'] runs-on: ${{ matrix.os }} steps: - name: Checkout uses: actions/checkout@v4 - name: Install Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Dependencies run: | pip install --upgrade -r requirements.txt -r requirements-test.txt pip install -e '.[all]' - name: Unit Test if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' run: | pip install pytest pytest test/util.py test/metadata.py test/integration/dummy.py test/integration/vaswani.py test/formats/ test/test_defaulttext.py - name: Unit Test (Windows) if: matrix.os == 'windows-latest' shell: cmd run: | pip install pytest pytest test\util.py test\metadata.py test\integration\dummy.py test\integration\vaswani.py test\formats\ test\test_defaulttext.py env: PATH: 'C:/Program Files/zlib/bin/' ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / 
packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ .DS_Store ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ recursive-include ir_datasets *.yaml recursive-include ir_datasets *.bib recursive-include ir_datasets *.json ================================================ FILE: README.md ================================================ # ir_datasets `ir_datasets` is a python package that provides a common interface to many IR ad-hoc ranking benchmarks, training datasets, etc. The package takes care of downloading datasets (including documents, queries, relevance judgments, etc.) when available from public sources. Instructions on how to obtain datasets are provided when they are not publicly available. `ir_datasets` provides a common iterator format to allow them to be easily used in python. It attempts to provide the data in an unaltered form (i.e., keeping all fields and markup), while handling differences in file formats, encoding, etc. Adapters provide extra functionality, e.g., to allow quick lookups of documents by ID. A command line interface is also available. You can find a list of datasets and their features [here](https://ir-datasets.com/). Want a new dataset, added functionality, or a bug fixed? Feel free to post an issue or make a pull request! 
## Getting Started For a quick start with the Python API, check out our Colab tutorials: [Python](https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets.ipynb) [Command Line](https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets_cli.ipynb) Install via pip: ``` pip install ir_datasets ``` If you want the main branch, you install as such: ``` pip install git+https://github.com/allenai/ir_datasets.git ``` If you want to run an editable version locally: ``` $ git clone https://github.com/allenai/ir_datasets $ cd ir_datasets $ pip install -e . ``` Tested with python versions 3.7, 3.8, 3.9, and 3.10. (Minimum python version is 3.7.) ## Features **Python and Command Line Interfaces**. Access datasets both through a simple Python API and via the command line. ```python import ir_datasets dataset = ir_datasets.load('msmarco-passage/train') # Documents for doc in dataset.docs_iter(): print(doc) # GenericDoc(doc_id='0', text='The presence of communication amid scientific minds was equa... # GenericDoc(doc_id='1', text='The Manhattan Project and its atomic bomb helped bring an en... # ... ``` ```bash ir_datasets export msmarco-passage/train docs | head -n2 0 The presence of communication amid scientific minds was equally important to the success of the Manh... 1 The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peacefu... ``` **Automatically downloads source files** (when available). Will download and verify the source files for queries, documents, qrels, etc. when they are publicly available, as they are needed. A CI build checks weekly to ensure that all the downloadable content is available and correct: [![Downloadable Content](https://github.com/seanmacavaney/ir-datasets.com/actions/workflows/verify_downloads.yml/badge.svg)](https://github.com/seanmacavaney/ir-datasets.com/actions/workflows/verify_downloads.yml). 
We mirror some troublesome files on [mirror.ir-datasets.com](https://mirror.ir-datasets.com/), and automatically switch to the mirror when the original source is not available. ```python import ir_datasets dataset = ir_datasets.load('msmarco-passage/train') for doc in dataset.docs_iter(): # Will download and extract MS-MARCO's collection.tar.gz the first time ... for query in dataset.queries_iter(): # Will download and extract MS-MARCO's queries.tar.gz the first time ... ``` **Instructions for dataset access** (when not publicly available). Provides instructions on how to get a copy of the data when it is not publicly available online (e.g., when it requires a data usage agreement). ```python import ir_datasets dataset = ir_datasets.load('trec-arabic') for doc in dataset.docs_iter(): ... # Provides the following instructions: # The dataset is based on the Arabic Newswire corpus. It is available from the LDC via: # To proceed, symlink the source file here: [gives path] ``` **Support for datasets big and small**. By using iterators, supports large datasets that may not fit into system memory, such as ClueWeb. ```python import ir_datasets dataset = ir_datasets.load('clueweb09') for doc in dataset.docs_iter(): ... # will iterate through all ~1B documents ``` **Fixes known dataset issues**. For instance, automatically corrects the document UTF-8 encoding problem in the MS-MARCO passage collection. ```python import ir_datasets dataset = ir_datasets.load('msmarco-passage') docstore = dataset.docs_store() docstore.get('243').text # "John Maynard Keynes, 1st Baron Keynes, CB, FBA (/ˈkeɪnz/ KAYNZ; 5 June 1883 – 21 April [SNIP]" # Naïve UTF-8 decoding yields double-encoding artifacts like: # "John Maynard Keynes, 1st Baron Keynes, CB, FBA (/Ë\x88keɪnz/ KAYNZ; 5 June 1883 â\x80\x93 21 April [SNIP]" # ~~~~~~ ~~ ~~~~~~~~~ ``` **Fast Random Document Access.** Builds data structures that allow fast and efficient lookup of document content. 
For large datasets, such as ClueWeb, uses [checkpoint files](https://ir-datasets.com/clueweb_warc_checkpoints.md) to load documents from source 40x faster than normal. Results are cached for even faster subsequent accesses. ```python import ir_datasets dataset = ir_datasets.load('clueweb12') docstore = dataset.docs_store() docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516']) # {'clueweb12-0000tw-05-00014': ..., 'clueweb12-0000tw-05-12119': ..., 'clueweb12-0106wb-18-19516': ...} ``` **Fancy Iter Slicing.** Sometimes it's helpful to be able to select ranges of data (e.g., for processing document collections in parallel on multiple devices). Efficient implementations of slicing operations allow for much faster dataset partitioning than using `itertools.islice`. ```python import ir_datasets dataset = ir_datasets.load('clueweb12') dataset.docs_iter()[500:1000] # normal slicing behavior # WarcDoc(doc_id='clueweb12-0000tw-00-00502', ...), WarcDoc(doc_id='clueweb12-0000tw-00-00503', ...), ... dataset.docs_iter()[-10:-8] # includes negative indexing # WarcDoc(doc_id='clueweb12-1914wb-28-24245', ...), WarcDoc(doc_id='clueweb12-1914wb-28-24246', ...) dataset.docs_iter()[::100] # includes support for skip (only positive values) # WarcDoc(doc_id='clueweb12-0000tw-00-00000', ...), WarcDoc(doc_id='clueweb12-0000tw-00-00100', ...), ... dataset.docs_iter()[1/3:2/3] # supports proportional slicing (this takes the middle third of the collection) # WarcDoc(doc_id='clueweb12-0605wb-28-12714', ...), WarcDoc(doc_id='clueweb12-0605wb-28-12715', ...), ... 
``` ## Datasets Available datasets include: - [ANTIQUE](https://ir-datasets.com/antique.html) - [AQUAINT](https://ir-datasets.com/aquaint.html) - [BEIR (benchmark suite)](https://ir-datasets.com/beir.html) - [TREC CAR](https://ir-datasets.com/car.html) - [C4](https://ir-datasets.com/c4.html) - [ClueWeb09](https://ir-datasets.com/clueweb09.html) - [ClueWeb12](https://ir-datasets.com/clueweb12.html) - [CLIRMatrix](https://ir-datasets.com/clirmatrix.html) - [CodeSearchNet](https://ir-datasets.com/codesearchnet.html) - [CORD-19](https://ir-datasets.com/cord19.html) - [DPR Wiki100](https://ir-datasets.com/dpr-w100.html) - [GOV](https://ir-datasets.com/gov.html) - [GOV2](https://ir-datasets.com/gov2.html) - [HC4](https://ir-datasets.com/hc4.html) - [Highwire (TREC Genomics 2006-07)](https://ir-datasets.com/highwire.html) - [Medline](https://ir-datasets.com/medline.html) - [MSMARCO (document)](https://ir-datasets.com/msmarco-document.html) - [MSMARCO (passage)](https://ir-datasets.com/msmarco-passage.html) - [MSMARCO (QnA)](https://ir-datasets.com/msmarco-qna.html) - [Natural Questions](https://ir-datasets.com/natural-questions.html) - [NFCorpus (NutritionFacts)](https://ir-datasets.com/nfcorpus.html) - [NYT](https://ir-datasets.com/nyt.html) - [PubMed Central (TREC CDS)](https://ir-datasets.com/pmc.html) - [TREC Arabic](https://ir-datasets.com/trec-arabic.html) - [TREC Fair Ranking 2021](https://ir-datasets.com/trec-fair-2021.html) - [TREC Mandarin](https://ir-datasets.com/trec-mandarin.html) - [TREC Robust 2004](https://ir-datasets.com/trec-robust04.html) - [TREC Spanish](https://ir-datasets.com/trec-spanish.html) - [TripClick](https://ir-datasets.com/tripclick.html) - [Tweets 2013 (Internet Archive)](https://ir-datasets.com/tweets2013-ia.html) - [Vaswani](https://ir-datasets.com/vaswani.html) - [Washington Post](https://ir-datasets.com/wapo.html) - [WikIR](https://ir-datasets.com/wikir.html) There are "subsets" under each dataset. 
For instance, `clueweb12/b13/trec-misinfo-2019` provides the queries and judgments from the [2019 TREC misinformation track](https://trec.nist.gov/data/misinfo2019.html), and `msmarco-document/orcas` provides the [ORCAS dataset](https://microsoft.github.io/msmarco/ORCAS). They tend to be organized with the document collection at the top level. See the ir_datasets docs ([ir_datasets.com](https://ir-datasets.com/)) for details about each dataset, its available subsets, and what data they provide. ## Environment variables - `IR_DATASETS_HOME`: Home directory for ir_datasets data (default `~/.ir_datasets/`). Contains directories for each top-level dataset. - `IR_DATASETS_TMP`: Temporary working directory (default `/tmp/ir_datasets/`). - `IR_DATASETS_DL_TIMEOUT`: Download stream read timeout, in seconds (default `15`). If no data is received within this duration, the connection will be assumed to be dead, and another download may be attempted. - `IR_DATASETS_DL_TRIES`: Default number of download attempts before exception is thrown (default `3`). When the server accepts Range requests, uses them. Otherwise, will download the entire file again. - `IR_DATASETS_DL_DISABLE_PBAR`: Set to `true` to disable the progress bar for downloads. Useful in settings where an interactive console is not available. - `IR_DATASETS_DL_SKIP_SSL`: Set to `true` to disable checking SSL certificates when downloading files. Useful as a short-term solution when SSL certificates expire or are otherwise invalid. Note that this does not disable hash verification of the downloaded content. - `IR_DATASETS_SKIP_DISK_FREE`: Set to `true` to disable checks for enough free space on disk before downloading content or otherwise creating large files. - `IR_DATASETS_SMALL_FILE_SIZE`: The size of files that are considered "small", in bytes. Instructions for linking small files rather than downloading them are not shown. Defaults to 5000000 (5MB). 
## Citing When using datasets provided by this package, be sure to properly cite them. Bibtex for each dataset can be found on the [datasets documentation page](https://ir-datasets.com/). If you use this tool, please cite [our SIGIR resource paper](https://arxiv.org/pdf/2103.02280.pdf): ``` @inproceedings{macavaney:sigir2021-irds, author = {MacAvaney, Sean and Yates, Andrew and Feldman, Sergey and Downey, Doug and Cohan, Arman and Goharian, Nazli}, title = {Simplified Data Wrangling with ir_datasets}, year = {2021}, booktitle = {SIGIR} } ``` ## Credits Contributors to this repository: - Sean MacAvaney (University of Glasgow) - Shuo Sun (Johns Hopkins University) - Thomas Jänich (University of Glasgow) - Jan Heinrich Reimer (Martin Luther University Halle-Wittenberg) - Maik Fröbe (Martin Luther University Halle-Wittenberg) - Eugene Yang (Johns Hopkins University) - Augustin Godinot (NAVERLABS Europe, ENS Paris-Saclay) ================================================ FILE: examples/adding_datasets.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ir_datasets - Adding Datasets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This tutorial covers the process for adding a new dataset to the `ir_datasets` package.\n", "\n", "This tutorial is for datasets that are intended to be added to the main package. 
For an example of an extension, see [this example extension](https://github.com/seanmacavaney/dummy-irds-ext).\n", "\n", "Before starting, we recommend [opening an issue](https://github.com/allenai/ir_datasets/issues/new/choose) so various decisions about how to support the dataset can be discussed.\n", "\n", "There are four files involved in adding a dataset to the `ir_datasets` package:\n", " - `ir_datasets/datasets/[dataset-id].py` - Contains the definition of the dataset and any specialized code for handling it.\n", " - `ir_datasets/etc/downloads.json` - Contains information about how to download and verify dataset source files.\n", " - `ir_datasets/docs/[dataset-id].yaml` - Contains documentation of the dataset.\n", " - `test/integration/[dataset-id].py` - Contains automated tests to ensure the dataset is processed as expected.\n", " \n", "We will now show examples of each of these files for a toy dataset called `dummy`, with files hosted here: https://github.com/seanmacavaney/dummy-irds-ext/tree/master/data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "File: `ir_datasets/datasets/dummy.py`\n", "\n", "```python\n", "import ir_datasets\n", "from ir_datasets.formats import TsvDocs, TsvQueries, TrecQrels\n", "\n", "# A unique identifier for this dataset. 
This should match the file name (with \"-\" instead of \"_\")\n", "NAME = 'dummy'\n", "\n", "# What do the relevance levels in qrels mean?\n", "QREL_DEFS = {\n", " 1: 'relevant',\n", " 0: 'not relevant',\n", "}\n", "\n", "# This message is shown to the user before downloads are started\n", "DUA = 'Please confirm that you agree to the data usage agreement at '\n", "\n", "# An initialization function is used to keep the namespace clean\n", "def _init():\n", " # The directory where this dataset's data files will be stored\n", " base_path = ir_datasets.util.home_path() / NAME\n", " \n", " # Load an object that is used for providing the documentation\n", " documentation = YamlDocumentation(f'docs/{NAME}.yaml')\n", " \n", " # A reference to the downloads file, under the key \"dummy\". (DLC stands for DownLoadable Content)\n", " dlc = DownloadConfig.context(NAME, base_path, dua=DUA)\n", " \n", " # How to process the documents. Since they are in a typical TSV format, we'll use TsvDocs.\n", " # Note that other dataset formats may require you to write a custom docs handler (BaseDocs).\n", " # Note that this doesn't process the documents now; it just defines how they are processed.\n", " docs = TsvDocs(dlc['docs'], namespace=NAME, lang='en')\n", " \n", " # How to process the queries. 
Similar to the documents, you may need to write a custom\n", " # queries handler (BaseQueries).\n", " queries = TsvQueries(dlc['queries'], namespace=NAME, lang='en')\n", " \n", " # Qrels: The qrels file is in the TREC format, so we'll use TrecQrels to process them\n", " qrels = TrecQrels(dlc['qrels'], QREL_DEFS)\n", " \n", " # Package the docs, queries, qrels, and documentation into a Dataset object\n", " dataset = Dataset(docs, queries, qrels, documentation('_'))\n", " \n", " # Register the dataset in ir_datasets\n", " ir_datasets.registry.register(NAME, dataset)\n", " \n", " return dataset # used for exposing dataset to the namespace\n", "\n", "dataset = _init()\n", "```\n", "\n", "Note that you also need to add this file to `ir_datasets/datasets/__init__.py`:\n", "\n", "```python\n", "from . import dummy\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "File: `ir_datasets/etc/downloads.json`\n", "\n", "(add lines like these to the file)\n", "\n", "```json\n", "\"dummy\": {\n", " \"docs\": {\n", " \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv\",\n", " \"expected_md5\": \"c7bb5a1a3a07d51de50e8414245c2be4\",\n", " \"cache_path\": \"docs.tsv\"\n", " },\n", " \"queries\": {\n", " \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv\",\n", " \"expected_md5\": \"08ba86d990cbe6890f727946346964db\",\n", " \"cache_path\": \"queries.tsv\"\n", " },\n", " \"qrels\": {\n", " \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels\",\n", " \"expected_md5\": \"79ed359fe0afa0f67eb39f468d162920\",\n", " \"cache_path\": \"qrels\"\n", " }\n", "}\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "File: `ir_datasets/docs/dummy.yaml`\n", "\n", "```yaml\n", "_: # matches documentation key above\n", " pretty_name: 'Dummy' # a more human-readable way to present this dataset than the dataset-id\n", " desc: '\n", "

\n", "HTML-encoded and human-readable information about this dataset.\n", "Include a brief description of the dataset.\n", "Be sure to include important decisions made when processing it.\n", "Also, link to more information, e.g. websites, papers, etc.\n", "

\n", "' \n", " bibtex: |\n", " @misc{dummy,\n", " title={Dummy: a made-up dataset},\n", " year={2021}\n", " }\n", "```\n", "\n", "To generate the HTML documentation files, run `python -m ir_datasets documentation`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "File: `test/integration/dummy.py`\n", "\n", "```python\n", "from ir_datasets.formats import GenericQuery, GenericDoc, TrecQrel\n", "from .base import DatasetIntegrationTest\n", "\n", "class TestDummy(DatasetIntegrationTest):\n", " def test_docs(self):\n", " # Test that the dataset 'dummy' has 15 documents, and test the specific docs at indices 0, 9, and 14\n", " self._test_docs('dummy', count=15, items={\n", " 0: GenericDoc('T1', 'CUT, CAP AND BALANCE. TAXED ENOUGH ALREADY!'),\n", " 9: GenericDoc('T10', 'Perhaps this is the kind of thinking we need in Washington ...'),\n", " 14: GenericDoc('T15', \"I've been visiting Trump Int'l Golf Links Scotland and the course will be unmatched anywhere in the world. Spectacular!\"),\n", " })\n", "\n", " def test_queries(self):\n", " # Test that the dataset 'dummy' has 4 queries, and test the specific queries at indices 0 and 3\n", " self._test_queries('dummy', count=4, items={\n", " 0: GenericQuery('1', 'republican party'),\n", " 3: GenericQuery('4', 'media'),\n", " })\n", "\n", " def test_qrels(self):\n", " # Test that the dataset 'dummy' has 60 qrels, and test the specific qrels at indices 0, 9, and 59\n", " self._test_qrels('dummy', count=60, items={\n", " 0: TrecQrel('1', 'T1', 0, '0'),\n", " 9: TrecQrel('1', 'T10', 0, '0'),\n", " 59: TrecQrel('4', 'T15', 0, '0'),\n", " })\n", "```\n", "\n", "Note that within a DatasetIntegrationTest, you can use `self._build_test_docs('dummy')`, `self._build_test_queries('dummy')`, `self._build_test_qrels('dummy')` to generate sample test cases. But be sure to check that the tests they generate are properly processed, and feel free to add additional test cases, especially to test dataset-specific handlers." 
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: examples/clirmatrix_example.py ================================================ import ir_datasets """ dataset name clirmatrix/[query language code]/dataset/[doc language code]/[split] options: -------- dataset: bi139-base/bi139-full/multi8 supported query/doc language codes: bi139-base/bi139-full: ['af', 'als', 'am', 'an', 'ar', 'arz', 'ast', 'az', 'azb', 'ba', 'bar', 'be', 'bg', 'bn', 'bpy', 'br', 'bs', 'bug', 'ca', 'cdo', 'ce', 'ceb', 'ckb', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'he', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ilo', 'io', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'li', 'lmo', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'my', 'mzn', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc', 'or', 'os', 'pa', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'ro', 'ru', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', 'tg', 'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vec', 'vi', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zh'] multi8: ['ar', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'zh'] split: train/dev/test1/test2 """ #examples #reference python notebook: https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets.ipynb#scrollTo=n7mY16MRH0hx dataset = ir_datasets.load("clirmatrix/en/bi139-base/zh/test1") docstore = dataset.docs_store() for qrels in dataset.qrels_iter(): 
print(docstore.get(qrels.doc_id)) break for query in dataset.queries_iter(): print(query) break dataset = ir_datasets.load("clirmatrix/en/multi8/zh/train") docstore = dataset.docs_store() for qrels in dataset.qrels_iter(): print(docstore.get(qrels.doc_id)) break for query in dataset.queries_iter(): print(query) break dataset = ir_datasets.load("clirmatrix/an/bi139-full/zh/dev") docstore = dataset.docs_store() for qrels in dataset.qrels_iter(): print(docstore.get(qrels.doc_id)) break for query in dataset.queries_iter(): print(query) break ================================================ FILE: examples/ir_datasets.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ir-datasets.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "snL2s_xoHpph" }, "source": [ "# ir_datasets - Tutorial" ] }, { "cell_type": "markdown", "metadata": { "id": "n7mY16MRH0hx" }, "source": [ "## Getting Started\n", "\n", "We'll start out by installing the package. The package is available on pypi,\n", "so you can install it with your favorite package manager." 
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "M_6mg0PbHaFD", "outputId": "0764869d-bb51-4a9e-edb2-35c9cf56a876" }, "source": [ "!pip install ir_datasets" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Requirement already satisfied: ir_datasets in /usr/local/lib/python3.6/dist-packages (0.2.0)\n", "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (5.4.1)\n", "Requirement already satisfied: trec-car-tools>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (2.5.3)\n", "Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.1.3)\n", "Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (2.23.0)\n", "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (1.19.5)\n", "Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.6.2)\n", "Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.41.1)\n", "Requirement already satisfied: lz4>=3.1.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (3.1.3)\n", "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.6.3)\n", "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.2.5)\n", "Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.2.3)\n", "Requirement already satisfied: typing>=3.6.2 in /usr/local/lib/python3.6/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (3.7.4.3)\n", "Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) 
(1.0.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (2020.12.5)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (2.10)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "DH_aBA7hIDZ4" }, "source": [ "You can now load up your favorite dataset. You can find the full listing of datasets [here](https://ir-datasets.com/all.html). Here's an example for `cord19/trec-covid`:" ] }, { "cell_type": "code", "metadata": { "id": "dFIuPyqdHVQ0" }, "source": [ "import ir_datasets\n", "dataset = ir_datasets.load('cord19/trec-covid')" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ILomHf8CIdOf" }, "source": [ "## Documents\n", "\n", "`doc` entities map a `doc_id` to one or more text fields.\n", "\n", "Let's see how many documents are in this collection. The first time you run this command, it will need to download and process the collection, which may take some time:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v3rCW-JUHpFz", "outputId": "c2cba6ee-3f55-4369-de41-17972b570ad8" }, "source": [ "dataset.docs_count()" ], "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "192509" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] }, { "cell_type": "markdown", "metadata": { "id": "bd2f31HzI2s5" }, "source": [ "Now let's see some docments. You can iterate through the documents in the collection using `docs_iter`. 
Since there's so many, we'll just look at the top 10:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "odfCkvALHXzz", "outputId": "3f4241e6-7828-4fc1-d18b-9610b7874eec" }, "source": [ "for doc in dataset.docs_iter()[:10]:\n", " print(doc)" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). 
Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.')\n", "Cord19Doc(doc_id='02tnwd4m', title='Nitric oxide: a pro-inflammatory mediator in lung disease?', doi='10.1186/rr14', date='2000-08-15', abstract='Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. 
Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung.')\n", "Cord19Doc(doc_id='ejv2xln0', title='Surfactant protein-D and pulmonary host defense', doi='10.1186/rr19', date='2000-08-25', abstract='Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. 
The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.')\n", "Cord19Doc(doc_id='2b73a28n', title='Role of endothelin-1 in lung disease', doi='10.1186/rr44', date='2001-02-22', abstract='Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.')\n", "Cord19Doc(doc_id='9785vg6d', title='Gene expression in epithelial cells in response to pneumovirus infection', doi='10.1186/rr61', date='2001-05-11', abstract='Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. 
Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.')\n", "Cord19Doc(doc_id='zjufx4fo', title='Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis', doi='10.1093/emboj/20.24.7220', date='2001-12-17', abstract='Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to different sequences (‘bodies’) derived from the 3′ end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. 
The sequences of mRNA leader–body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.')\n", "Cord19Doc(doc_id='5yhe786e', title='Debate: Transfusing to normal haemoglobin levels will not improve outcome', doi='10.1186/cc987', date='2001-03-08', abstract='Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.')\n", "Cord19Doc(doc_id='8zchiykl', title='The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001', doi='10.1186/cc1013', date='2001-05-02', abstract=\"The 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. 
The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\")\n", "Cord19Doc(doc_id='8qnrcgnk', title='Heme oxygenase-1 and carbon monoxide in pulmonary medicine', doi='10.1186/1465-9921-4-7', date='2003-08-07', abstract='Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. 
The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.')\n", "Cord19Doc(doc_id='jg13scgo', title='Technical Description of RODS: A Real-time Public Health Surveillance System', doi='10.1197/jamia.m1345', date='2003-09-01', abstract='This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states—Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.')\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "lUB0AUfWJESJ" }, "source": [ "You can see each document is represented as a `Cord19Doc`, which is a `namedtuple`. 
Named tuples are a light-weight data structure that consists of a pre-defined sequence of named fields.\n", "\n", "If you want more information about what document fields are available in this collection, you can\n", "[check the documentation](https://ir-datasets.com/cord19.html#cord19) or inspect the dataset's `docs_cls()`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Cej2STMCI_eh", "outputId": "55e06f14-390f-4dce-9ba2-576a33c50b6c" }, "source": [ "dataset.docs_cls()" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ir_datasets.datasets.cord19.Cord19Doc" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p2BxO7-vJWg7", "outputId": "ea763b20-59e6-4d97-b28b-0f7bbabb2f65" }, "source": [ "dataset.docs_cls()._fields" ], "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('doc_id', 'title', 'doi', 'date', 'abstract')" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2ORa7nztJXyq", "outputId": "a2e836be-ebe4-4f71-f272-2536961ef271" }, "source": [ "dataset.docs_cls().__annotations__" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "OrderedDict([('doc_id', str),\n", " ('title', str),\n", " ('doi', str),\n", " ('date', str),\n", " ('abstract', str)])" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "markdown", "metadata": { "id": "jwcOyKP5Juct" }, "source": [ "Did you notice the `[:10]` above? We can do all sorts of fancy slicing on document iterators. 
Here, we select every other document from the top 10:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VDYPMpOVJZmM", "outputId": "402d5201-ceb9-4bcb-a985-e0052d650994" }, "source": [ "for doc in dataset.docs_iter()[:10:2]:\n", " print(doc)" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). 
Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.')\n", "Cord19Doc(doc_id='ejv2xln0', title='Surfactant protein-D and pulmonary host defense', doi='10.1186/rr19', date='2000-08-25', abstract='Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. 
The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.')\n", "Cord19Doc(doc_id='9785vg6d', title='Gene expression in epithelial cells in response to pneumovirus infection', doi='10.1186/rr61', date='2001-05-11', abstract='Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.')\n", "Cord19Doc(doc_id='5yhe786e', title='Debate: Transfusing to normal haemoglobin levels will not improve outcome', doi='10.1186/cc987', date='2001-03-08', abstract='Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. 
Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.')\n", "Cord19Doc(doc_id='8qnrcgnk', title='Heme oxygenase-1 and carbon monoxide in pulmonary medicine', doi='10.1186/1465-9921-4-7', date='2003-08-07', abstract='Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. 
The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.')\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "iizcVwqmJ-TW" }, "source": [ "Or the last 10 documents:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iVOvbOGOJ4A6", "outputId": "ca1daed3-3394-472f-8bbd-60a3e2faf0a3" }, "source": [ "for doc in dataset.docs_iter()[-10:]:\n", " print(doc)" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ "Cord19Doc(doc_id='7e8r61e7', title='Can Pediatric COVID-19 Testing Sensitivity Be Improved With Sequential Tests?', doi='10.1213/ane.0000000000004982', date='2020-05-26', abstract='')\n", "Cord19Doc(doc_id='r3ud8t8w', title='rAre graphene and graphene-derived products capable of preventing COVID-19 infection?', doi='10.1016/j.mehy.2020.110031', date='2020-06-24', abstract=\"The Severe Acute Respiratory Syndrome CoronaVirus 2 (SARS-CoV-2) causes the new coronavirus disease 2019 (COVID-19). This disease is a severe respiratory tract infection that spread rapidly around the world. In this pandemic situation, the researchers' effort is to understand the targets of the virus, mechanism of their cause, and transmission from animal to human and vice-versa. Therefore, to support COVID-19 research and development, we have proposed approaches based on graphene and graphene-derived nanomaterials against COVID-19.\")\n", "Cord19Doc(doc_id='6jittbis', title='Heterogeneity and plasticity of porcine alveolar macrophage and pulmonary interstitial macrophage isolated from healthy pigs in vitro', doi='10.1242/bio.046342', date='2019-10-15', abstract='This study investigated the heterogeneity and plasticity of porcine alveolar macrophages (PAM) and pulmonary interstitial macrophages (IM) isolated from healthy pigs, including phenotype, function and gene expression. 
Dynamic changes of nitric oxide (NO) levels secreted by PAM and IM with stimulation of different doses of lipopolysaccharide (LPS) were investigated by Griess method, and the viability of the PAM and IM cells was investigated by MTT assay. Flow cytometry, fluorescence quantitative PCR and ELISA techniques were used to measure cell phenotype, gene expression and cytokine secretion, respectively. The PAM and IM cells in normal healthy pigs showed heterogeneity with 95.42±1.51% and 31.99±5.84% of CD163+ macrophage, respectively. The NO level in IM was significantly higher versus PAM after LPS treatment. Consistently, the ratio of Arg I/iNOS in IM was much lower than that in PAM, suggesting that the PAM belong to M2 macrophages and the IM belong to M1 macrophages. The PAM and IM cells in normal healthy pigs also showed plasticity. The Arg I/iNOS ratio and TIMP1/MMP12 ratio were significantly decreased in LPS- or LPS+IFNγ-treated PAM and IM, suggesting that cells were polarized towards M1 macrophages under LPS or LPS+IFNγ stimulation. On the contrary, IL-4 and IL-13 stimulation on PAM and IM lead to M2 polarization. A similar result was found in IL-1β gene expression and TNFα secretion. In conclusion, porcine macrophages have shown heterogeneity and plasticity on polarization under the stimulation of LPS, IFNγ, IL-4 and IL-13.')\n", "Cord19Doc(doc_id='kaku49xd', title='Review of Current Advances in Serologic Testing for COVID-19', doi='10.1093/ajcp/aqaa112', date='2020-06-25', abstract='OBJECTIVES: To examine and summarize the current literature on serologic methods for the detection of antibodies to severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). METHODS: A literature review was performed using searches in databases including PubMed, medRxiv, and bioRxiv. Thirty-two peer-reviewed papers and 23 preprints were examined. 
RESULTS: The studies included lateral flow immunoassay, enzyme-linked immunosorbent assay, chemiluminescence immunoassay, and neutralizing antibody assays. The use of all major SARS-CoV-2 antigens was demonstrated to have diagnostic value. Assays measuring total antibody reactivity had the highest sensitivity. In addition, all the methods provided opportunities to characterize the humoral immune response by isotype. The combined use of IgM and IgG detection resulted in a higher sensitivity than that observed when detecting either isotype alone. Although IgA was rarely studied, it was also demonstrated to be a sensitive marker of infection, and levels correlated with disease severity and neutralizing activity. CONCLUSIONS: The use of serologic testing, in conjunction with reverse transcription polymerase chain reaction testing, was demonstrated to significantly increase the sensitivity of detection of patients infected with SARS-CoV-2. There was conflicting evidence regarding whether antibody titers correlated with clinical severity. However, preliminary investigations indicated some immunoassays may be a surrogate for the prediction of neutralizing antibody titers and the selection of recovered patients for convalescent serum donation.')\n", "Cord19Doc(doc_id='ni94qi4r', title='Liver tests abnormalities in COVID-19: trick or treat?', doi='10.1016/j.jhep.2020.05.033', date='2020-05-27', abstract='')\n", "Cord19Doc(doc_id='z4ro6lmh', title='Rapid radiological improvement of COVID-19 pneumonia after treatment with tocilizumab', doi='10.1007/s15010-020-01449-w', date='2020-06-15', abstract='')\n", "Cord19Doc(doc_id='hi8k8wvb', title='SARS E protein in phospholipid bilayers: an anomalous X-ray reflectivity study', doi='10.1016/j.physb.2004.11.015', date='2005-02-28', abstract='Abstract We report on an anomalous X-ray reflectivity study to locate a labelled residue of a membrane protein with respect to the lipid bilayer. 
From such experiments, important constraints on the protein or peptide conformation can be derived. Specifically, our aim is to localize an iodine-labelled phenylalanine in the SARS E protein, incorporated in DMPC phospholipid bilayers, which are deposited in the form of thick multilamellar stacks on silicon surfaces. Here, we discuss the experimental aspects and the difficulties associated with the Fourier synthesis analysis that gives the electron density profile of the membranes.')\n", "Cord19Doc(doc_id='ma3ndg41', title='Italian Society of Interventional Cardiology (GISE) position paper for Cath lab‐specific preparedness recommendations for healthcare providers in case of suspected, probable or confirmed cases of COVID‐19', doi='10.1002/ccd.28888', date='2020-04-11', abstract='COVID‐19 pandemic raised the issue to guarantee the proper level of care to patients with acute cardiovascular diseases and concomitant suspected or confirmed COVID‐19 and, in the meantime safety and protection of healthcare providers. The aim of this position paper is to provide standards to healthcare facilities and healthcare providers on infection prevention and control measures during the management of suspected and confirmed cases of 2019‐nCoV infection accessing in cath‐lab. 
The document represents the view of the Italian Society of Interventional Cardiology (GISE), and it is based on recommendations from the main World and European Health Organizations (WHO, and ECDC) as well as from the Italian Society of Anesthesia, Analgesia, Resuscitation and Intensive Care (SIAARTI).')\n", "Cord19Doc(doc_id='wh10285j', title=\"Nimble, Together: A Training Program's Response to the COVID-19 Pandemic\", doi='10.1097/sla.0000000000003994', date='2020-04-29', abstract='')\n", "Cord19Doc(doc_id='pnl9th2c', title='Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', doi='10.1016/j.ejvs.2020.04.040', date='2020-05-12', abstract='')\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "nm8hUpVWKOwM" }, "source": [ "You can also select by percentages, e.g., `[:1/3]` selects the first third, `[1/3:2/3]` selects the second third, and `[2/3:]` selects the final third. This is handy when splitting document processing across processes, machines, or GPUs.\n", "\n", "These slices are smart: they avoid processing each document in the collection and jump to the right position in the source files to process." ] }, { "cell_type": "markdown", "metadata": { "id": "kZjhG-5XKqPR" }, "source": [ "Now let's say you know a document's ID and want to find its text. You can use `docs_store()` to accomplish this." 
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hXp1nxooJ6uP", "outputId": "fc52d452-754c-42fd-ae3f-2f37364c4462" }, "source": [ "docstore = dataset.docs_store()\n", "docstore.get('3wuh6k6g')" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. 
In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining')" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "markdown", "metadata": { "id": "m7IN1f9_LMS1" }, "source": [ "Or, a list of IDs. Maybe you're re-ranking these documents." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oXvKBt6-LPtS", "outputId": "733320e0-2762-44ba-ce4c-226295d7878d" }, "source": [ "docstore.get_many(['ax6v6ham', '44l5q07k', '8xm0kacj', '3wuh6k6g', 'fiievwy7'])" ], "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'3wuh6k6g': Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. 
Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining'),\n", " '44l5q07k': Cord19Doc(doc_id='44l5q07k', title='Rôle des animaux vertébrés dans l’épidémiologie des zoonoses', doi='10.1016/s1773-035x(15)30110-6', date='2015-05-31', abstract='Résumé Les zoonoses, distinguées ici des maladies humaines d’origine animale, représentent un ensemble d’entités pathologiques dont les agents responsables circulent régulièrement entre l’espèce humaine et de nombreuses espèces de vertébrés. L’analyse de divers exemples, quelles que soient les voies de transmission et les causes favorisantes de la contamination, met en avant une régulière rareté du passage direct du réservoir animal vers l’espèce humaine, à opposer à la diversité et surtout à la gravité possible des évolutions sanitaires ultérieures possibles, parfois liées à des comportements humains peu adaptés. 
D’un point de vue pratique, il semblerait plus pertinent de se pencher d’abord sur un meilleur contrôle de la diffusion des agents pathogènes au sein des populations humaines que d’agir a priori sur le réservoir animal, qu’il soit domestique ou sauvage, source potentielle de ces agents pathogènes mais selon des modalités difficiles à anticiper. La relation entre biodiversité et santé est abordée à l’occasion de la discussion. Summary Zoonoses, to be distinguished here from human diseases with an animal origin, represent a large quantity of pathological entities the corresponding pathogens of which are regularly shared between human beings and many different vertebrates species. Working on selected examples and situations, whatever the contamination routes and the facilitating reasons, puts into light a real rarity of a direct transmission from the animal reservoir to human beings. On the opposite, the diversity and the possible severity of some of the sanitary consequences, quite often in relation to human behaviours, must be stressed. On a practical point of view, it seems more adapted to try first to improve the control of pathogens diffusion within human populations after a contamination than to try to start to work first on the reservoir, be it domestic or wild, the potential source of these pathogens, but following routes quite difficult to anticipate. 
The relationship between biodiversity and health is discussed.'),\n", " '8xm0kacj': Cord19Doc(doc_id='8xm0kacj', title='Host range of SARS-CoV-2 and implications for public health', doi='10.1016/s2666-5247(20)30069-0', date='2020-06-18', abstract=''),\n", " 'ax6v6ham': Cord19Doc(doc_id='ax6v6ham', title='Close relationship between SARS-coronavirus and group 2 coronavirus.', doi='', date='2006', abstract='The sudden appearance and potential lethality of severe acute respiratory syndrome (SARS)-associated coronavirus (SARS-CoV) in humans has resulted in a focusing of new attention on the determination of both its origins and evolution. The relationship existing between SARS-CoV and other groups of coronaviruses was determined via analyses of phylogenetic trees and comparative genomic analyses of the coronavirus genes: polymerase (Orf1ab), spike (S), envelope (E), membrane (M) and nucleocapsid (N). Although the coronaviruses are traditionally classed into 3 groups, with SARS-CoV forming a 4th group, the phylogenetic position and origins of SARS-CoV remain a matter of some controversy. Thus, we conducted extensive phylogenetic analyses of the genes common to all coronavirus groups, using the Neighbor-joining, Maximum-likelihood, and Bayesian methods. Our data evidenced largely identical topology for all of the obtained phylogenetic trees, thus supporting the hypothesis that the relationship existing between SARS-CoV and group 2 coronavirus is a monophyletic one. Additional comparative genomic studies, including sequence similarity and protein secondary structure analyses, suggested that SARS-CoV may bear a closer relationship with group 2 than with the other coronavirus groups. 
Although our data strongly suggest that group 2 coronaviruses are most closely related with SARS-CoV, further and more detailed analyses may provide us with an increased amount of information regarding the origins and evolution of the coronaviruses, most notably SARS-CoV.'),\n", " 'fiievwy7': Cord19Doc(doc_id='fiievwy7', title='SARS-CoV-2 will continue to circulate in the human population: an opinion from the point of view of the virus-host relationship', doi='10.1007/s00011-020-01352-y', date='2020-04-30', abstract='At the population level, the virus-host relationship is not set up to end with the complete elimination of either or both. Pathogen-resistant individuals will always remain in the host population. In turn, the virus can never completely eliminate the host population, because evolutionarily such an event is a dead end for the virus as an obligate intracellular parasite. A certain existential balance exists in the virus-host relationship. Against this backdrop, viral epidemics and pandemics only become manifest and egregious to human beings when tens and hundreds of thousands of people die and the question emerges what caused the high mortality peaks on the death chart. The answer seems clear; the emerging strain of the virus is new to the host population, and new mutations of the virus and natural selection will lead to a survival of only genetically resistant individuals in a host population. The dangers inherent to a novel virus are due to new features generally inthe molecular structure of proteins, which enable the virus to infect the cells of the host organism more intensively, dramatically challenging host immunity, and thus be transmitted more readily in the host population. 
In this article, we will concentrate on the facts currently available about severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which has caused COVID-19 (coronavirus disease 2019) pandemic and try to predict its development and consequences based on the virus-host relationship. In fact, only two scenarios will occur simultaneously in the very near future: people who are genetically resistant to the virus will get sick, recover, and develop immunity, while people who are sensitive to the virus will need drugs and vaccines, which will have to be researched and developed if they are to recover. If the pandemic does not stop, in a few decades it is anticipated that SARS-CoV-2 will become as safe as the four non-severe acute respiratory syndrome human coronaviruses (HCoV-NL63, HCoV-HKU1, HCoV-OC43, and HCoV-229E) currently circulating but causing low mortality in the human population.')}" ] }, "metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "markdown", "metadata": { "id": "d_oB4z4ALcO0" }, "source": [ "If you don't care about the order they are returned in, you can use `get_many_iter()`. This avoids keeping all the results in memory, and reads them in the order in which they appear on disk." 
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uBIivxhyK2kz", "outputId": "3423a7cd-e9ca-448c-e46b-98a057153601" }, "source": [ "for doc in docstore.get_many_iter(['ax6v6ham', '44l5q07k', '8xm0kacj', '3wuh6k6g', 'fiievwy7']):\n", " print(doc)" ], "execution_count": 12, "outputs": [ { "output_type": "stream", "text": [ "Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. 
In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining')\n", "Cord19Doc(doc_id='ax6v6ham', title='Close relationship between SARS-coronavirus and group 2 coronavirus.', doi='', date='2006', abstract='The sudden appearance and potential lethality of severe acute respiratory syndrome (SARS)-associated coronavirus (SARS-CoV) in humans has resulted in a focusing of new attention on the determination of both its origins and evolution. The relationship existing between SARS-CoV and other groups of coronaviruses was determined via analyses of phylogenetic trees and comparative genomic analyses of the coronavirus genes: polymerase (Orf1ab), spike (S), envelope (E), membrane (M) and nucleocapsid (N). Although the coronaviruses are traditionally classed into 3 groups, with SARS-CoV forming a 4th group, the phylogenetic position and origins of SARS-CoV remain a matter of some controversy. Thus, we conducted extensive phylogenetic analyses of the genes common to all coronavirus groups, using the Neighbor-joining, Maximum-likelihood, and Bayesian methods. Our data evidenced largely identical topology for all of the obtained phylogenetic trees, thus supporting the hypothesis that the relationship existing between SARS-CoV and group 2 coronavirus is a monophyletic one. Additional comparative genomic studies, including sequence similarity and protein secondary structure analyses, suggested that SARS-CoV may bear a closer relationship with group 2 than with the other coronavirus groups. 
Although our data strongly suggest that group 2 coronaviruses are most closely related with SARS-CoV, further and more detailed analyses may provide us with an increased amount of information regarding the origins and evolution of the coronaviruses, most notably SARS-CoV.')\n", "Cord19Doc(doc_id='fiievwy7', title='SARS-CoV-2 will continue to circulate in the human population: an opinion from the point of view of the virus-host relationship', doi='10.1007/s00011-020-01352-y', date='2020-04-30', abstract='At the population level, the virus-host relationship is not set up to end with the complete elimination of either or both. Pathogen-resistant individuals will always remain in the host population. In turn, the virus can never completely eliminate the host population, because evolutionarily such an event is a dead end for the virus as an obligate intracellular parasite. A certain existential balance exists in the virus-host relationship. Against this backdrop, viral epidemics and pandemics only become manifest and egregious to human beings when tens and hundreds of thousands of people die and the question emerges what caused the high mortality peaks on the death chart. The answer seems clear; the emerging strain of the virus is new to the host population, and new mutations of the virus and natural selection will lead to a survival of only genetically resistant individuals in a host population. The dangers inherent to a novel virus are due to new features generally inthe molecular structure of proteins, which enable the virus to infect the cells of the host organism more intensively, dramatically challenging host immunity, and thus be transmitted more readily in the host population. In this article, we will concentrate on the facts currently available about severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which has caused COVID-19 (coronavirus disease 2019) pandemic and try to predict its development and consequences based on the virus-host relationship. 
In fact, only two scenarios will occur simultaneously in the very near future: people who are genetically resistant to the virus will get sick, recover, and develop immunity, while people who are sensitive to the virus will need drugs and vaccines, which will have to be researched and developed if they are to recover. If the pandemic does not stop, in a few decades it is anticipated that SARS-CoV-2 will become as safe as the four non-severe acute respiratory syndrome human coronaviruses (HCoV-NL63, HCoV-HKU1, HCoV-OC43, and HCoV-229E) currently circulating but causing low mortality in the human population.')\n", "Cord19Doc(doc_id='44l5q07k', title='Rôle des animaux vertébrés dans l’épidémiologie des zoonoses', doi='10.1016/s1773-035x(15)30110-6', date='2015-05-31', abstract='Résumé Les zoonoses, distinguées ici des maladies humaines d’origine animale, représentent un ensemble d’entités pathologiques dont les agents responsables circulent régulièrement entre l’espèce humaine et de nombreuses espèces de vertébrés. L’analyse de divers exemples, quelles que soient les voies de transmission et les causes favorisantes de la contamination, met en avant une régulière rareté du passage direct du réservoir animal vers l’espèce humaine, à opposer à la diversité et surtout à la gravité possible des évolutions sanitaires ultérieures possibles, parfois liées à des comportements humains peu adaptés. D’un point de vue pratique, il semblerait plus pertinent de se pencher d’abord sur un meilleur contrôle de la diffusion des agents pathogènes au sein des populations humaines que d’agir a priori sur le réservoir animal, qu’il soit domestique ou sauvage, source potentielle de ces agents pathogènes mais selon des modalités difficiles à anticiper. La relation entre biodiversité et santé est abordée à l’occasion de la discussion. 
Summary Zoonoses, to be distinguished here from human diseases with an animal origin, represent a large quantity of pathological entities the corresponding pathogens of which are regularly shared between human beings and many different vertebrates species. Working on selected examples and situations, whatever the contamination routes and the facilitating reasons, puts into light a real rarity of a direct transmission from the animal reservoir to human beings. On the opposite, the diversity and the possible severity of some of the sanitary consequences, quite often in relation to human behaviours, must be stressed. On a practical point of view, it seems more adapted to try first to improve the control of pathogens diffusion within human populations after a contamination than to try to start to work first on the reservoir, be it domestic or wild, the potential source of these pathogens, but following routes quite difficult to anticipate. The relationship between biodiversity and health is discussed.')\n", "Cord19Doc(doc_id='8xm0kacj', title='Host range of SARS-CoV-2 and implications for public health', doi='10.1016/s2666-5247(20)30069-0', date='2020-06-18', abstract='')\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "Vis7u-VeMX70" }, "source": [ "## Queries\n", "\n", "`queries` (topics) map a `query_id` to one or more text fields. 
Akin to `docs`, you can iterate over queries for a collection using `queries_iter()`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-Tl3npjZLEWB", "outputId": "57bb154e-cb8a-4e6e-b57d-7ae3e5261c22" }, "source": [ "for query in dataset.queries_iter():\n", " print(query)" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "text": [ "TrecQuery(query_id='1', title='coronavirus origin', description='what is the origin of COVID-19', narrative=\"seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans\")\n", "TrecQuery(query_id='2', title='coronavirus response to weather changes', description='how does the coronavirus respond to changes in the weather', narrative='seeking range of information about the SARS-CoV-2 virus viability in different weather/climate conditions as well as information related to transmission of the virus in different climate conditions')\n", "TrecQuery(query_id='3', title='coronavirus immunity', description='will SARS-CoV2 infected people develop immunity? Is cross protection possible?', narrative='seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained due to infection with other coronavirus types')\n", "TrecQuery(query_id='4', title='how do people die from the coronavirus', description='what causes death from Covid-19?', narrative='Studies looking at mechanisms of death from Covid-19.')\n", "TrecQuery(query_id='5', title='animal models of COVID-19', description='what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?', narrative='Papers that describe the results of testing drugs that bind to spike proteins of the virus or any other drugs in any animal models. 
Papers about SARS-CoV-2 infection in cell culture assays are also relevant.')\n", "TrecQuery(query_id='6', title='coronavirus test rapid testing', description='what types of rapid testing for Covid-19 have been developed?', narrative='Looking for studies identifying ways to diagnose Covid-19 more rapidly.')\n", "TrecQuery(query_id='7', title='serological tests for coronavirus', description='are there serological tests that detect antibodies to coronavirus?', narrative='Looking for assays that measure immune response to COVID-19 that will help determine past infection and subsequent possible immunity.')\n", "TrecQuery(query_id='8', title='coronavirus under reporting', description='how has lack of testing availability led to underreporting of true incidence of Covid-19?', narrative='Looking for studies answering questions of impact of lack of complete testing for Covid-19 on incidence and prevalence of Covid-19.')\n", "TrecQuery(query_id='9', title='coronavirus in Canada', description='how has COVID-19 affected Canada', narrative='seeking data related to infections (confirm, suspected, and projected) and health outcomes (symptoms, hospitalization, intensive care, mortality)')\n", "TrecQuery(query_id='10', title='coronavirus social distancing impact', description='has social distancing had an impact on slowing the spread of COVID-19?', narrative=\"seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches\")\n", "TrecQuery(query_id='11', title='coronavirus hospital rationing', description='what are the guidelines for triaging patients infected with coronavirus?', narrative='Seeking information on any guidelines for prioritizing COVID-19 patients infected with coronavirus based on demographics, clinical signs, serology and other tests.')\n", "TrecQuery(query_id='12', title='coronavirus quarantine', description='what are best practices in hospitals and at home in maintaining 
quarantine?', narrative='Seeking information on best practices for activities and duration of quarantine for those exposed and/ infected to COVID-19 virus.')\n", "TrecQuery(query_id='13', title='how does coronavirus spread', description='what are the transmission routes of coronavirus?', narrative='Looking for information on all possible ways to contract COVID-19 from people, animals and objects')\n", "TrecQuery(query_id='14', title='coronavirus super spreaders', description='what evidence is there related to COVID-19 super spreaders', narrative='seeking range of information related to the number and proportion of super spreaders, their patterns of behavior that lead to spread, and potential prevention strategies targeted specifically toward super spreaders')\n", "TrecQuery(query_id='15', title='coronavirus outside body', description='how long can the coronavirus live outside the body', narrative=\"seeking range of information on the SARS-CoV-2's virus's survival in different environments (surfaces, liquids, etc.) 
outside the human body while still being viable for transmission to another human\")\n", "TrecQuery(query_id='16', title='how long does coronavirus survive on surfaces', description='how long does coronavirus remain stable on surfaces?', narrative='Studies of time SARS-CoV-2 remains stable after being deposited from an infected person on everyday surfaces in a household or hospital setting, such as through coughing or touching objects.')\n", "TrecQuery(query_id='17', title='coronavirus clinical trials', description='are there any clinical trials available for the coronavirus', narrative='seeking specific COVID-19 clinical trials ranging from trials in recruitment to completed trials with results')\n", "TrecQuery(query_id='18', title='masks prevent coronavirus', description='what are the best masks for preventing infection by Covid-19?', narrative='What types of masks should or should not be used to prevent infection by Covid-19?')\n", "TrecQuery(query_id='19', title='what alcohol sanitizer kills coronavirus', description='what type of hand sanitizer is needed to destroy Covid-19?', narrative='Studies assessing chemicals and their concentrations needed to destroy the Covid-19 virus.')\n", "TrecQuery(query_id='20', title='coronavirus and ACE inhibitors', description='are patients taking Angiotensin-converting enzyme inhibitors (ACE) at increased risk for COVID-19?', narrative='Looking for information on interactions between coronavirus and angiotensin converting enzyme 2 (ACE2) receptors, risk for patients taking these medications, and recommendations for these patients.')\n", "TrecQuery(query_id='21', title='coronavirus mortality', description='what are the mortality rates overall and in specific populations', narrative='Seeking information on COVID-19 fatality rates in different countries and in different population groups based on gender, blood types, or other factors')\n", "TrecQuery(query_id='22', title='coronavirus heart impacts', description='are cardiac 
complications likely in patients with COVID-19?', narrative='Seeking information on the types, frequency and mechanisms of cardiac complications caused by coronavirus.')\n", "TrecQuery(query_id='23', title='coronavirus hypertension', description='what kinds of complications related to COVID-19 are associated with hypertension?', narrative='seeking specific outcomes that hypertensive (any type) patients are more/less likely to face if infected with the virus')\n", "TrecQuery(query_id='24', title='coronavirus diabetes', description='what kinds of complications related to COVID-19 are associated with diabetes', narrative='seeking specific outcomes that diabetic (any type) patients are more/less likely to face if infected with the virus')\n", "TrecQuery(query_id='25', title='coronavirus biomarkers', description='which biomarkers predict the severe clinical course of 2019-nCOV infection?', narrative='Looking for information on biomarkers that predict disease outcomes in people infected with coronavirus, specifically those that predict severe and fatal outcomes.')\n", "TrecQuery(query_id='26', title='coronavirus early symptoms', description='what are the initial symptoms of Covid-19?', narrative='Studies of patients and the first clinical manifestations they develop upon active infection?')\n", "TrecQuery(query_id='27', title='coronavirus asymptomatic', description='what is known about those infected with Covid-19 but are asymptomatic?', narrative='Studies of people who are known to be infected with Covid-19 but show no symptoms?')\n", "TrecQuery(query_id='28', title='coronavirus hydroxychloroquine', description='what evidence is there for the value of hydroxychloroquine in treating Covid-19?', narrative='Basic science or clinical studies assessing the benefit and harms of treating Covid-19 with hydroxychloroquine.')\n", "TrecQuery(query_id='29', title='coronavirus drug repurposing', description='which SARS-CoV-2 proteins-human proteins interactions indicate potential 
for drug targets. Are there approved drugs that can be repurposed based on this information?', narrative='Seeking information about protein-protein interactions for any of the SARS-CoV-2 structural proteins that represent a promising therapeutic target, and the drug molecules that may inhibit the virus and the host cell receptors at entry step.')\n", "TrecQuery(query_id='30', title='coronavirus remdesivir', description='is remdesivir an effective treatment for COVID-19', narrative='seeking specific information on clinical outcomes in COVID-19 patients treated with remdesivir')\n", "TrecQuery(query_id='31', title='difference between coronavirus and flu', description='How does the coronavirus differ from seasonal flu?', narrative='Includes studies ranging from those focusing on genomic differences to global public health impacts, but must draw direct comparisons between COVID-19 and seasonal influenza.')\n", "TrecQuery(query_id='32', title='coronavirus subtypes', description='Does SARS-CoV-2 have any subtypes, and if so what are they?', narrative='Papers that discuss subtypes of the virus, from named subtypes to speculative subtypes based on genomic or geographic clustering.')\n", "TrecQuery(query_id='33', title='coronavirus vaccine candidates', description='What vaccine candidates are being tested for Covid-19?', narrative='Seeking studies that discuss possible, but specific, COVID-19 vaccines. Includes articles from those describing the mechanisms of action of specific proposed vaccines to actual clinical trials, but excluding articles that do not name a specific vaccine candidate.')\n", "TrecQuery(query_id='34', title='coronavirus recovery', description='What are the longer-term complications of those who recover from COVID-19?', narrative='Seeking information on the health outcomes for those that recover from the virus. 
Excludes studies only focusing on adverse effects related to a particular COVID-19 drug.')\n", "TrecQuery(query_id='35', title='coronavirus public datasets', description='What new public datasets are available related to COVID-19?', narrative='Seeking articles that specifically release new data related to SARS-CoV-2 or COVID-19, including genomic data, patient data, public health data, etc. Articles that reference previously existing datasets are not relevant.')\n", "TrecQuery(query_id='36', title='SARS-CoV-2 spike structure', description='What is the protein structure of the SARS-CoV-2 spike?', narrative='Looking for studies of the structure of the spike protein on the virus using any methods, such as cryo-EM or crystallography')\n", "TrecQuery(query_id='37', title='SARS-CoV-2 phylogenetic analysis', description='What is the result of phylogenetic analysis of SARS-CoV-2 genome sequence?', narrative='Looking for a range of studies which provide the results of phylogenetic network analysis on the SARS-CoV-2 genome')\n", "TrecQuery(query_id='38', title='COVID inflammatory response', description='What is the mechanism of inflammatory response and pathogenesis of COVID-19 cases?', narrative='Looking for a range of studies which describes the inflammatory response cells and pathogenesis during the Coronavirus Disease 2019 (COVID-19) outbreak, including the mechanism of anti-inflammatory drugs, corticosteroids, and vitamin supplements')\n", "TrecQuery(query_id='39', title='COVID-19 cytokine storm', description='What is the mechanism of cytokine storm syndrome on the COVID-19?', narrative='Looking for studies that describes mechanism of development of cytokine storm syndrome among COVID-19 cases and the range of drugs used for the therapy of cytokine storm')\n", "TrecQuery(query_id='40', title='coronavirus mutations', description='What are the observed mutations in the SARS-CoV-2 genome and how often do the mutations occur?', narrative='Looking for studies that describes 
the emergence of genomic diversity of the coronavirus due to recurrent mutations which explore the potential genomic site of the mutation, mechanisms and its potential or observed clinical implications in the pathogenicity of the virus')\n", "TrecQuery(query_id='41', title='COVID-19 in African-Americans', description='What are the impacts of COVID-19 among African-Americans that differ from the rest of the U.S. population?', narrative='Looking for studies that analyze burden of illness and death among the African-American/black racial/ethnic group. This includes potential reasons for transmission, morbidity, and mortality. This may include discussion of other minority groups, but all studies should contain specific information on the health disparities faced by African-Americans in this pandemic.')\n", "TrecQuery(query_id='42', title='Vitamin D and COVID-19', description='Does Vitamin D impact COVID-19 prevention and treatment?', narrative='This includes studies describing possible role of Vitamin D in prevention of COVID-19, suppression of cytokine storm, clinical outcomes, and associations between Vitamin D status and COVID-19 mortality.')\n", "TrecQuery(query_id='43', title='violence during pandemic', description='How has the COVID-19 pandemic impacted violence in society, including violent crimes?', narrative='Looking for analyses and data on how the pandemic is impacting rates of violence, including domestic/family violence related to quarantine.')\n", "TrecQuery(query_id='44', title='impact of masks on coronavirus transmission', description='How much impact do masks have on preventing the spread of the COVID-19?', narrative='Looking for studies of how masks slow SARS-CoV-2 transmission, including impact on R0. 
Studies can include both lab and population studies.')\n", "TrecQuery(query_id='45', title='coronavirus mental health impact', description='How has the COVID-19 pandemic impacted mental health?', narrative='Includes increasing/decreasing rates of depression, anxiety, panic disorder, and other psychiatric and mental health conditions.')\n", "TrecQuery(query_id='46', title='dexamethasone coronavirus', description='what evidence is there for dexamethasone as a treatment for COVID-19?', narrative='Looking for studies on the impact of dexamethasone treatment in COVID-19 patients, including health benefits as well as adverse effects. This also includes specific populations that are benefitted/harmed by dexamethasone.')\n", "TrecQuery(query_id='47', title='COVID-19 outcomes in children', description='what are the health outcomes for children who contract COVID-19?', narrative='Looking for studies on health outcomes in children related to COVID-19. This includes studies attempting to explain the underlying biological mechanisms for why children differ from adults in response to infection.')\n", "TrecQuery(query_id='48', title='school reopening coronavirus', description='what are the benefits and risks of re-opening schools in the midst of the COVID-19 pandemic?', narrative='With the possibility of schools re-opening while the COVID-19 pandemic is still ongoing, this topic is looking for evidence or projections on what the potential implications of this are in terms of COVID-19 cases, hospitalizations, or deaths, as well as other benefits or harms to re-opening schools. 
This includes both the impact on students, teachers, families, and the wider community.')\n", "TrecQuery(query_id='49', title='post-infection COVID-19 immunity', description='do individuals who recover from COVID-19 show sufficient immune response, including antibody levels and T-cell mediated immunity, to prevent re-infection?', narrative='There is concern about re-infection for COVID-19, so this topic is looking for studies suggesting post-infection immunity, including post-infection antibody levels (over time) and evidence for individuals who have been infected more than once.')\n", "TrecQuery(query_id='50', title='mRNA vaccine coronavirus', description='what is known about an mRNA vaccine for the SARS-CoV-2 virus?', narrative='Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.')\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "-0TnBGErO7VS" }, "source": [ "Iterables of namedtuples are handy structures because they are lightweight and do not load all the content into memory. But in case you need that, you can easily convert them into other data structures. Here's an example building a Pandas DataFrame of the queries:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "nFsImZY1PNGa", "outputId": "6bf596b8-8412-48e7-be75-eaf5e703eb01" }, "source": [ "import pandas as pd\n", "pd.DataFrame(dataset.queries_iter())" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_idtitledescriptionnarrative
01coronavirus originwhat is the origin of COVID-19seeking range of information about the SARS-Co...
12coronavirus response to weather changeshow does the coronavirus respond to changes in...seeking range of information about the SARS-Co...
23coronavirus immunitywill SARS-CoV2 infected people develop immunit...seeking studies of immunity developed due to i...
34how do people die from the coronaviruswhat causes death from Covid-19?Studies looking at mechanisms of death from Co...
45animal models of COVID-19what drugs have been active against SARS-CoV o...Papers that describe the results of testing d...
56coronavirus test rapid testingwhat types of rapid testing for Covid-19 have ...Looking for studies identifying ways to diagno...
67serological tests for coronavirusare there serological tests that detect antibo...Looking for assays that measure immune respons...
78coronavirus under reportinghow has lack of testing availability led to un...Looking for studies answering questions of imp...
89coronavirus in Canadahow has COVID-19 affected Canadaseeking data related to infections (confirm, s...
910coronavirus social distancing impacthas social distancing had an impact on slowing...seeking specific information on studies that h...
1011coronavirus hospital rationingwhat are the guidelines for triaging patients ...Seeking information on any guidelines for prio...
1112coronavirus quarantinewhat are best practices in hospitals and at ho...Seeking information on best practices for acti...
1213how does coronavirus spreadwhat are the transmission routes of coronavirus?Looking for information on all possible ways t...
1314coronavirus super spreaderswhat evidence is there related to COVID-19 sup...seeking range of information related to the nu...
1415coronavirus outside bodyhow long can the coronavirus live outside the ...seeking range of information on the SARS-CoV-2...
1516how long does coronavirus survive on surfaceshow long does coronavirus remain stable on su...Studies of time SARS-CoV-2 remains stable afte...
1617coronavirus clinical trialsare there any clinical trials available for th...seeking specific COVID-19 clinical trials rang...
1718masks prevent coronaviruswhat are the best masks for preventing infecti...What types of masks should or should not be us...
1819what alcohol sanitizer kills coronaviruswhat type of hand sanitizer is needed to destr...Studies assessing chemicals and their concentr...
1920coronavirus and ACE inhibitorsare patients taking Angiotensin-converting enz...Looking for information on interactions betwee...
2021coronavirus mortalitywhat are the mortality rates overall and in sp...Seeking information on COVID-19 fatality rate...
2122coronavirus heart impactsare cardiac complications likely in patients w...Seeking information on the types, frequency an...
2223coronavirus hypertensionwhat kinds of complications related to COVID-1...seeking specific outcomes that hypertensive (...
2324coronavirus diabeteswhat kinds of complications related to COVID-1...seeking specific outcomes that diabetic (any t...
2425coronavirus biomarkerswhich biomarkers predict the severe clinical c...Looking for information on biomarkers that pre...
2526coronavirus early symptomswhat are the initial symptoms of Covid-19?Studies of patients and the first clinical man...
2627coronavirus asymptomaticwhat is known about those infected with Covid-...Studies of people who are known to be infected...
2728coronavirus hydroxychloroquinewhat evidence is there for the value of hydrox...Basic science or clinical studies assessing th...
2829coronavirus drug repurposingwhich SARS-CoV-2 proteins-human proteins inter...Seeking information about protein-protein inte...
2930coronavirus remdesiviris remdesivir an effective treatment for COVID-19seeking specific information on clinical outco...
3031difference between coronavirus and fluHow does the coronavirus differ from seasonal ...Includes studies ranging from those focusing o...
3132coronavirus subtypesDoes SARS-CoV-2 have any subtypes, and if so w...Papers that discuss subtypes of the virus, fro...
3233coronavirus vaccine candidatesWhat vaccine candidates are being tested for C...Seeking studies that discuss possible, but spe...
3334coronavirus recoveryWhat are the longer-term complications of thos...Seeking information on the health outcomes for...
3435coronavirus public datasetsWhat new public datasets are available related...Seeking articles that specifically release new...
3536SARS-CoV-2 spike structureWhat is the protein structure of the SARS-CoV-...Looking for studies of the structure of the sp...
3637SARS-CoV-2 phylogenetic analysisWhat is the result of phylogenetic analysis of...Looking for a range of studies which provide t...
3738COVID inflammatory responseWhat is the mechanism of inflammatory response...Looking for a range of studies which describes...
3839COVID-19 cytokine stormWhat is the mechanism of cytokine storm syndro...Looking for studies that describes mechanism o...
3940coronavirus mutationsWhat are the observed mutations in the SARS-Co...Looking for studies that describes the emergen...
4041COVID-19 in African-AmericansWhat are the impacts of COVID-19 among African...Looking for studies that analyze burden of ill...
4142Vitamin D and COVID-19Does Vitamin D impact COVID-19 prevention and ...This includes studies describing possible role...
4243violence during pandemicHow has the COVID-19 pandemic impacted violenc...Looking for analyses and data on how the pande...
4344impact of masks on coronavirus transmissionHow much impact do masks have on preventing th...Looking for studies of how masks slow SARS-CoV...
4445coronavirus mental health impactHow has the COVID-19 pandemic impacted mental ...Includes increasing/decreasing rates of depres...
4546dexamethasone coronaviruswhat evidence is there for dexamethasone as a ...Looking for studies on the impact of dexametha...
4647COVID-19 outcomes in childrenwhat are the health outcomes for children who ...Looking for studies on health outcomes in chil...
4748school reopening coronaviruswhat are the benefits and risks of re-opening ...With the possibility of schools re-opening whi...
4849post-infection COVID-19 immunitydo individuals who recover from COVID-19 show ...There is concern about re-infection for COVID-...
4950mRNA vaccine coronaviruswhat is known about an mRNA vaccine for the SA...Looking for studies specifically focusing on m...
\n", "
" ], "text/plain": [ " query_id ... narrative\n", "0 1 ... seeking range of information about the SARS-Co...\n", "1 2 ... seeking range of information about the SARS-Co...\n", "2 3 ... seeking studies of immunity developed due to i...\n", "3 4 ... Studies looking at mechanisms of death from Co...\n", "4 5 ... Papers that describe the results of testing d...\n", "5 6 ... Looking for studies identifying ways to diagno...\n", "6 7 ... Looking for assays that measure immune respons...\n", "7 8 ... Looking for studies answering questions of imp...\n", "8 9 ... seeking data related to infections (confirm, s...\n", "9 10 ... seeking specific information on studies that h...\n", "10 11 ... Seeking information on any guidelines for prio...\n", "11 12 ... Seeking information on best practices for acti...\n", "12 13 ... Looking for information on all possible ways t...\n", "13 14 ... seeking range of information related to the nu...\n", "14 15 ... seeking range of information on the SARS-CoV-2...\n", "15 16 ... Studies of time SARS-CoV-2 remains stable afte...\n", "16 17 ... seeking specific COVID-19 clinical trials rang...\n", "17 18 ... What types of masks should or should not be us...\n", "18 19 ... Studies assessing chemicals and their concentr...\n", "19 20 ... Looking for information on interactions betwee...\n", "20 21 ... Seeking information on COVID-19 fatality rate...\n", "21 22 ... Seeking information on the types, frequency an...\n", "22 23 ... seeking specific outcomes that hypertensive (...\n", "23 24 ... seeking specific outcomes that diabetic (any t...\n", "24 25 ... Looking for information on biomarkers that pre...\n", "25 26 ... Studies of patients and the first clinical man...\n", "26 27 ... Studies of people who are known to be infected...\n", "27 28 ... Basic science or clinical studies assessing th...\n", "28 29 ... Seeking information about protein-protein inte...\n", "29 30 ... seeking specific information on clinical outco...\n", "30 31 ... 
Includes studies ranging from those focusing o...\n", "31 32 ... Papers that discuss subtypes of the virus, fro...\n", "32 33 ... Seeking studies that discuss possible, but spe...\n", "33 34 ... Seeking information on the health outcomes for...\n", "34 35 ... Seeking articles that specifically release new...\n", "35 36 ... Looking for studies of the structure of the sp...\n", "36 37 ... Looking for a range of studies which provide t...\n", "37 38 ... Looking for a range of studies which describes...\n", "38 39 ... Looking for studies that describes mechanism o...\n", "39 40 ... Looking for studies that describes the emergen...\n", "40 41 ... Looking for studies that analyze burden of ill...\n", "41 42 ... This includes studies describing possible role...\n", "42 43 ... Looking for analyses and data on how the pande...\n", "43 44 ... Looking for studies of how masks slow SARS-CoV...\n", "44 45 ... Includes increasing/decreasing rates of depres...\n", "45 46 ... Looking for studies on the impact of dexametha...\n", "46 47 ... Looking for studies on health outcomes in chil...\n", "47 48 ... With the possibility of schools re-opening whi...\n", "48 49 ... There is concern about re-infection for COVID-...\n", "49 50 ... Looking for studies specifically focusing on m...\n", "\n", "[50 rows x 4 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "markdown", "metadata": { "id": "x8guincjNQA1" }, "source": [ "Again, we can [check the documentation](https://ir-datasets.com/cord19.html#cord19/trec-covid) for information about what fields are available. 
Or we can use `queries_cls()`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lxSvF03NM2Dt", "outputId": "d2ca9dd1-5914-4697-d8b4-8b904c85902a" }, "source": [ "dataset.queries_cls()" ], "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ir_datasets.formats.trec.TrecQuery" ] }, "metadata": { "tags": [] }, "execution_count": 15 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BRBNpFHZNK70", "outputId": "5142178d-5844-4224-e5b1-c7547d7dec86" }, "source": [ "dataset.queries_cls()._fields" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('query_id', 'title', 'description', 'narrative')" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PycaUfckNM80", "outputId": "1a5f74a7-1131-4937-e3e5-1ae06e8015cd" }, "source": [ "dataset.queries_cls().__annotations__" ], "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "OrderedDict([('query_id', str),\n", " ('title', str),\n", " ('description', str),\n", " ('narrative', str)])" ] }, "metadata": { "tags": [] }, "execution_count": 17 } ] }, { "cell_type": "markdown", "metadata": { "id": "p5IA74C3Nlr0" }, "source": [ "## Query Relevance Assessments\n", "\n", "`qrels` (query relevance assessments/judgments) map a `query_id` and `doc_id` to a relevance score.\n", "\n", "You probably guessed it; we can fetch qrels for a dataset with `qrels_iter()`. 
There's a lot of them, so we'll just show them in a DataFrame to start with:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "VYldukNANN8I", "outputId": "998a629b-89f2-4d23-9de9-4289c92e3287" }, "source": [ "pd.DataFrame(dataset.qrels_iter())" ], "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_iddoc_idrelevanceiteration
01005b2j4b24.5
1100fmeepz14
21010vptx320.5
310194oljo12.5
41021q988414
...............
6931350zvop8bxh25
6931450zwf26o6315
6931550zwsvlnwe05
6931650zxr01yln15
6931750zz8wvos915
\n", "

69318 rows × 4 columns

\n", "
" ], "text/plain": [ " query_id doc_id relevance iteration\n", "0 1 005b2j4b 2 4.5\n", "1 1 00fmeepz 1 4\n", "2 1 010vptx3 2 0.5\n", "3 1 0194oljo 1 2.5\n", "4 1 021q9884 1 4\n", "... ... ... ... ...\n", "69313 50 zvop8bxh 2 5\n", "69314 50 zwf26o63 1 5\n", "69315 50 zwsvlnwe 0 5\n", "69316 50 zxr01yln 1 5\n", "69317 50 zz8wvos9 1 5\n", "\n", "[69318 rows x 4 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "markdown", "metadata": { "id": "piCD2tZkOGaQ" }, "source": [ "What does relevance=0, 1, and 2 mean? You can find out with `qrels_defs`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Fy_xf2vQN-Jx", "outputId": "4b6264db-7483-4630-a558-a36c8d7420c8" }, "source": [ "dataset.qrels_defs()" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{0: 'Not Relevant: everything else.',\n", " 1: 'Partially Relevant: the article answers part of the question but would need to be combined with other information to get a complete answer.',\n", " 2: 'Relevant: the article is fully responsive to the information need as expressed by the topic, i.e. answers the Question in the topic. 
The article need not contain all information on the topic, but must, on its own, provide an answer to the question.'}" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "markdown", "metadata": { "id": "B8noJu7xOQ23" }, "source": [ "Of course we can also get information about the `TrecQrel` type using `qrels_cls()`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0q8oZBIQOEnX", "outputId": "64ff29a9-e273-4ead-b94d-f5b5aa8eda48" }, "source": [ "dataset.qrels_cls()" ], "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ir_datasets.formats.trec.TrecQrel" ] }, "metadata": { "tags": [] }, "execution_count": 20 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4tJ_aXCBOYBG", "outputId": "479d1dc0-5d02-429c-a243-7b24feda1e4e" }, "source": [ "dataset.qrels_cls()._fields" ], "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('query_id', 'doc_id', 'relevance', 'iteration')" ] }, "metadata": { "tags": [] }, "execution_count": 21 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QAImnTAtOZoz", "outputId": "dbf62766-db08-424e-c3fa-a22634500907" }, "source": [ "dataset.qrels_cls().__annotations__" ], "execution_count": 22, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "OrderedDict([('query_id', str),\n", " ('doc_id', str),\n", " ('relevance', int),\n", " ('iteration', str)])" ] }, "metadata": { "tags": [] }, "execution_count": 22 } ] }, { "cell_type": "markdown", "metadata": { "id": "GQb4jO-HOf6q" }, "source": [ "## Wrapping Up\n", "\n", "So that's the core functionality. You can find more information in the [documentation](https://ir-datasets.com/)." 
] }, { "cell_type": "code", "metadata": { "id": "EldJW1rhObGy" }, "source": [ "" ], "execution_count": 22, "outputs": [] } ] } ================================================ FILE: examples/ir_datasets_cli.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ir_datasets-cli.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "SP6ophbQq5I0" }, "source": [ "# ir_datasets - Tutorial - CLI\n", "\n", "**NOTE: This tutorial is for the command-line interface. See the other tutorial for Python.**" ] }, { "cell_type": "markdown", "metadata": { "id": "cl8KYrJTq-g0" }, "source": [ "## Getting Started\n", "\n", "We'll start out by installing the package. The package is available on pypi,\n", "so you can install it with your favorite package manager." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vbGhAIREqw1c", "outputId": "1d7fcdb3-93a2-4668-fd7d-787d1471f648" }, "source": [ "!pip install ir_datasets" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Requirement already satisfied: ir_datasets in /usr/local/lib/python3.7/dist-packages (0.3.1)\n", "Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.41.1)\n", "Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.2.3)\n", "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.2.5)\n", "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.6.3)\n", "Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (2.23.0)\n", "Requirement already satisfied: lz4>=3.1.1 in 
/usr/local/lib/python3.7/dist-packages (from ir_datasets) (3.1.3)\n", "Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (3.1.4)\n", "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (5.4.1)\n", "Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.1.3)\n", "Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.6.2)\n", "Requirement already satisfied: trec-car-tools>=2.5.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (2.5.3)\n", "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (1.19.5)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (2020.12.5)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (2.10)\n", "Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (1.0.0)\n", "Requirement already satisfied: typing>=3.6.2 in /usr/local/lib/python3.7/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (3.7.4.3)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "7v_X6XqlrTan" }, "source": [ "## export\n", "\n", "The `ir_datasets export` command outputs data to stdout as TSV,\n", "JSON, and other formats.\n", "\n", "The command format is:\n", "\n", "```\n", "ir_datasets export <dataset-id> <entity-type>\n", "```\n", "\n", "with optional other
arguments following entity-type.\n", "\n", "`<dataset-id>` is the dataset's identifier, found [in the catalog](https://ir-datasets.com/). `<entity-type>` is one of: `docs`, `queries`, `qrels`, `scoreddocs`." ] }, { "cell_type": "markdown", "metadata": { "id": "myI4M6OCsJQL" }, "source": [ "Let's start by getting the top 10 documents from the `cord19/trec-covid` collection. The first time you run the command, it will automatically download the dataset.\n" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wt-QU7q1q-Mn", "outputId": "469d6b3f-4a0f-44db-a42b-ee8fef1232fe" }, "source": [ "!ir_datasets export cord19/trec-covid docs | head -n 10" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "[INFO] No fields supplied. Using all fields: ('doc_id', 'title', 'doi', 'date', 'abstract')\n", "ug7v899j\tClinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia\t10.1186/1471-2334-1-6\t2001-07-04\tOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis.
Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.\n", "02tnwd4m\tNitric oxide: a pro-inflammatory mediator in lung disease?\t10.1186/rr14\t2000-08-15\tInflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. 
Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung.\n", "ejv2xln0\tSurfactant protein-D and pulmonary host defense\t10.1186/rr19\t2000-08-25\tSurfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. 
The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.\n", "2b73a28n\tRole of endothelin-1 in lung disease\t10.1186/rr44\t2001-02-22\tEndothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.\n", "9785vg6d\tGene expression in epithelial cells in response to pneumovirus infection\t10.1186/rr61\t2001-05-11\tRespiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.\n", "zjufx4fo\tSequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis\t10.1093/emboj/20.24.7220\t2001-12-17\tNidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to different sequences (‘bodies’) derived from the 3′ end. 
Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader–body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.\n", "5yhe786e\tDebate: Transfusing to normal haemoglobin levels will not improve outcome\t10.1186/cc987\t2001-03-08\tRecent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. 
In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.\n", "8zchiykl\tThe 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001\t10.1186/cc1013\t2001-05-02\tThe 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\n", "8qnrcgnk\tHeme oxygenase-1 and carbon monoxide in pulmonary medicine\t10.1186/1465-9921-4-7\t2003-08-07\tHeme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. 
Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.\n", "jg13scgo\tTechnical Description of RODS: A Real-time Public Health Surveillance System\t10.1197/jamia.m1345\t2003-09-01\tThis report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states—Pennsylvania and Utah. 
It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "HTQaik0isguS" }, "source": [ "You can export in other formats too. Here's an exporting in JSON-Lines." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XaYh4lwLrTDZ", "outputId": "b50827d6-02e6-409c-bdcf-72dfbfdf1529" }, "source": [ "!ir_datasets export cord19/trec-covid docs --format jsonl | head -n 10" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "{\"doc_id\": \"ug7v899j\", \"title\": \"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia\", \"doi\": \"10.1186/1471-2334-1-6\", \"date\": \"2001-07-04\", \"abstract\": \"OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. 
Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.\"}\n", "{\"doc_id\": \"02tnwd4m\", \"title\": \"Nitric oxide: a pro-inflammatory mediator in lung disease?\", \"doi\": \"10.1186/rr14\", \"date\": \"2000-08-15\", \"abstract\": \"Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO\\u2022) and increased indices of NO\\u2022 -dependent oxidative stress. Although NO\\u2022 is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO\\u2022 to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO\\u2022 -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO\\u2022 generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. 
Instead, it focuses on recent evidence and concepts of the presumed contribution of NO\\u2022 to inflammatory diseases of the lung.\"}\n", "{\"doc_id\": \"ejv2xln0\", \"title\": \"Surfactant protein-D and pulmonary host defense\", \"doi\": \"10.1186/rr19\", \"date\": \"2000-08-25\", \"abstract\": \"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. 
The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.\"}\n", "{\"doc_id\": \"2b73a28n\", \"title\": \"Role of endothelin-1 in lung disease\", \"doi\": \"10.1186/rr44\", \"date\": \"2001-02-22\", \"abstract\": \"Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.\"}\n", "{\"doc_id\": \"9785vg6d\", \"title\": \"Gene expression in epithelial cells in response to pneumovirus infection\", \"doi\": \"10.1186/rr61\", \"date\": \"2001-05-11\", \"abstract\": \"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. 
Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.\"}\n", "{\"doc_id\": \"zjufx4fo\", \"title\": \"Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis\", \"doi\": \"10.1093/emboj/20.24.7220\", \"date\": \"2001-12-17\", \"abstract\": \"Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5\\u2032 end of the genome fused to different sequences (\\u2018bodies\\u2019) derived from the 3\\u2032 end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. 
The sequences of mRNA leader\\u2013body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.\"}\n", "{\"doc_id\": \"5yhe786e\", \"title\": \"Debate: Transfusing to normal haemoglobin levels will not improve outcome\", \"doi\": \"10.1186/cc987\", \"date\": \"2001-03-08\", \"abstract\": \"Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.\"}\n", "{\"doc_id\": \"8zchiykl\", \"title\": \"The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001\", \"doi\": \"10.1186/cc1013\", \"date\": \"2001-05-02\", \"abstract\": \"The 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. 
The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\"}\n", "{\"doc_id\": \"8qnrcgnk\", \"title\": \"Heme oxygenase-1 and carbon monoxide in pulmonary medicine\", \"doi\": \"10.1186/1465-9921-4-7\", \"date\": \"2003-08-07\", \"abstract\": \"Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IX\\u03b1, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. 
The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.\"}\n", "{\"doc_id\": \"jg13scgo\", \"title\": \"Technical Description of RODS: A Real-time Public Health Surveillance System\", \"doi\": \"10.1197/jamia.m1345\", \"date\": \"2003-09-01\", \"abstract\": \"This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states\\u2014Pennsylvania and Utah. 
It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.\"}\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "hhVG2gp6sqdZ" }, "source": [ "If you do not want all the fields, you can specify which ones with `--fields`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kFI8UHbzq6Cu", "outputId": "a06ac0f5-2248-4c09-e5e6-f8b49f5cf29f" }, "source": [ "!ir_datasets export cord19/trec-covid docs --format jsonl --fields doc_id date | head -n 10" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "{\"doc_id\": \"ug7v899j\", \"date\": \"2001-07-04\"}\n", "{\"doc_id\": \"02tnwd4m\", \"date\": \"2000-08-15\"}\n", "{\"doc_id\": \"ejv2xln0\", \"date\": \"2000-08-25\"}\n", "{\"doc_id\": \"2b73a28n\", \"date\": \"2001-02-22\"}\n", "{\"doc_id\": \"9785vg6d\", \"date\": \"2001-05-11\"}\n", "{\"doc_id\": \"zjufx4fo\", \"date\": \"2001-12-17\"}\n", "{\"doc_id\": \"5yhe786e\", \"date\": \"2001-03-08\"}\n", "{\"doc_id\": \"8zchiykl\", \"date\": \"2001-05-02\"}\n", "{\"doc_id\": \"8qnrcgnk\", \"date\": \"2003-08-07\"}\n", "{\"doc_id\": \"jg13scgo\", \"date\": \"2003-09-01\"}\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "WUjwx7i1s5HD" }, "source": [ "The export command works the same way for `queries`, `qrels`, and `scoreddocs` (where available). By default, `qrels` and `scoreddocs` output in the TREC format. But you can choose to export as tsv or jsonl as well." 
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JoeB2aresxAV", "outputId": "872f8a51-ceb2-4c29-84ba-eb503f58ce1d" }, "source": [ "!ir_datasets export cord19/trec-covid queries --fields query_id title | head -n 10" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "1\tcoronavirus origin\n", "2\tcoronavirus response to weather changes\n", "3\tcoronavirus immunity\n", "4\thow do people die from the coronavirus\n", "5\tanimal models of COVID-19\n", "6\tcoronavirus test rapid testing\n", "7\tserological tests for coronavirus\n", "8\tcoronavirus under reporting\n", "9\tcoronavirus in Canada\n", "10\tcoronavirus social distancing impact\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Me_hppfJtRxG", "outputId": "b2fdb388-7eea-4e47-f5ea-859e07fe1b74" }, "source": [ "!ir_datasets export cord19/trec-covid qrels | head -n 10" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "1 4.5 005b2j4b 2\n", "1 4 00fmeepz 1\n", "1 0.5 010vptx3 2\n", "1 2.5 0194oljo 1\n", "1 4 021q9884 1\n", "1 1 02f0opkr 1\n", "1 3.5 047xpt2c 0\n", "1 1 04ftw7k9 0\n", "1 1 05qglt1f 0\n", "1 3 05vx82oo 0\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "2zocHeB1tgKu" }, "source": [ "If you're savvy at the command line, piping can let you capture some dataset statistics pretty easily. 
Here's an example giving the label proportions using `awk`:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vqCnPJOVtaWl", "outputId": "6b041b9e-9b85-47bc-91c1-1595c9d5968b" }, "source": [ "!ir_datasets export cord19/trec-covid qrels | awk '{a[$4]+=1; s+=1}END{for (x in a){print x, a[x], a[x]/s}}'" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "-1 2 2.88525e-05\n", "0 42652 0.615309\n", "1 11055 0.159482\n", "2 15609 0.22518\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "IgE2qowjuZV8" }, "source": [ "## lookup\n", "\n", "You can look up documents by their ID with the `ir_datasets lookup` command. The command format is:\n", "\n", "```\n", "ir_datasets lookup ...\n", "```\n", "\n", "These lookups are generally O(1) and memory-efficient." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dBrzCdwbtug4", "outputId": "cd3522f7-3acf-450b-ae68-ac72ce9f0877" }, "source": [ "!ir_datasets lookup cord19/trec-covid 005b2j4b 00fmeepz 010vptx3" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "[INFO] No fields supplied. Using all fields: ('doc_id', 'title', 'doi', 'date', 'abstract')\n", "005b2j4b\tMonophyletic Relationship between Severe Acute Respiratory Syndrome Coronavirus and Group 2 Coronaviruses\t10.1086/382892\t2004-05-01\tAlthough primary genomic analysis has revealed that severe acute respiratory syndrome coronavirus (SARS CoV) is a new type of coronavirus, the different protein trees published in previous reports have provided no conclusive evidence indicating the phylogenetic position of SARS CoV. To clarify the phylogenetic relationship between SARS CoV and other coronaviruses, we compiled a large data set composed of 7 concatenated protein sequences and performed comprehensive analyses, using the maximum-likelihood, Bayesian-inference, and maximum-parsimony methods. 
All resulting phylogenetic trees displayed an identical topology and supported the hypothesis that the relationship between SARS CoV and group 2 CoVs is monophyletic. Relationships among all major groups were well resolved and were supported by all statistical analyses.\n", "00fmeepz\tComprehensive overview of COVID-19 based on current evidence\t\t2020\tIn December 2019, twenty-seven pneumonia patients with unknown causes originated in South China seafood market in Wuhan. The virus infection spread rapidly and swept through China in less than a month. Subsequently, the virus was proven a novel coronavirus and named SARS-CoV-2. The outbreak of novel coronavirus has been determined as a Public Health Emergency of International Concern (PHEIC) by WHO on January 31, 2020. Similar to other coronaviruses like the Middle East Respiratory Syndrome (MERS) CoV and Severe Acute Respiratory Syndrome (SARS) CoV, the novel coronavirus was reported to spread via respiratory droplets and close contact from human to human, which means the virus is highly infectious and dangerous. Unfortunately, till now the virus has spread to over 200 countries/territories/areas around the world and the Coronavirus Disease 2019 (COVID-19) outbreak is continuing to grow. Currently, information sharing and transparency are essential for risk assessment and epidemic control in all endemic areas. 
In this article, we compared SARS-CoV-2 with SARS-CoV and influenza virus, discussed current researching progress of COVID-19, including clinical characteristics, pathological changes, treatment measures, and so on.\n", "010vptx3\tThe SARS, MERS and novel coronavirus (COVID-19) epidemics, the newest and biggest global health threats: what lessons have we learned?\t10.1093/ije/dyaa033\t2020-02-22\tOBJECTIVES: To provide an overview of the three major deadly coronaviruses and identify areas for improvement of future preparedness plans, as well as provide a critical assessment of the risk factors and actionable items for stopping their spread, utilizing lessons learned from the first two deadly coronavirus outbreaks, as well as initial reports from the current novel coronavirus (COVID-19) epidemic in Wuhan, China. METHODS: Utilizing the Centers for Disease Control and Prevention (CDC, USA) website, and a comprehensive review of PubMed literature, we obtained information regarding clinical signs and symptoms, treatment and diagnosis, transmission methods, protection methods and risk factors for Middle East Respiratory Syndrome (MERS), Severe Acute Respiratory Syndrome (SARS) and COVID-19. Comparisons between the viruses were made. RESULTS: Inadequate risk assessment regarding the urgency of the situation, and limited reporting on the virus within China has, in part, led to the rapid spread of COVID-19 throughout mainland China and into proximal and distant countries. Compared with SARS and MERS, COVID-19 has spread more rapidly, due in part to increased globalization and the focus of the epidemic. Wuhan, China is a large hub connecting the North, South, East and West of China via railways and a major international airport. The availability of connecting flights, the timing of the outbreak during the Chinese (Lunar) New Year, and the massive rail transit hub located in Wuhan has enabled the virus to perforate throughout China, and eventually, globally. 
CONCLUSIONS: We conclude that we did not learn from the two prior epidemics of coronavirus and were ill-prepared to deal with the challenges the COVID-19 epidemic has posed. Future research should attempt to address the uses and implications of internet of things (IoT) technologies for mapping the spread of infection.\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "v6leewIGvYKf" }, "source": [ "You can also specify the fields to return." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GkVpdPsXvFIq", "outputId": "106acd40-9d69-495e-e9f8-7191b1c81d78" }, "source": [ "!ir_datasets lookup cord19/trec-covid 005b2j4b 00fmeepz 010vptx3 --fields doc_id title" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ "005b2j4b\tMonophyletic Relationship between Severe Acute Respiratory Syndrome Coronavirus and Group 2 Coronaviruses\n", "00fmeepz\tComprehensive overview of COVID-19 based on current evidence\n", "010vptx3\tThe SARS, MERS and novel coronavirus (COVID-19) epidemics, the newest and biggest global health threats: what lessons have we learned?\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "DOm67Sbsvjon" }, "source": [ "And of course, you can do all sorts of fancy piping here as well. 
Let's find all highly-relevant documents for Query 50:" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yVMHEFRXvLfh", "outputId": "e4ad1626-5057-40a3-c517-9cc12ad0052a" }, "source": [ "!ir_datasets lookup cord19/trec-covid $(ir_datasets export cord19/trec-covid qrels | awk '$1==50&&$4==2{printf \"%s \", $3}') --fields doc_id title" ], "execution_count": 10, "outputs": [ { "output_type": "stream", "text": [ "1v0f2dtx\tSARS-CoV-2 mRNA Vaccine Development Enabled by Prototype Pathogen Preparedness\n", "3a6l4ktt\tmRNA Vaccines: Possible Tools to Combat SARS-CoV-2\n", "6emy92i5\tmRNA Vaccines: Possible Tools to Combat SARS-CoV-2\n", "7q6xi2xx\tAn Evidence Based Perspective on mRNA-SARS-CoV-2 Vaccine Development\n", "akbq0ogs\tPhase 1/2 Study to Describe the Safety and Immunogenicity of a COVID-19 RNA Vaccine Candidate (BNT162b1) in Adults 18 to 55 Years of Age: Interim Report\n", "dcg6ui9d\tAn Evidence Based Perspective on mRNA-SARS-CoV-2 Vaccine Development\n", "g1j8wk11\tImmune-mediated approaches against COVID-19\n", "gidlrnu8\tDeconvoluting Lipid Nanoparticle Structure for Messenger RNA Delivery\n", "ino9srb6\tAn overview on COVID-19: reality and expectation\n", "kf7yz3oz\tVaccines and Therapies in Development for SARS-CoV-2 Infections.\n", "oiu80002\tSelf-amplifying RNA SARS-CoV-2 lipid nanoparticle vaccine candidate induces high neutralizing antibody titers in mice\n", "ozf05l65\tPreparing for Pandemics: RNA Vaccines at the Forefront\n", "q77da2y3\tDesigning a novel mRNA vaccine against SARS-CoV-2: An immunoinformatics approach\n", "u35rryzi\tVaccines and Therapies in Development for SARS-CoV-2 Infections\n", "v0m90h3n\tLinearDesign: Efficient Algorithms for Optimized mRNA Sequence Design\n", "vm3oirur\tPreclinical data from SARS-CoV-2 mRNA vaccine\n", "wptc95qb\tA recombinant Lactobacillus plantarum strain expressing the spike protein of SARS-CoV-2\n", "wtmjt3hf\tDevelopment of a COVID-19 vaccine based on the 
receptor binding domain displayed on virus-like particles\n", "wx1v0h0q\tDesigning a multi-epitope peptide-based vaccine against SARS-CoV-2\n", "wxagjqbt\tCOVID-19 Vaccine Candidates: Prediction and Validation of 174 SARS-CoV-2 Epitopes\n", "wzdgizoo\tQuantitative measurement of activity of JAK-STAT signaling pathways in blood samples and immune cells to predict innate and adaptive cellular immune response to viral infection and accelerate vaccine development\n", "wzv8n34v\tSingle-dose replicating RNA vaccine induces neutralizing antibodies against SARS-CoV-2 in nonhuman primates\n", "x5zvwtj7\tVaccines against Coronaviruses: The State of the Art\n", "xbze5s3c\tAn Evidence Based Perspective on mRNA-SARS-CoV-2 Vaccine Development\n", "xeq0dq6u\tSARS-CoV-2 will constantly sweep its tracks: a vaccine containing CpG motifs in ‘lasso’ for the multi-faced virus\n", "xhm97wy2\tRNA to the rescue: RNA is one of the most promising targets for drug development given its wide variety of uses\n", "xieqswct\tDevelopment of CRISPR as a prophylactic strategy to combat novel coronavirus and influenza\n", "xjg2e8be\tIn silico approach for designing of a multi-epitope based vaccine against novel Coronavirus (SARS-COV-2)\n", "xqgqq55q\tEmerging vaccine delivery systems for COVID-19: Functionalised silica nanoparticles offer a potentially safe and effective alternative delivery system for DNA/RNA vaccines and may be useful in the hunt for a COVID-19 vaccine\n", "xt8tld2i\tThe vaccine journey for COVID-19: a comprehensive systematic review of current clinical trials in humans\n", "xy7w8hbz\tCharacterization of the receptor-binding domain (RBD) of 2019 novel coronavirus: implication for development of RBD protein as a viral attachment inhibitor and vaccine\n", "y87tq9wu\tCurrent Status of Multiple Drug Molecules, and Vaccines: An Update in SARS-CoV-2 Therapeutics\n", "y883anmp\tSARS-CoV-2 vaccines: 'Warp Speed' needs mind melds not warped minds.\n", "ygwdldae\tImmunization with the 
receptor–binding domain of SARS-CoV-2 elicits antibodies cross-neutralizing SARS-CoV-2 and SARS-CoV without antibody-dependent enhancement\n", "ykzsoafe\tOptimization of antigen dose for a receptor-binding domain-based subunit vaccine against MERS coronavirus\n", "ymvrserl\tImmunoinformatic identification of B cell and T cell epitopes in the SARS-CoV-2 proteome\n", "yn79jn83\tAnalysis of a SARS-CoV-2-Infected Individual Reveals Development of Potent Neutralizing Antibodies with Limited Somatic Mutation\n", "yneir8ab\tCOVID-19 vaccine development pipeline gears up\n", "ypkiptvh\tUpdate on therapeutic approaches and emerging therapies for SARS-CoV-2 virus\n", "ys8cs84y\tExpected immune recognition of COVID-19 virus by memory from earlier infections with common coronaviruses in a large part of the world population\n", "ywia2ok7\tThe crystal structure of nsp10-nsp16 heterodimer from SARS-CoV-2 in complex with S-adenosylmethionine\n", "yx3j6373\tCOVID-19: immunopathology and its implications for therapy\n", "yxiacesg\tStructural and functional conservation of the programmed -1 ribosomal frameshift signal of SARS coronavirus 2 (SARS-CoV-2).\n", "z24dqh0y\tAnalysis of a SARS-CoV-2-Infected Individual Reveals Development of Potent Neutralizing Antibodies with Limited Somatic Mutation\n", "z5q82rmp\tGlobal efforts on vaccines for COVID-19: Since, sooner or later, we all will catch the coronavirus\n", "z5uhrta5\tIdentification of a Noncanonical Signal for Transcription of a Novel Subgenomic mRNA of Mouse Hepatitis Virus: Implication for the Mechanism of Coronavirus RNA Transcription\n", "zalk5ul7\tAre genetic vaccines the right weapon against Covid-19?\n", "zi1l5883\tVaccines against Coronaviruses: The State of the Art.\n", "zteyfpv9\tCurrent pharmacological treatments for SARS-COV-2: A narrative review\n", "zv4nbz9p\tEmerging Technologies for Use in the Study, Diagnosis, and Treatment of Patients with COVID-19\n", "zvop8bxh\tAntiviral RNAi therapy: emerging approaches for 
So it's not conducive to show in a Colab setting.
""" entity_type = EntityType(entity_type) # validate & allow strings ds = load(dataset_id) segments = dataset_id.split("/") handler = getattr(ds, f'{entity_type.value}_handler')() parent_ds_id = dataset_id while len(segments) > 1: segments.pop() try: parent_ds = load("/".join(segments)) if parent_ds.has(entity_type.value) and getattr(parent_ds, f'{entity_type.value}_handler')() == handler: parent_ds_id = "/".join(segments) except KeyError: pass # this dataset doesn't exist return parent_ds_id def docs_parent_id(dataset_id: str) -> str: return parent_id(dataset_id, EntityType.docs) corpus_id = docs_parent_id # legacy def queries_parent_id(dataset_id: str) -> str: return parent_id(dataset_id, EntityType.queries) def qrels_parent_id(dataset_id: str) -> str: return parent_id(dataset_id, EntityType.qrels) def scoreddocs_parent_id(dataset_id: str) -> str: return parent_id(dataset_id, EntityType.scoreddocs) def docpairs_parent_id(dataset_id: str) -> str: return parent_id(dataset_id, EntityType.docpairs) def qlogs_parent_id(dataset_id: str) -> str: return parent_id(dataset_id, EntityType.qlogs) def create_dataset(docs_tsv=None, queries_tsv=None, qrels_trec=None): LocalDownload = util.LocalDownload TsvDocs = formats.TsvDocs TsvQueries = formats.TsvQueries TrecQrels = formats.TrecQrels components = [] if docs_tsv is not None: components.append(TsvDocs(LocalDownload(docs_tsv))) if queries_tsv is not None: components.append(TsvQueries(LocalDownload(queries_tsv))) if qrels_trec is not None: components.append(TrecQrels(LocalDownload(qrels_trec), {})) return datasets.base.Dataset(*components) def main(args): import sys if len(args) < 1 or args[0] not in commands.COMMANDS: cmds = ','.join(commands.COMMANDS.keys()) sys.stderr.write(f'Usage: ir_datasets {{{cmds}}} ...\n') sys.exit(1) commands.COMMANDS[args[0]](args[1:]) def main_cli(): import sys main(sys.argv[1:]) __version__ = "0.5.11" ================================================ FILE: ir_datasets/__main__.py 
def process(args):
    """Build a gzip-state checkpoint file for one C4 source file.

    args: a (source_file, output_file) tuple. Reads source_file through
    zlib_state's GzipStateFile and, every 1500 lines (including line 0),
    records a (compressed position, decompressor state, decompressed offset)
    triple so readers can later resume decompression mid-file. The resulting
    list is pickled into an LZ4-framed output_file. Returns source_file so the
    caller can show progress.
    """
    lz4 = ir_datasets.lazy_libs.lz4_frame()
    source_file, output_file = args
    checkpoint_data = []
    with ir_datasets.lazy_libs.zlib_state().GzipStateFile(str(source_file), keep_last_state=True) as f, _logger.pbar_raw(desc='building checkpoint') as pbar:
        idx = 0
        while not f.eof():
            if idx % 1500 == 0:
                # snapshot where the most recent zlib state was captured and how
                # far into the decompressed stream that state corresponds to
                state, pos = f.last_state, f.last_state_pos
                offset = f.output_pos - f.last_state_output_pos
                checkpoint_data.append((pos, state, offset))
            f.readline()
            idx += 1
            pbar.update(1)
    with lz4.frame.LZ4FrameFile(output_file, mode='a', block_linked=True, compression_level=lz4.frame.COMPRESSIONLEVEL_MAX, auto_flush=True) as fout:
        pickle.dump(checkpoint_data, fout)
    return source_file


def main(args):
    """CLI entry point: builds checkpoint files for every *.json.gz under source_dir.

    With --sources_file, also writes a gzip'd JSON manifest (one entry per
    source file: download URL, md5, size, document count, checkpoint frequency).
    --skip_last omits the final (possibly still-growing) file.
    """
    parser = argparse.ArgumentParser(prog='ir_datasets build_c4_checkpoints', description='Builds gzip checkpoint files for C4 documents.')
    parser.add_argument('source_dir')
    parser.add_argument('output_dir')
    parser.add_argument('--skip_last', action='store_true')
    parser.add_argument('--sources_file')
    args = parser.parse_args(args)
    source_dir = Path(args.source_dir)
    output_dir = Path(args.output_dir)
    all_source_files = sorted(source_dir.rglob('*.json.gz'))
    if args.sources_file:
        sources = []
        for file in _logger.pbar(all_source_files, desc='building sources file', unit='file'):
            try:
                # document count == number of decompressed jsonl lines
                with gzip.open(file, 'rb') as f:
                    count = sum(1 for _ in f)
                # md5 of the *compressed* file, hashed in chunks so large files
                # aren't pulled into memory (and the handle is properly closed)
                h = hashlib.md5()
                with open(file, 'rb') as fraw:
                    for chunk in iter(lambda: fraw.read(1 << 20), b''):
                        h.update(chunk)
                md5 = h.hexdigest().lower()
                size = os.path.getsize(file)
                sources.append({
                    "name": f"en.noclean.{file.name}",
                    "url": f"https://huggingface.co/datasets/allenai/c4/resolve/main/en.noclean/{file.name}",
                    "expected_md5": md5,
                    "size_hint": size,
                    "checkpoint_freq": 1500,
                    "doc_count": count,
                })
            except Exception as ex:
                # best-effort: report the problem file and keep going
                print(file, ex)
        with gzip.open(args.sources_file + '.gz', 'wt') as f:
            json.dump(sources, f)
    all_source_files = [f.relative_to(source_dir) for f in all_source_files]
    if args.skip_last:
        all_source_files = all_source_files[:-1]
    process_args = [(source_dir/f, output_dir/f'{f}.chk.pkl.lz4') for f in all_source_files]
    # skip files whose checkpoint already exists (resumable)
    process_args = [a for a in process_args if not a[1].exists()]
    with _logger.pbar_raw(total=len(process_args), unit='file') as pbar:
        for src in map(process, process_args):
            pbar.update(1)
            pbar.set_postfix(file=str(src)[-20:])


if __name__ == '__main__':
    main(sys.argv[1:])
@contextmanager
def tmp_environ(**overrides):
    """Temporarily set environment variables for the duration of the block.

    Each keyword argument becomes an os.environ entry on entry; on exit the
    previous value is restored (or the variable removed if it did not exist).
    """
    saved = {name: os.environ.get(name) for name in overrides}
    for name, value in overrides.items():
        os.environ[name] = value
    try:
        yield
    finally:
        for name, previous in saved.items():
            if previous is None:
                # variable did not exist before: remove it again
                del os.environ[name]
            else:
                os.environ[name] = previous
def main(args):
    """CLI entry point: pre-populates the download cache for every entry in
    ir_datasets/etc/downloads.json (expects to be run from the repo root)."""
    arg_parser = argparse.ArgumentParser(prog='ir_datasets build_download_cache', description='Builds a cache of downloadable content')
    arg_parser.add_argument('--dir', default=f'{ir_datasets.util.home_path()}/downloads')
    arg_parser.add_argument('--retries', default='10')
    options = arg_parser.parse_args(args)
    with open('ir_datasets/etc/downloads.json') as fin:
        downloads = json.load(fin)
    # the retry count is passed to the downloader via an environment variable
    with tmp_environ(IR_DATASETS_DL_TRIES=options.retries):
        _build_cache(downloads, options.dir)
def _fmt_size(nbytes, human):
    """Format a byte count: human-readable (red when over 1GB) or raw digits."""
    if not human:
        return str(nbytes)
    fmt = ir_datasets.util.format_file_size(nbytes)
    if nbytes > 1_000_000_000:  # sizes over 1GB: list in red
        fmt = f'{RED}{fmt}{RES}'
    return fmt


def clean(dataset, yes=False, list=False, human=True):
    """Remove re-creatable/re-downloadable files for a top-level dataset.

    dataset: top-level dataset ID (also the directory name under the home path)
    yes: skip the interactive confirmation prompt
    list: only print "<size>\\t<n> files\\t<dataset>" and return without deleting
        (NOTE: parameter shadows the builtin `list`; name kept for compatibility)
    human: show human-readable sizes rather than raw byte counts
    """
    base_path = os.path.join(ir_datasets.util.home_path()/dataset)
    dlc = DownloadConfig.context(dataset, base_path)
    skips = []
    for dl_item in dlc.contents().values():
        if 'instructions' in dl_item and 'cache_path' in dl_item:
            # non-downloadable item: cannot be restored automatically, so keep it
            skips.append(os.path.join(base_path, dl_item['cache_path']))
    size, files = walk_path(base_path, skips)
    files_fmt = f'{len(files)} files'
    size_fmt = _fmt_size(size, human)
    if list:
        if size > 0:
            print(f'{size_fmt}\t{files_fmt}\t{dataset}')
        return
    if not yes:
        inp = None
        while inp not in ('y', 'yes'):
            inp = input(f'clean up {size_fmt} from {dataset} ({files_fmt})?\n[y(es) / n(o) / l(ist files)] ').lower()
            if inp in ('l', 'list', 'list files'):
                for file in files:
                    f_size = os.path.getsize(file)
                    # bug fix: previously printed the dataset TOTAL (str(size))
                    # for every file when human=False; now uses the file's size
                    print(f'{_fmt_size(f_size, human)}\t{file}')
            if inp in ('n', 'no'):
                return
    # remove identified files
    for file in files:
        os.remove(file)
    # remove any directories left empty by the deletions
    for dirpath, dirnames, filenames in os.walk(base_path, topdown=False):
        if not dirnames and not filenames:
            os.rmdir(dirpath)
parser.add_argument('-H', action='store_false', help='output raw sizes, rather than human-readable versions') args = parser.parse_args(args) try: if args.datasets: top_level_datasets = {d for d in ir_datasets.registry._registered if '/' not in d} for dataset in args.datasets: if dataset not in top_level_datasets: print(f'Skipping unknown dataset {dataset}') else: clean(dataset, args.yes, list=args.list, human=args.H) elif args.list: for dataset in ir_datasets.registry._registered: if '/' not in dataset: clean(dataset, list=True, human=args.H) else: sys.stderr.write('ERROR: Please provide either --list, dataset IDs to clean, or --help for more details\n') except KeyboardInterrupt: pass if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: ir_datasets/commands/doc_fifos.py ================================================ import sys import os import select import tempfile import contextlib import json import argparse import multiprocessing from collections import deque import ir_datasets _logger = ir_datasets.log.easy() def main(args): parser = argparse.ArgumentParser(prog='ir_datasets doc_fifos', description='Starts a process that exports documents in parallel to several named pipes as json. This is useful as inputs to indexers like Anserini.') parser.add_argument('dataset') parser.add_argument('--count', type=int, default=max(multiprocessing.cpu_count() - 1, 1)) parser.add_argument('--fields', nargs='+') parser.add_argument('--dir') args = parser.parse_args(args) dataset = ir_datasets.load(args.dataset) try: dataset = ir_datasets.load(args.dataset) except KeyError: sys.stderr.write(f"Dataset {args.dataset} not found.\n") sys.exit(1) if not dataset.has_docs(): sys.stderr.write(f"Dataset {args.dataset} does not have docs.\n") sys.exit(1) docs_cls = dataset.docs_cls() field_idxs = [] if args.fields: for field in args.fields: if field not in docs_cls._fields: sys.stderr.write(f"Field {field} not found ind {args.dataset}. 
Available fields: {docs_cls._fields}\n") sys.exit(1) field_idxs.append(docs_cls._fields.index(field)) else: if len(docs_cls._fields) == 2: # there's only one field, silently use it field_idxs.append(1) else: # more than 1 field, let the user know everything is used. sys.stderr.write(f"Exporting all fields as document content: {docs_cls._fields[1:]}. Use --fields to specify fields.\n") field_idxs = list(range(1, len(docs_cls._fields))) with contextlib.ExitStack() as stack: if args.dir is not None: d = args.dir else: d = stack.enter_context(tempfile.TemporaryDirectory()) fifos = [] for i in range(args.count): fifo = os.path.join(d, f'{i}.json') os.mkfifo(fifo) fifos.append(fifo) docs_iter = dataset.docs_iter() docs_iter = _logger.pbar(docs_iter, total=dataset.docs_count(), unit='doc') print(f'Ready at {d}') print(f'To index with Anserini, run:\nIndexCollection -collection JsonCollection -input {d} -threads {args.count} -index ') fifos = [stack.enter_context(open(f, 'wt')) for f in fifos] ready = None for doc in docs_iter: if not ready: # first or no more ready _, ready, _ = select.select([], fifos, []) ready = deque(ready) fifo = ready.popleft() doc = {'id': doc.doc_id, 'contents': '\n'.join(str(doc[i]) for i in field_idxs)} json.dump(doc, fifo) fifo.write('\n') if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: ir_datasets/commands/export.py ================================================ import sys import json import argparse import ir_datasets _logger = ir_datasets.log.easy() def main_docs(dataset, args): assert hasattr(dataset, 'docs_handler'), f"{args.dataset} does not provide docs" exporter = DEFAULT_EXPORTERS[args.format] exporter = exporter(dataset.docs_cls(), args.out, args.fields) for doc in dataset.docs_iter(): exporter.next(doc) exporter.flush() def main_queries(dataset, args): assert hasattr(dataset, 'queries_handler'), f"{args.dataset} does not provide queries" exporter = DEFAULT_EXPORTERS[args.format] 
exporter = exporter(dataset.queries_cls(), args.out, args.fields) for query in dataset.queries_iter(): exporter.next(query) exporter.flush() def main_qrels(dataset, args): assert hasattr(dataset, 'qrels_handler'), f"{args.dataset} does not provide qrels" exporter = QRELS_EXPORTERS[args.format] exporter = exporter(dataset.qrels_cls(), args.out, args.fields) for qrel in dataset.qrels_iter(): exporter.next(qrel) exporter.flush() def main_scoreddocs(dataset, args): assert hasattr(dataset, 'scoreddocs_handler'), f"{args.dataset} does not provide scoreddocs" exporter = SCOREDDOCS_EXPORTERS[args.format] exporter = exporter(dataset.scoreddocs_cls(), args.out, args.fields) if hasattr(exporter, 'runtag'): exporter.runtag = args.runtag for scoreddoc in dataset.scoreddocs_iter(): exporter.next(scoreddoc) exporter.flush() def main_docpairs(dataset, args): assert hasattr(dataset, 'docpairs_handler'), f"{args.dataset} does not provide docpairs" exporter = DEFAULT_EXPORTERS[args.format] exporter = exporter(dataset.docpairs_cls(), args.out, args.fields) for query in dataset.docpairs_iter(): exporter.next(query) exporter.flush() class TsvExporter: def __init__(self, data_cls, out, fields=None): self.data_cls = data_cls self.out = out if fields is None: fields = data_cls._fields if len(fields) > 2: # This message is only really needed if there's more than 2 fields _logger.info(f'No fields supplied. 
Using all fields: {fields}') field_conflicts = [f for f in fields if data_cls.__annotations__[f] not in (str, int, float)] if len(field_conflicts) == 1: # special case: if there's only one Tuple[X, ...], we can export unambiguously with variable number of columns if is_tuple_elip(data_cls.__annotations__[field_conflicts[0]]): _logger.info(f'Exporting variable number of columns for {field_conflicts[0]}') field_conflicts = [] if len(field_conflicts) > 0: fields = [f for f in fields if f not in field_conflicts] field_conflicts = ', '.join([repr((f, data_cls.__annotations__[f])) for f in field_conflicts]) _logger.info(f'Skipping the following fields due to unsupported data types: {field_conflicts}') self.idxs = [] for field in fields: assert field in data_cls._fields self.idxs.append(data_cls._fields.index(field)) def next(self, record): output = [] for idx in self.idxs: if isinstance(record[idx], (list, tuple)): for sub_rec in record[idx]: if hasattr(sub_rec, '_fields'): for value in sub_rec: output.append(str(value).replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')) else: output.append(str(sub_rec).replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')) elif hasattr(record[idx], '_fields'): for value in record[idx]: output.append(str(value).replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')) else: output.append(str(record[idx]).replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')) self.out.write('\t'.join(output) + '\n') def flush(self): pass class JsonlExporter: def __init__(self, data_cls, out, fields=None): self.data_cls = data_cls self.out = out fields = fields or data_cls._fields if fields is None: fields = data_cls._fields if len(fields) > 2: # This message is only really needed if there's more than 2 fields _logger.info(f'No fields supplied. 
Using all fields: {fields}') field_conflicts = [f for f in fields if data_cls.__annotations__[f] not in (str, int, float) and not is_tuple_elip(data_cls.__annotations__[f])] if len(field_conflicts) > 0: fields = [f for f in fields if f not in field_conflicts] field_conflicts = ', '.join([repr((f, data_cls.__annotations__[f])) for f in field_conflicts]) _logger.info(f'Skipping the following fields due to unsupported data types: {field_conflicts}') self.fields = fields self.idxs = [] for field in self.fields: assert field in data_cls._fields self.idxs.append(data_cls._fields.index(field)) def next(self, record): json.dump({f: self.encode(record[i]) for f, i in zip(self.fields, self.idxs)}, self.out) self.out.write('\n') def encode(self, value): if isinstance(value, (list, tuple)): return [self.encode(v) for v in value] if hasattr(value, '_fields'): return {k: self.encode(v) for k, v in value._asdict()} return value def flush(self): pass def is_tuple_elip(annotation): if hasattr(annotation, '_name') and annotation._name == 'Tuple' and len(annotation.__args__) == 2 and annotation.__args__[1] is Ellipsis: if annotation.__args__[0] in (str, int, float) or (hasattr(annotation.__args__[0], '_fields') and all(f in (str, int, float) for f in annotation.__args__[0].__annotations__.values())): return True return False class TrecQrelsExporter: def __init__(self, data_cls, out, fields=None): self.data_cls = data_cls self.out = out assert 'query_id' in data_cls._fields, f"unsupported dataset cls {data_cls} (missing query_id)" assert 'doc_id' in data_cls._fields, f"unsupported dataset cls {data_cls} (missing doc_id)" self.has_iteration = 'iteration' in data_cls._fields if fields is None: remaining_fields = set(data_cls._fields) - {'query_id', 'doc_id', 'iteration'} fields = sorted(remaining_fields, key=lambda f: data_cls._fields.index(f)) if fields != ['relevance']: _logger.info(f'exporting fields {fields}') self.rel_field_idxs = [] for field in fields: assert field in 
data_cls._fields, f"missing field {repr(field)}; choose --fields from {data_cls._fields}" self.rel_field_idxs.append(data_cls._fields.index(field)) if len(self.rel_field_idxs) > 1: _logger.info(f'exporting multiple relevance fields; may not work with some evaluation scripts. Specify fields with --fields') def next(self, record): rel_fields = ' '.join(str(record[i]) for i in self.rel_field_idxs) self.out.write(f'{record.query_id} {record.iteration if self.has_iteration else "0"} {record.doc_id} {rel_fields}\n') def flush(self): pass class TrecRunExporter: def __init__(self, data_cls, out, fields=None): self.data_cls = data_cls self.out = out assert fields is None, "fields not supported for TREC Run exporter" self.query_id = None self.query_scores = [] self.runtag = 'run' def next(self, record): if record.query_id != self.query_id: self.flush() query_id = record.query_id self.query_scores.append(record) def flush(self): for i, scoreddoc in enumerate(sorted(self.query_scores, key=lambda x: (-x.score, x.doc_id))): self.out.write(f'{scoreddoc.query_id} Q0 {scoreddoc.doc_id} {i} {scoreddoc.score} {self.runtag}\n') self.query_scores = [] DEFAULT_EXPORTERS = { 'tsv': TsvExporter, 'jsonl': JsonlExporter, } QRELS_EXPORTERS = {**DEFAULT_EXPORTERS, 'trec': TrecQrelsExporter} SCOREDDOCS_EXPORTERS = {**DEFAULT_EXPORTERS, 'trec': TrecRunExporter} def main(args): parser = argparse.ArgumentParser(prog='ir_datasets export', description='Exports documents, queries, qrels, and scoreddocs in various formats.') parser.add_argument('dataset') parser.set_defaults(out=sys.stdout) subparsers = parser.add_subparsers(dest='data') subparsers.required = True subparser = subparsers.add_parser('docs') subparser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv') subparser.add_argument('--fields', nargs='+') subparser.set_defaults(fn=main_docs) subparser = subparsers.add_parser('queries') subparser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv') 
subparser.add_argument('--fields', nargs='+') subparser.set_defaults(fn=main_queries) subparser = subparsers.add_parser('qrels') subparser.add_argument('--format', choices=QRELS_EXPORTERS.keys(), default='trec') subparser.add_argument('--fields', nargs='+') subparser.set_defaults(fn=main_qrels) subparser = subparsers.add_parser('scoreddocs') subparser.add_argument('--format', choices=SCOREDDOCS_EXPORTERS.keys(), default='trec') subparser.add_argument('--fields', nargs='+') subparser.add_argument('--runtag', default='run') subparser.set_defaults(fn=main_scoreddocs) subparser = subparsers.add_parser('docpairs') subparser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv') subparser.add_argument('--fields', nargs='+') subparser.set_defaults(fn=main_docpairs) args = parser.parse_args(args) dataset = ir_datasets.load(args.dataset) try: dataset = ir_datasets.load(args.dataset) except KeyError: sys.stderr.write(f"Dataset {args.dataset} not found.\n") sys.exit(1) try: args.fn(dataset, args) except BrokenPipeError: sys.stderr.close() except KeyboardInterrupt: sys.stderr.close() except AssertionError as e: if str(e): sys.stderr.write(str(e) + '\n') else: raise if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: ir_datasets/commands/generate_metadata.py ================================================ import time import sys import os import json import argparse from pathlib import Path from fnmatch import fnmatch import ir_datasets from ir_datasets.util import DownloadConfig _logger = ir_datasets.log.easy() def dataset2metadata(args): dsid, data = args try: dataset = ir_datasets.load(dsid) except KeyError: return dsid, None try: for e in ir_datasets.EntityType: if dataset.has(e): if e.value not in data: parent_id = getattr(ir_datasets, f'{e.value}_parent_id')(dsid) if parent_id != dsid: data[e.value] = {'_ref': parent_id} else: with _logger.duration(f'{dsid} {e.value}'): data[e.value] = getattr(dataset, 
f'{e.value}_calc_metadata')() _logger.info(f'{dsid} {e.value}: {data[e.value]}') except Exception as ex: _logger.info(f'{dsid} {e.value} [error]: {ex}') return dsid, None return dsid, data def write_metadata_file(data, file): with file.open('wt') as f: # partially-formatted data; one dataset per line f.write('{\n') for i, key in enumerate(sorted(data.keys())): if i != 0: f.write(',\n') f.write(f' "{key}": {json.dumps(data[key])}') f.write('\n}\n') def main(args): parser = argparse.ArgumentParser(prog='ir_datasets generate_metadata', description='Generates metadata for the specified datasets') parser.add_argument('--file', help='output file', type=Path, default=Path('ir_datasets/etc/metadata.json')) parser.add_argument('--datasets', nargs='+', help='dataset IDs for which to compute metadata. If omitted, generates for all datasets present in the registry (skipping patterns)') args = parser.parse_args(args) if args.file.is_file(): with args.file.open('rb') as f: data = json.load(f) else: data = {} if args.datasets: def _ds_iter(): for dsid in args.datasets: yield dsid, data.get(dsid, {}) import multiprocessing with multiprocessing.Pool(10) as pool: for dsid, dataset_metadata in _logger.pbar(pool.imap_unordered(dataset2metadata, _ds_iter()), desc='datasets', total=len(args.datasets)): if dataset_metadata is not None: data[dsid] = dataset_metadata write_metadata_file(data, args.file) else: for dsid in ir_datasets.registry._registered: dataset = ir_datasets.load(dsid) brk = False try: _, dataset_metadata = dataset2metadata((dsid, data.get(dsid, {}))) if dataset_metadata is not None: data[dsid] = dataset_metadata except KeyboardInterrupt: _logger.info(f'KeyboardInterrupt; skipping. 
ctrl+c within 0.5sec to stop compute_metadata.') try: time.sleep(0.5) except KeyboardInterrupt: brk = True break write_metadata_file(data, args.file) if brk: break if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: ir_datasets/commands/list.py ================================================ import sys import argparse import ir_datasets from ir_datasets.commands.export import DEFAULT_EXPORTERS _logger = ir_datasets.log.easy() def main(args): parser = argparse.ArgumentParser(prog='ir_datasets list', description='Lists available datasets.') parser.set_defaults(out=sys.stdout) args = parser.parse_args(args) for dataset in sorted(ir_datasets.registry): args.out.write(f'{dataset}\n') if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: ir_datasets/commands/lookup.py ================================================ import sys import argparse import ir_datasets from ir_datasets.commands.export import DEFAULT_EXPORTERS _logger = ir_datasets.log.easy() def qid_lookup(dataset, args): assert hasattr(dataset, 'queries_handler') exporter = DEFAULT_EXPORTERS[args.format] exporter = exporter(dataset.queries_cls(), args.out, args.fields) store = dataset.queries_store() for qid in args.ids: try: query = store.get(qid) exporter.next(query) except KeyError: _logger.warn(f'query_id {qid} not found') def did_lookup(dataset, args): assert hasattr(dataset, 'docs_handler') exporter = DEFAULT_EXPORTERS[args.format] exporter = exporter(dataset.docs_cls(), args.out, args.fields) store = dataset.docs_store() for did in args.ids: try: doc = store.get(did) exporter.next(doc) except KeyError: _logger.warn(f'doc_id {did} not found') def main(args): parser = argparse.ArgumentParser(prog='ir_datasets lookup', description='Provides fast lookups of documents and queries ' 'using docs_store. 
Unlike using the exporter and grep (or similar), this tool builds ' 'an index for O(log(n)) lookups.') parser.add_argument('dataset') parser.set_defaults(out=sys.stdout) parser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv') parser.add_argument('--fields', nargs='+') parser.add_argument('--qid', '--query_id', '-q', action='store_true') parser.add_argument('ids', nargs='+') args = parser.parse_args(args) try: dataset = ir_datasets.load(args.dataset) except KeyError: sys.stderr.write(f"Dataset {args.dataset} not found.\n") sys.exit(1) if args.qid: qid_lookup(dataset, args) else: did_lookup(dataset, args) if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: ir_datasets/datasets/__init__.py ================================================ from . import base from . import antique from . import aol_ia from . import aquaint from . import argsme from . import beir from . import c4 from . import car from . import clinicaltrials from . import clirmatrix from . import clueweb09 from . import clueweb12 from . import codec from . import cord19 from . import cranfield from . import csl from . import disks45 from . import dpr_w100 from . import codesearchnet from . import gov from . import gov2 from . import highwire from . import istella22 from . import kilt from . import lotte from . import medline from . import miracl from . import mmarco from . import mr_tydi from . import msmarco_document from . import msmarco_document_v2 from . import msmarco_passage from . import msmarco_passage_v2 from . import msmarco_qna from . import nano_beir from . import neumarco from . import nfcorpus from . import natural_questions from . import nyt from . import pmc from . import touche_image from . import touche # must be after argsme,clueweb12,touche_image from . import trec_arabic from . import trec_mandarin from . import trec_spanish from . import trec_robust04 from . import trec_tot from . import tripclick from . 
import tweets2013_ia from . import vaswani from . import wapo from . import wikiclir from . import wikir from . import trec_fair from . import trec_cast # must be after wapo,car,msmarco_passage from . import hc4 from . import neuclir # must be after hc4 from . import sara from . import trec_tot_2025 ================================================ FILE: ir_datasets/datasets/antique.py ================================================ import io import ir_datasets from ir_datasets.formats import TsvDocs, TrecQrels, TsvQueries from ir_datasets.util import DownloadConfig, Lazy from .base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation __all__ = ['collection', 'subsets'] _logger = ir_datasets.log.easy() NAME = 'antique' DUA = ("Please confirm you agree to the authors' data usage agreement found at " "") # Qrel defs taken verbatim from QREL_DEFS = { 4: "It looks reasonable and convincing. Its quality is on parwith or better than the " "\"Possibly Correct Answer\". Note that it does not have to provide the same answer " "as the \"PossiblyCorrect Answer\".", 3: "It can be an answer to the question, however, it is notsufficiently convincing. " "There should be an answer with much better quality for the question.", 2: "It does not answer the question or if it does, it provides anunreasonable answer, " "however, it is not out of context. 
Therefore, you cannot accept it as an answer to " "the question.", 1: "It is completely out of context or does not make any sense.", } VALIDATION_QIDS = {'1158088', '4032777', '1583099', '263783', '4237144', '1097878', '114758', '1211877', '1188438', '2689609', '1191621', '2571912', '1471877', '2961191', '2630860', '4092472', '3178012', '358253', '3913653', '844617', '2764765', '212427', '220575', '11706', '4069320', '3280274', '3159749', '4217473', '4042061', '1037897', '103298', '332662', '752633', '2704', '3635284', '2235825', '3651236', '2155390', '3752394', '2008456', '98438', '511835', '1647624', '3884772', '1536937', '544869', '66151', '2678635', '963523', '1881436', '993601', '3608433', '2048278', '3124162', '1907320', '1970273', '2891885', '2858043', '189364', '397709', '3470651', '3885753', '1933929', '94629', '2500918', '1708787', '2492366', '17665', '278043', '643630', '1727343', '196651', '3731489', '2910592', '1144768', '2573745', '546552', '1341602', '317469', '2735795', '1251077', '3507499', '3374970', '1034050', '1246269', '2901754', '2137263', '1295284', '2180502', '406082', '1443637', '2620488', '3118286', '3814583', '3738877', '684633', '2094435', '242701', '2613648', '2942624', '1495234', '1440810', '2421078', '961127', '595342', '363519', '4048305', '485408', '2573803', '3104841', '3626847', '727663', '3961', '4287367', '2112535', '913424', '1514356', '1512776', '937635', '1321784', '1582044', '1467322', '461995', '884643', '4338583', '2550445', '4165672', '1016750', '1184520', '3152714', '3617468', '3172166', '4031702', '2534994', '2035638', '404359', '1398838', '4183127', '2418824', '2439070', '2632334', '4262151', '3841762', '4400543', '2147417', '514804', '1423289', '2041828', '2776069', '1458676', '3407617', '1450678', '1978816', '2466898', '1607303', '2175167', '772988', '1289770', '3382182', '3690922', '1051346', '344029', '2357505', '1907847', '2587810', '3272207', '2522067', '1107012', '554539', '489705', '3652886', '4287894', 
'4387641', '1727879', '348777', '566364', '2678484', '4450252', '986260', '4336509', '3824106', '2169746', '2700836', '3495304', '3083719', '126182', '1607924', '1485589', '3211282', '2546730', '2897078', '3556937', '2113006', '929821', '2306533', '2543919', '1639607', '3958214', '2677193', '763189'} def _init(): documentation = YamlDocumentation('docs/antique.yaml') base_path = ir_datasets.util.home_path() / NAME dlc = DownloadConfig.context(NAME, base_path, dua=DUA) collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME)) subsets = {} for subset in ('train', 'test'): qrels = TrecQrels(dlc[f'{subset}/qrels'], QREL_DEFS) queries = TsvQueries(dlc[f'{subset}/queries'], namespace=NAME, lang='en') subsets[subset] = Dataset(collection, queries, qrels) # Split the training data into training and validation data validation_qids = Lazy(lambda: VALIDATION_QIDS) subsets['train/split200-train'] = Dataset( FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='exclude'), FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='exclude'), subsets['train']) subsets['train/split200-valid'] = Dataset( FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='include'), FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='include'), subsets['train']) # Separate test set removing the "offensive (and noisy)" questions disallow_list = dlc['disallow_list'] def disllow_qids(): with disallow_list.stream() as stream: stream = io.TextIOWrapper(stream) return {l.rstrip() for l in stream} disllow_qids = Lazy(disllow_qids) subsets['test/non-offensive'] = Dataset( FilteredQueries(subsets['test'].queries_handler(), disllow_qids, mode='exclude'), FilteredQrels(subsets['test'].qrels_handler(), disllow_qids, mode='exclude'), subsets['test']) ir_datasets.registry.register(NAME, Dataset(collection, documentation('_'))) for s in sorted(subsets): 
ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s))) return collection, subsets collection, subsets = _init() ================================================ FILE: ir_datasets/datasets/aol_ia.py ================================================ from datetime import datetime import json import pickle import re import contextlib from collections import Counter from hashlib import md5 import ir_datasets from typing import NamedTuple, Tuple from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, BaseQlogs from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS, DocstoreOptions, PickleLz4FullStore _logger = ir_datasets.log.easy() NAME = 'aol-ia' QREL_DEFS = { 1: 'clicked', } QID_LEN = 14 DID_LEN = 12 class LogItem(NamedTuple): doc_id: str rank: int clicked: bool class AolQlog(NamedTuple): user_id: str query_id: str query: str query_orig: str time: datetime items: Tuple[LogItem, ...] 
class AolIaDoc(NamedTuple): doc_id: str title: str text: str url: str ia_url: str def default_text(self): """ title and text """ return f'{self.title} {self.text}' class AolQlogs(BaseQlogs): def __init__(self, dlc): self.dlc = dlc def qlogs_iter(self): LZ4FrameFile = ir_datasets.lazy_libs.lz4_frame().frame.LZ4FrameFile with self.dlc.stream() as fin, \ LZ4FrameFile(fin) as fin: try: while True: yield pickle.load(fin) except EOFError: pass def qlogs_cls(self): return AolQlog def qlogs_count(self): return 36_389_567 class _ManagedDlc: def __init__(self, manager, path): self._manager = manager self._path = path @contextlib.contextmanager def stream(self): self._manager.build() with open(self._path, 'rb') as f: yield f def path(self, force=True): if force: self._manager.build() return self._path class AolManager: def __init__(self, log_dlcs, id2wb_dlc, base_path): self._log_dlcs = log_dlcs self.id2wb_dlc = id2wb_dlc # exposed for aolia-tools self._docs_store = None self._base_path = base_path self._logs_built = None if not self._base_path.exists(): self._base_path.mkdir(exist_ok=True, parents=True) def docs_store(self, options=DEFAULT_DOCSTORE_OPTIONS): self._build_docs() return self._internal_docs_store(options) def _internal_docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS): if self._docs_store is None: self._docs_store = PickleLz4FullStore(self._base_path/'docs.pklz4', None, AolIaDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME), options=options) return self._docs_store def _build_docs(self): if self._internal_docs_store().built(): return if not (self._base_path/'downloaded_docs'/'_done').exists(): raise RuntimeError('''To use the documents of AOLIA, you will need to run the download script in https://github.com/terrierteam/aolia-tools. 
To run the script, use the following commands: git clone https://github.com/terrierteam/aolia-tools cd aolia-tools pip install -r requirements.txt python downloader.py ''') LZ4FrameFile = ir_datasets.lazy_libs.lz4_frame().frame.LZ4FrameFile with _logger.pbar_raw(desc='', total=1525535) as pbar, self._internal_docs_store().lookup.transaction() as transaction: for file in sorted((self._base_path/'downloaded_docs').glob('*.jsonl.lz4')): pbar.set_postfix({'file': file.name}) docs = [] with LZ4FrameFile(file, 'rb') as fin: for line in fin: doc = json.loads(line) docs.append(AolIaDoc(doc['doc_id'], doc['title'], doc['text'], doc['url'], doc['wb_url'])) pbar.update() for doc in sorted(docs, key=lambda x: x.doc_id): # sort the documents in each file before adding them to the docstore. This ensures a consistent ordering. transaction.add(doc) def build(self): if self._logs_built is None: self._logs_built = (self._base_path/'_built_logs').exists() if self._logs_built: return # already built # sessionizer = Sessionizer() lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame encountered_qids = set() with finialized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \ finialized_file(self._base_path/'qrels', 'wt') as f_qrels, \ finialized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \ lz4_frame.LZ4FrameFile(f_log, 'wb') as f_log, \ _logger.pbar_raw(desc=f'preparing {NAME} log lines', total=36389567) as pbar: for dlc in self._log_dlcs: with dlc.stream() as fin: assert next(fin) == b'AnonID\tQuery\tQueryTime\tItemRank\tClickURL\n' # skip header for line in fin: pbar.update() cols = line.decode().rstrip('\n').split('\t') if tuple(cols[3:]) == ('', ''): user_id, query, query_time, _, _ = cols rank, url = None, None else: user_id, query, query_time, rank, url = cols norm_query = ' '.join(ir_datasets.util.ws_tok(query)) query_id = md5(norm_query.encode()).hexdigest()[:QID_LEN] if query_id not in encountered_qids: f_queries.write(f'{query_id}\t{norm_query}\n') 
encountered_qids.add(query_id) log_items = [] if url is not None: doc_id = md5(url.encode()).hexdigest()[:DID_LEN] f_qrels.write(f'{query_id}\t{user_id}\t{doc_id}\t1\n') log_items.append(LogItem(doc_id, rank, True)) log_record = AolQlog(user_id, query_id, norm_query, query, datetime.fromisoformat(query_time), tuple(log_items)) pickle.dump(log_record, f_log) (self._base_path/'_built_logs').touch() self._logs_built = True def file_ref(self, path): return _ManagedDlc(self, self._base_path/path) def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') manager = AolManager([ GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-01.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-02.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-03.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-04.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-05.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-06.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-07.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-08.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-09.txt.gz')), GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-10.txt.gz')), ], GzipExtract(dlc['id2wb']), base_path) base = Dataset( DocstoreBackedDocs(manager.docs_store, docs_cls=AolIaDoc, namespace=NAME, lang=None), TsvQueries(manager.file_ref('queries.tsv'), lang=None), TrecQrels(manager.file_ref('qrels'), QREL_DEFS), AolQlogs(manager.file_ref('log.pkl.lz4')), documentation('_')) 
ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets, manager, base_path # Be sure to keep MANAGER and PATH here; they are used by aolia-tools base, subsets, MANAGER, PATH = _init() ================================================ FILE: ir_datasets/datasets/aquaint.py ================================================ import ir_datasets from ir_datasets.util import DownloadConfig from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries from ir_datasets.datasets.base import Dataset, YamlDocumentation NAME = 'aquaint' QREL_DEFS = { 2: 'highly relevant', 1: 'relevant', 0: 'not relevant', } QTYPE_MAP = { ' *(Number:)?': 'query_id', ' *(Topic:)?': 'title', '<desc> *(Description:)?': 'description', '<narr> *(Narrative:)?': 'narrative' } def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') collection = TrecDocs(dlc['docs'], encoding='utf8', path_globs=['aquaint_comp/apw/*/*.gz', 'aquaint_comp/nyt/*/*.gz', 'aquaint_comp/xie/*/*.gz'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME)) base = Dataset(collection, documentation('_')) subsets['trec-robust-2005'] = Dataset( TrecQueries(dlc['trec-robust-2005/queries'], qtype_map=QTYPE_MAP, namespace='trec-robust', lang='en'), TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS), collection, documentation('trec-robust-2005')) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/argsme.py ================================================ from itertools import chain from typing import Dict from ir_datasets import registry from ir_datasets.datasets.base import Dataset, YamlDocumentation from 
ir_datasets.formats import ArgsMeDocs, ArgsMeProcessedDocs, ArgsMeCombinedDocs from ir_datasets.util import DownloadConfig, home_path, Cache, ZipExtract, TarExtract NAME = "argsme" SUBSETS = { '1.0': (387692, "en", "args-me.json"), '1.0-cleaned': (382545, "en", "args-me-1.0-cleaned.json"), '2020-04-01/debateorg': (338620, "en", "debateorg.json"), '2020-04-01/debatepedia': (21197, "en", "debatepedia.json"), '2020-04-01/debatewise': (14353, "en", "debatewise.json"), '2020-04-01/idebate': (13522, "en", "idebate.json"), '2020-04-01/parliamentary': (48, "en", "parliamentary.json"), } PROCESSED_SUBSETS = { '2020-04-01/processed': (365408, "en", "args_processed.csv"), } COMBINED_SUBSETS = { '2020-04-01': ( [ '2020-04-01/debateorg', '2020-04-01/debatepedia', '2020-04-01/debatewise', '2020-04-01/idebate', '2020-04-01/parliamentary' ], 387740, "en" ), } def _init(): base_path = home_path() / NAME documentation = YamlDocumentation(f"docs/{NAME}.yaml") download_config = DownloadConfig.context(NAME, base_path) base = Dataset(documentation('_')) # Arguments that can be loaded from Zenodo. arguments: Dict[str, ArgsMeDocs] = { name: ArgsMeDocs( Cache( ZipExtract( download_config[name], zip_path ), base_path / f"{name}.json" ), namespace=f"{NAME}/{name}", language=language, count_hint=count_hint ) for name, (count_hint, language, zip_path) in SUBSETS.items() } # Processed arguments that can be loaded from Zenodo. processed_arguments: Dict[str, ArgsMeProcessedDocs] = { name: ArgsMeProcessedDocs( Cache( TarExtract( download_config[name], zip_path ), base_path / f"{name}.json" ), namespace=f"{NAME}/{name}", language=language, count_hint=count_hint ) for name, (count_hint, language, zip_path) in PROCESSED_SUBSETS.items() } # Arguments that are combined versions of other subsets. 
combined_arguments: Dict[str, ArgsMeCombinedDocs] = { name: ArgsMeCombinedDocs( base_path / f"{name}.json", [arguments[subset_name] for subset_name in subset_names], namespace=f"{NAME}/{name}", language=language, count_hint=count_hint ) for name, (subset_names, count_hint, language) in COMBINED_SUBSETS.items() } # Wrap in datasets with documentation. datasets = { name: Dataset( arguments, documentation(name) ) for name, arguments in chain( arguments.items(), processed_arguments.items(), combined_arguments.items(), ) } # NOTE: the following datasets are defined in touche.py: # - argsme/1.0/touche-2020-task-1/uncorrected # - argsme/2020-04-01/touche-2020-task-1 # - argsme/2020-04-01/touche-2020-task-1/uncorrected # - argsme/2020-04-01/touche-2021-task-1 # - argsme/2020-04-01/processed/touche-2022-task-1 # Register datasets. registry.register(NAME, base) for name, arguments in datasets.items(): registry.register(f'{NAME}/{name}', arguments) return base, datasets dataset = _init() ================================================ FILE: ir_datasets/datasets/base.py ================================================ import pkgutil import contextlib import itertools from pathlib import Path import ir_datasets from ir_datasets.formats import BaseQueries, BaseQrels, BaseScoredDocs, BaseDocPairs _logger = ir_datasets.log.easy() class Dataset: def __init__(self, *constituents): self._constituents = [c for c in constituents if c is not None] self._beta_apis = {} def __getstate__(self): return self._constituents def __setstate__(self, state): self._constituents = state def __getattr__(self, attr): if attr == 'docs' and self.has_docs(): if 'docs' not in self._beta_apis: self._beta_apis['docs'] = _BetaPythonApiDocs(self) return self._beta_apis['docs'] if attr == 'queries' and self.has_queries(): if 'queries' not in self._beta_apis: self._beta_apis['queries'] = _BetaPythonApiQueries(self) return self._beta_apis['queries'] if attr == 'qrels' and self.has_qrels(): if 'qrels' not in 
self._beta_apis: self._beta_apis['qrels'] = _BetaPythonApiQrels(self) return self._beta_apis['qrels'] if attr == 'scoreddocs' and self.has_scoreddocs(): if 'scoreddocs' not in self._beta_apis: self._beta_apis['scoreddocs'] = _BetaPythonApiScoreddocs(self) return self._beta_apis['scoreddocs'] if attr == 'docpairs' and self.has_docpairs(): if 'docpairs' not in self._beta_apis: self._beta_apis['docpairs'] = _BetaPythonApiDocpairs(self) return self._beta_apis['docpairs'] if attr == 'qlogs' and self.has_qlogs(): if 'qlogs' not in self._beta_apis: self._beta_apis['qlogs'] = _BetaPythonApiQlogs(self) return self._beta_apis['qlogs'] for cons in self._constituents: if hasattr(cons, attr): return getattr(cons, attr) raise AttributeError(attr) def __repr__(self): supplies = [] if self.has_docs(): supplies.append('docs') if self.has_queries(): supplies.append('queries') if self.has_qrels(): supplies.append('qrels') if self.has_scoreddocs(): supplies.append('scoreddocs') if self.has_docpairs(): supplies.append('docpairs') if self.has_qlogs(): supplies.append('qlogs') if hasattr(self, 'dataset_id'): return f'Dataset(id={repr(self.dataset_id())}, provides={repr(supplies)})' else: return f'Dataset(provides={repr(supplies)})' def __dir__(self): result = set(dir(super())) for cons in self._constituents: result |= set(dir(cons)) return list(result) def has(self, etype: ir_datasets.EntityType) -> bool: etype = ir_datasets.EntityType(etype) # validate & allow strings return hasattr(self, f'{etype.value}_handler') def has_docs(self): return self.has(ir_datasets.EntityType.docs) def has_queries(self): return self.has(ir_datasets.EntityType.queries) def has_qrels(self): return self.has(ir_datasets.EntityType.qrels) def has_scoreddocs(self): return self.has(ir_datasets.EntityType.scoreddocs) def has_docpairs(self): return self.has(ir_datasets.EntityType.docpairs) def has_qlogs(self): return self.has(ir_datasets.EntityType.qlogs) class _BetaPythonApiDocs: def __init__(self, handler): 
self._handler = handler self._docstore = None self.type = handler.docs_cls() self.lang = handler.docs_lang() def __iter__(self): return self._handler.docs_iter() def __len__(self): return self._handler.docs_count() def __getitem__(self, key): return self._handler.docs_iter()[key] def __repr__(self): return f'BetaPythonApiDocs({repr(self._handler)})' def lookup(self, doc_ids): if self._docstore is None: self._docstore = self._handler.docs_store() if isinstance(doc_ids, str): return self._docstore.get(doc_ids) return self._docstore.get_many(doc_ids) def lookup_iter(self, doc_ids): if self._docstore is None: self._docstore = self._handler.docs_store() if isinstance(doc_ids, str): yield self._docstore.get(doc_ids) else: yield from self._docstore.get_many_iter(doc_ids) @property def metadata(self): return self._handler.docs_metadata() class _BetaPythonApiQueries: def __init__(self, handler): self._handler = handler self._query_lookup = None self.type = handler.queries_cls() self.lang = handler.queries_lang() def __iter__(self): return self._handler.queries_iter() def __repr__(self): return f'BetaPythonApiQueries({repr(self._handler)})' def __len__(self): result = None if hasattr(self._handler, 'queries_count'): result = self._handler.queries_count() if result is None: if self._query_lookup is None: self._query_lookup = {q.query_id: q for q in self._handler.queries_iter()} result = len(self._query_lookup) return result def lookup(self, query_ids): if self._query_lookup is None: self._query_lookup = {q.query_id: q for q in self._handler.queries_iter()} if isinstance(query_ids, str): return self._query_lookup[query_ids] return {qid: self._query_lookup[qid] for qid in query_ids if qid in self._query_lookup} def lookup_iter(self, query_ids): if self._query_lookup is None: self._query_lookup = {q.query_id: q for q in self._handler.queries_iter()} if isinstance(query_ids, str): yield self._query_lookup[query_ids] else: for qid in query_ids: if qid in self._query_lookup: yield 
self._query_lookup[qid] @property def metadata(self): return self._handler.queries_metadata() class _BetaPythonApiQrels: def __init__(self, handler): self._handler = handler self.type = handler.qrels_cls() self.defs = handler.qrels_defs() self._qrels_dict = None def __iter__(self): return self._handler.qrels_iter() def __repr__(self): return f'BetaPythonApiQrels({repr(self._handler)})' def asdict(self): if self._qrels_dict is None: self._qrels_dict = self._handler.qrels_dict() return self._qrels_dict def __len__(self): result = None if hasattr(self._handler, 'qrels_count'): result = self._handler.qrels_count() if result is None: if self._qrels_dict is None: self._qrels_dict = self._handler.qrels_dict() result = sum(len(x) for x in self._qrels_dict.values()) return result @property def metadata(self): return self._handler.qrels_metadata() class _BetaPythonApiScoreddocs: def __init__(self, handler): self._handler = handler self.type = handler.scoreddocs_cls() def __iter__(self): return self._handler.scoreddocs_iter() def __repr__(self): return f'BetaPythonApiScoreddocs({repr(self._handler)})' def __len__(self): result = None if hasattr(self._handler, 'scoreddocs_count'): result = self._handler.scoreddocs_count() if result is None: result = sum(1 for _ in self._handler.scoreddocs_iter()) return result @property def metadata(self): return self._handler.scoreddocs_metadata() class _BetaPythonApiDocpairs: def __init__(self, handler): self._handler = handler self.type = handler.docpairs_cls() def __iter__(self): return self._handler.docpairs_iter() def __repr__(self): return f'BetaPythonApiDocpairs({repr(self._handler)})' def __len__(self): result = None if hasattr(self._handler, 'docpairs_count'): result = self._handler.docpairs_count() if result is None: result = sum(1 for _ in self._handler.docpairs_iter()) return result @property def metadata(self): return self._handler.docpairs_metadata() class _BetaPythonApiQlogs: def __init__(self, handler): self._handler = handler 
self.type = handler.qlogs_cls() def __iter__(self): return self._handler.qlogs_iter() def __repr__(self): return f'BetaPythonApiQlogs({repr(self._handler)})' def __len__(self): result = None if hasattr(self._handler, 'qlogs_count'): result = self._handler.qlogs_count() if result is None: result = sum(1 for _ in self._handler.qlogs_iter()) return result @property def metadata(self): return self._handler.qlogs_metadata() class FilteredQueries(BaseQueries): def __init__(self, queries_handler, lazy_qids, mode='include'): self._queries_handler = queries_handler self._lazy_qids = lazy_qids self._mode = mode def queries_iter(self): qids = self._lazy_qids() operator = { 'include': (lambda x: x.query_id in qids), 'exclude': (lambda x: x.query_id not in qids), }[self._mode] for query in self._queries_handler.queries_iter(): if operator(query): yield query def queries_cls(self): return self._queries_handler.queries_cls() def queries_handler(self): return self def queries_lang(self): return self._queries_handler.queries_lang() class FilteredQrels(BaseQrels): def __init__(self, qrels_handler, lazy_qids, mode='include'): self._qrels_handler = qrels_handler self._lazy_qids = lazy_qids self._mode = mode def qrels_iter(self): qids = self._lazy_qids() operator = { 'include': (lambda x: x.query_id in qids), 'exclude': (lambda x: x.query_id not in qids), }[self._mode] for query in self._qrels_handler.qrels_iter(): if operator(query): yield query def qrels_defs(self): return self._qrels_handler.qrels_defs() def qrels_handler(self): return self class FilteredScoredDocs(BaseScoredDocs): def __init__(self, scoreddocs_handler, lazy_qids, mode='include'): self._scoreddocs_handler = scoreddocs_handler self._lazy_qids = lazy_qids self._mode = mode def scoreddocs_iter(self): qids = self._lazy_qids() operator = { 'include': (lambda x: x.query_id in qids), 'exclude': (lambda x: x.query_id not in qids), }[self._mode] for query in self._scoreddocs_handler.scoreddocs_iter(): if operator(query): 
yield query def scoreddocs_handler(self): return self class FilteredDocPairs(BaseDocPairs): def __init__(self, docpairs_handler, lazy_qids, mode='include'): self._docpairs_handler = docpairs_handler self._lazy_qids = lazy_qids self._mode = mode def docpairs_iter(self): qids = self._lazy_qids() operator = { 'include': (lambda x: x.query_id in qids), 'exclude': (lambda x: x.query_id not in qids), }[self._mode] for query in self._docpairs_handler.docpairs_iter(): if operator(query): yield query def docpairs_handler(self): return self class YamlDocumentation: def __init__(self, file): self._file = file self._contents = None def __call__(self, key): return YamlDocumentationProvider(self, key) def get_key(self, key): if not self._contents: yaml = ir_datasets.lazy_libs.yaml() data = pkgutil.get_data('ir_datasets', self._file) self._contents = yaml.load(data, Loader=yaml.BaseLoader) # only strings return self._contents.get(key) class YamlDocumentationProvider: def __init__(self, documentation, key): self._documentation = documentation self._key = key def documentation(self): docs = self._documentation.get_key(self._key) if self._documentation.get_key(self._key): return dict(docs.items()) return {} class Deprecated: def __init__(self, message): self._message = message def deprecated(self): return self._message class ExpectedFile: def __init__(self, path, expected_md5=None, instructions=None): self._path = Path(path) self._expected_md5 = expected_md5 self._instructions = instructions def path(self, force=True): if force and not self._path.exists(): self._path.parent.mkdir(parents=True, exist_ok=True) inst = '\n\n' + self._instructions.format(path=self._path) if self._instructions else '' raise IOError(f"{self._path} does not exist.{inst}") return self._path @contextlib.contextmanager def stream(self): with self.path().open('rb') as result: if self._expected_md5: result = ir_datasets.util.HashStream(result, expected=self._expected_md5, algo='md5') yield result class 
Concat(Dataset): def __getattr__(self, attr): if attr.endswith('_iter'): iters = [] for ds in self._constituents: if hasattr(ds, attr): iters.append(getattr(ds, attr)()) if iters: return lambda: itertools.chain(*iters) return super().__getattr__(attr) ================================================ FILE: ir_datasets/datasets/beir.py ================================================ import json import codecs from typing import NamedTuple, Dict, List import ir_datasets from ir_datasets.util import ZipExtract, Cache, Lazy, Migrator from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries from ir_datasets.formats import BaseQueries, BaseDocs, BaseQrels, GenericDoc, GenericQuery, TrecQrel from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS _logger = ir_datasets.log.easy() NAME = 'beir' class BeirDoc(NamedTuple): doc_id: str text: str title: str metadata: Dict[str, str] def default_text(self): """ title text """ return f'{self.title} {self.text}' class BeirTitleDoc(NamedTuple): doc_id: str text: str title: str def default_text(self): """ title text """ return f'{self.title} {self.text}' class BeirTitleUrlDoc(NamedTuple): doc_id: str text: str title: str url: str def default_text(self): """ title text """ return f'{self.title} {self.text}' class BeirSciDoc(NamedTuple): doc_id: str text: str title: str authors: List[str] year: int cited_by: List[str] references: List[str] def default_text(self): """ title text """ return f'{self.title} {self.text}' class BeirCordDoc(NamedTuple): doc_id: str text: str title: str url: str pubmed_id: str def default_text(self): """ title text """ return f'{self.title} {self.text}' class BeirToucheDoc(NamedTuple): doc_id: str text: str title: str stance: str url: str def default_text(self): """ title text """ return f'{self.title} {self.text}' class BeirCqaDoc(NamedTuple): doc_id: str text: str title: str tags: List[str] def default_text(self): """ title text """ return f'{self.title} 
{self.text}' class BeirUrlQuery(NamedTuple): query_id: str text: str url: str def default_text(self): """ text """ return self.text class BeirSciQuery(NamedTuple): query_id: str text: str authors: List[str] year: int cited_by: List[str] references: List[str] def default_text(self): """ text """ return self.text class BeirToucheQuery(NamedTuple): query_id: str text: str description: str narrative: str def default_text(self): """ text """ return self.text class BeirCovidQuery(NamedTuple): query_id: str text: str query: str narrative: str def default_text(self): """ text """ return self.text class BeirCqaQuery(NamedTuple): query_id: str text: str tags: List[str] def default_text(self): """ text """ return self.text def _map_field(field, data): if field in ('doc_id', 'query_id'): return data['_id'] if field == 'text': return data['text'] if field == 'title': return data['title'] else: return data['metadata'][field] class BeirDocs(BaseDocs): def __init__(self, name, dlc, doc_type): super().__init__() self._name = name self._dlc = dlc self._doc_type = doc_type def docs_iter(self): return iter(self.docs_store()) def _docs_iter(self): with self._dlc.stream() as stream: for line in stream: data = json.loads(line) yield self._doc_type(*(_map_field(f, data) for f in self._doc_type._fields)) def docs_cls(self): return self._doc_type def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f'{ir_datasets.util.home_path()/NAME/self._name}/docs.pklz4', init_iter_fn=self._docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], count_hint=ir_datasets.util.count_hint(f'{NAME}/{self._name}'), options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return f'{NAME}/{self._name}' def docs_lang(self): return 'en' class BeirQueries(BaseQueries): def __init__(self, name, dlc, query_type): super().__init__() self._name = name self._dlc = dlc 
self._query_type = query_type def queries_iter(self): with self._dlc.stream() as stream: for line in stream: data = json.loads(line) yield self._query_type(*(_map_field(f, data) for f in self._query_type._fields)) def queries_cls(self): return self._query_type def queries_namespace(self): return f'{NAME}/{self._name}' def queries_lang(self): return 'en' class BeirQrels(BaseQrels): def __init__(self, qrels_dlc, qrels_defs): self._qrels_dlc = qrels_dlc self._qrels_defs = qrels_defs def qrels_path(self): return self._qrels_dlc.path() def qrels_iter(self): with self._qrels_dlc.stream() as f: f = codecs.getreader('utf8')(f) it = iter(f) assert next(it).strip() == 'query-id\tcorpus-id\tscore' # header row for line in it: if line == '\n': continue # ignore blank lines cols = line.rstrip().split() if len(cols) != 3: raise RuntimeError(f'expected 3 columns, got {len(cols)}') qid, did, score = cols yield TrecQrel(qid, did, int(score), '0') def qrels_cls(self): return TrecQrel def qrels_defs(self): return self._qrels_defs def _init(): base_path = ir_datasets.util.home_path()/NAME dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') base = Dataset(documentation('_')) subsets = {} benchmarks = { 'msmarco': (['train', 'dev', 'test'], GenericDoc, GenericQuery), 'trec-covid': (['test'], BeirCordDoc, BeirCovidQuery), 'nfcorpus': (['train', 'dev', 'test'], BeirTitleUrlDoc, BeirUrlQuery), 'nq': (['test'], BeirTitleDoc, GenericQuery), 'hotpotqa': (['train', 'dev', 'test'], BeirTitleUrlDoc, GenericQuery), 'fiqa': (['train', 'dev', 'test'], GenericDoc, GenericQuery), 'arguana': (['test'], BeirTitleDoc, GenericQuery), 'webis-touche2020': (['test'], BeirToucheDoc, BeirToucheQuery), 'webis-touche2020/v2': (['test'], BeirToucheDoc, BeirToucheQuery), 'quora': (['dev', 'test'], GenericDoc, GenericQuery), 'dbpedia-entity': (['dev', 'test'], BeirTitleUrlDoc, GenericQuery), 'scidocs': (['test'], BeirSciDoc, BeirSciQuery), 'fever': 
(['train', 'dev', 'test'], BeirTitleDoc, GenericQuery), 'climate-fever': (['test'], BeirTitleDoc, GenericQuery), 'scifact': (['train', 'test'], BeirTitleDoc, GenericQuery), } for ds, (qrels, doc_type, query_type) in benchmarks.items(): dlc_ds = dlc[ds] ds_zip = ds.split('/')[0] docs_migrator = Migrator(base_path/ds/'irds_version.txt', 'v2', affected_files=[f'{base_path/ds}/docs.pklz4'], message=f'Migrating {NAME}/{ds} (structuring fields)') docs = docs_migrator(BeirDocs(ds, ZipExtract(dlc_ds, f'{ds_zip}/corpus.jsonl'), doc_type)) queries = BeirQueries(ds, Cache(ZipExtract(dlc_ds, f'{ds_zip}/queries.jsonl'), base_path/ds/'queries.json'), query_type) if len(qrels) == 1: subsets[ds] = Dataset( docs, queries, BeirQrels(Cache(ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrels[0]}.tsv'), base_path/ds/f'{qrels[0]}.qrels'), qrels_defs={}), documentation(ds) ) else: subsets[ds] = Dataset( docs, queries, documentation(ds) ) for qrel in qrels: subset_qrels = BeirQrels(Cache(ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrel}.tsv'), base_path/ds/f'{qrel}.qrels'), qrels_defs={}) subset_qids = qid_filter(subset_qrels) subsets[f'{ds}/{qrel}'] = Dataset( docs, FilteredQueries(queries, subset_qids, mode='include'), subset_qrels, documentation(f'{ds}/{qrel}') ) cqa = ['android', 'english', 'gaming', 'gis', 'mathematica', 'physics', 'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress'] cqa_dlc = dlc['cqadupstack'] for ds in cqa: docs_migrator = Migrator(base_path/'cqadupstack'/ds/'irds_version.txt', 'v2', affected_files=[f'{base_path/"cqadupstack"/ds}/docs.pklz4'], message=f'Migrating {NAME}/cqadupstack/{ds} (structuring fields)') subsets[f'cqadupstack/{ds}'] = Dataset( docs_migrator(BeirDocs(f'cqadupstack/{ds}', ZipExtract(cqa_dlc, f'cqadupstack/{ds}/corpus.jsonl'), BeirCqaDoc)), BeirQueries(f'cqadupstack/{ds}', Cache(ZipExtract(cqa_dlc, f'cqadupstack/{ds}/queries.jsonl'), base_path/'cqadupstack'/ds/'queries.json'), BeirCqaQuery), BeirQrels(Cache(ZipExtract(cqa_dlc, 
f'cqadupstack/{ds}/qrels/test.tsv'), base_path/'cqadupstack'/ds/f'test.qrels'), qrels_defs={}), documentation(f'cqadupstack/{ds}') ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets def qid_filter(subset_qrels): # NOTE: this must be in a separate function otherwise there can be weird lambda binding problems return Lazy(lambda: {q.query_id for q in subset_qrels.qrels_iter()}) base, subsets = _init() ================================================ FILE: ir_datasets/datasets/c4.py ================================================ import re import os import json import pickle from pathlib import Path from typing import NamedTuple, Tuple import ir_datasets from ir_datasets.util import DownloadConfig, Download, RequestsDownload, TarExtractAll, GzipExtract from ir_datasets.formats import BaseDocs, TrecXmlQueries, DocSourceSeekableIter, DocSource, SourceDocIter from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.indices import Docstore, DEFAULT_DOCSTORE_OPTIONS _logger = ir_datasets.log.easy() NAME = 'c4' misinfo_map = {'number': 'query_id', 'query': 'text', 'description': 'description', 'narrative': 'narrative', 'disclaimer': 'disclaimer', 'stance': 'stance', 'evidence': 'evidence'} class C4Doc(NamedTuple): doc_id: str text: str url: str timestamp: str def default_text(self): """ text """ return self.text class MisinfoQuery(NamedTuple): query_id: str text: str description: str narrative: str disclaimer: str stance: str evidence: str def default_text(self): """ text """ return self.text class C4Source(DocSource): def __init__(self, name, dlc, checkpoint_dlc, doc_count, checkpoint_freq, size_hint, cache_path): self.name = name # e.g., en.noclean.c4-train.01234-of-07168 self.dlc = dlc self.checkpoint_dlc = checkpoint_dlc self.doc_count = doc_count self.checkpoint_freq = checkpoint_freq self._checkpoints = None self.size_hint = size_hint 
self.cache_path = cache_path def __len__(self): return self.doc_count def __iter__(self): return C4SourceIter(self) def checkpoints(self): if self._checkpoints is None: chk_file_name = self.dlc.path().split('/')[-1] + '.chk.pkl.lz4' with ir_datasets.lazy_libs.lz4_frame().frame.open(os.path.join(self.checkpoint_dlc.path(), chk_file_name)) as f: self._checkpoints = pickle.load(f) return self._checkpoints class C4SourceIter(DocSourceSeekableIter): def __init__(self, source): self.source = source self.idx = 0 self.source_f = ir_datasets.lazy_libs.zlib_state().GzipStateFile(self.source.dlc.path()) def close(self): if self.source_f is not None: self.source_f.close() self.source_f = None def __next__(self): line = self.source_f.readline() if not line: raise StopIteration() data = json.loads(line) doc_id = f'{self.source.name}.{self.idx}' self.idx += 1 return C4Doc(doc_id, data['text'], data['url'], data['timestamp']) def seek(self, idx): if (idx < self.idx) or \ (idx // self.source.checkpoint_freq != self.idx // self.source.checkpoint_freq) and \ (idx - self.idx > 100): # either we're going backward in the file or the index is in a different # checkpoint than we're at now, so we can jump ahead. # (or we're not jumping very far ahead (<100 documents), so don't bother # loading checkpoints, e.g., this is a case where step is used when iterating # over the documents.) 
target_checkpoint = idx // self.source.checkpoint_freq checkpoints = self.source.checkpoints() effective_checkpoint = min(target_checkpoint, len(checkpoints) - 1) pos, state, offset = checkpoints[effective_checkpoint] self.source_f.zseek(pos, state) self.source_f.read(offset) self.idx = effective_checkpoint * self.source.checkpoint_freq while idx > self.idx: # read the file in sequence 'till we get to the desired index self.source_f.readline() self.idx += 1 class C4Docstore(Docstore): def __init__(self, docs, options=DEFAULT_DOCSTORE_OPTIONS): super().__init__(docs.docs_cls(), 'doc_id', options=options) self.docs = docs def get_many_iter(self, doc_ids): files_to_search = {} for doc_id in doc_ids: match = re.match(r'^en.noclean.c4-train.(\d+)-of-07168.(\d+)$', doc_id) if not match: continue file_idx, doc_idx = match.groups() file_idx, doc_idx = int(file_idx), int(doc_idx) if file_idx not in files_to_search: files_to_search[file_idx] = [] files_to_search[file_idx].append(doc_idx) sources = self.docs._docs_sources() for file_idx, doc_idxs in files_to_search.items(): if file_idx >= len(sources): continue source = sources[file_idx] doc_idxs = sorted(doc_idxs) with iter(source) as it: for doc_idx in doc_idxs: it.seek(doc_idx) res = next(it, StopIteration) if res is not StopIteration: yield res class C4Docs(BaseDocs): def __init__(self, sources_dlc, checkpoint_dlc, base_path, source_name_filter=None, filter_name=''): super().__init__() self._sources_dlc = sources_dlc self._checkpoint_dlc = checkpoint_dlc self._sources = None self._base_path = Path(base_path) self._source_name_filter = source_name_filter self._filter_name = filter_name def docs_iter(self): return SourceDocIter(self, slice(0, self.docs_count(force=True))) def docs_cls(self): return C4Doc def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): assert field == 'doc_id' return C4Docstore(self, options=options) def docs_count(self, force=False): if force or self._sources is not None: return 
sum(s.doc_count for s in self._docs_sources()) def docs_namespace(self): return NAME def docs_lang(self): return 'en' def docs_source_iter(self): return iter(self._docs_sources()) def _docs_sources(self): if self._sources is None: sources = [] with self._sources_dlc.stream() as stream: for source in json.load(stream): if self._source_name_filter: if not re.match(self._source_name_filter, source['name']): continue cache_path = os.path.join(self._base_path, 'en.noclean', source['url'].split('/')[-1]) dlc = Download([RequestsDownload(source['url'])], expected_md5=source['expected_md5'], cache_path=cache_path) sources.append(C4Source(source['name'].replace('.json.gz', ''), dlc, self._checkpoint_dlc, source['doc_count'], source['checkpoint_freq'], source['size_hint'], cache_path)) self._sources = sources build_flag = self._base_path / 'en.noclean' / f'_built{self._filter_name}' if not build_flag.exists(): remaining_size = sum(s.size_hint for s in sources if not os.path.exists(s.cache_path)) if remaining_size > 0: _logger.info(f'Will start downloading c4/en-noclean files ({ir_datasets.util.format_file_size(remaining_size)}). ' f'If you already have a copy, you may link them to {self._base_path / "en.noclean"} (should contain ' f'files like c4-train.00000-of-07168.json.gz)') ir_datasets.util.check_disk_free(self._base_path / 'en.noclean', remaining_size) for source in sources: path = source.dlc.path() # downloads if it doesn't already exist # A quick check that should help make sure it's probably correct if the user downloaded # it themselves. (Not much overhead if downloaded ourselves.) 
true_size = os.path.getsize(path) if true_size != source.size_hint: raise RuntimeError(f'Expected {path} to be {source.size_hint} bytes but it was actually {true_size} bytes.') build_flag.touch() return self._sources def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') en_noclean_tr_collection = C4Docs( GzipExtract(dlc['en-noclean/sources']), TarExtractAll(dlc['en-noclean/checkpoints'], base_path / 'en.noclean.checkpoints'), base_path, source_name_filter=r'en\.noclean\.c4-train', filter_name='train') # exclude validation files (only include train) base = Dataset(documentation('_')) subsets['en-noclean-tr'] = Dataset( en_noclean_tr_collection, documentation('en-noclean-tr')) subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset( en_noclean_tr_collection, TrecXmlQueries(dlc['trec-misinfo-2021/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo', lang='en'), documentation('en-noclean-tr/trec-misinfo-2021')) ir_datasets.registry.register(NAME, base) for subset in subsets: ir_datasets.registry.register(f'{NAME}/{subset}', subsets[subset]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/car.py ================================================ from typing import NamedTuple, Tuple import ir_datasets from ir_datasets.util import DownloadConfig, TarExtract, ReTar from ir_datasets.formats import TrecQrels, BaseDocs, BaseQueries, GenericDoc from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS NAME = 'car' AUTO_QRELS = { 1: 'Paragraph appears under heading' } MANUAL_QRELS = { 3: 'MUST be mentioned', 2: 'SHOULD be mentioned', 1: 'CAN be mentioned', 0: 'Non-relevant, but roughly on TOPIC', -1: 'NO, non-relevant', -2: 'Trash', } class CarQuery(NamedTuple): query_id: str text: str 
class CarQuery(NamedTuple):
    query_id: str
    text: str
    title: str
    headings: Tuple[str, ...]
    def default_text(self):
        """
        text (which is title + headings)
        """
        return self.text


class CarDocs(BaseDocs):
    """Paragraph collection for TREC CAR, read from a cbor file via trec-car-tools."""
    def __init__(self, streamer, count_hint=None):
        super().__init__()
        self._streamer = streamer
        self._count_hint = count_hint

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # trec_car is imported lazily so the dependency is only needed when docs are read
        trec_car = ir_datasets.lazy_libs.trec_car()
        with self._streamer.stream() as stream:
            for paragraph in trec_car.read_data.iter_paragraphs(stream):
                yield GenericDoc(paragraph.para_id, paragraph.get_text())

    def docs_cls(self):
        return GenericDoc

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )

    def docs_count(self):
        # Count is only known once the docstore has been built
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'


class CarQueries(BaseQueries):
    """Hierarchical heading queries for TREC CAR, read from a cbor outlines file."""
    def __init__(self, streamer):
        super().__init__()
        self._streamer = streamer

    def queries_iter(self):
        trec_car = ir_datasets.lazy_libs.trec_car()
        with self._streamer.stream() as stream:
            for page in trec_car.read_data.iter_outlines(stream):
                for heads in page.flat_headings_list():
                    # The query_id is the page id followed by the full heading path
                    qid = '/'.join([page.page_id] + [h.headingId for h in heads])
                    title = page.page_name
                    headings = tuple(h.heading for h in heads)
                    text = ' '.join((title,) + headings)
                    yield CarQuery(qid, text, title, headings)

    def queries_namespace(self):
        return NAME

    def queries_cls(self):
        return CarQuery

    def queries_lang(self):
        return 'en'


def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs_v15 = CarDocs(TarExtract(dlc['docs'], 'paragraphcorpus/paragraphcorpus.cbor', compression='xz'), count_hint=ir_datasets.util.count_hint(f'{NAME}/v1.5'))
    docs_v20 = CarDocs(TarExtract(dlc['docs/v2.0'], 'paragraphCorpus/dedup.articles-paragraphs.cbor', compression='xz'), count_hint=ir_datasets.util.count_hint(f'{NAME}/v2.0'))

    base = Dataset(documentation('_'))

    subsets['v1.5'] = Dataset(docs_v15, documentation('v1.5'))
    subsets['v1.5/trec-y1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['trec-y1/queries'], 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines', compression='xz')),)
    subsets['v1.5/trec-y1/manual'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels'), MANUAL_QRELS))
    subsets['v1.5/trec-y1/auto'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels'), AUTO_QRELS))
    subsets['v1.5/test200'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['test200'], 'test200/train.test200.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(dlc['test200'], 'test200/train.test200.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))

    # Only the outlines & qrels are kept from the (large) train tarball
    train_data = ReTar(dlc['train'], base_path/'train.smaller.tar.xz', ['train/train.fold?.cbor.outlines', 'train/train.fold?.cbor.hierarchical.qrels'], compression='xz')
    # The five cross-validation folds are identical in structure
    for fold in range(5):
        subsets[f'v1.5/train/fold{fold}'] = Dataset(
            docs_v15,
            CarQueries(TarExtract(train_data, f'train/train.fold{fold}.cbor.outlines', compression='xz')),
            TrecQrels(TarExtract(train_data, f'train/train.fold{fold}.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))

    subsets['v2.0'] = Dataset(docs_v20, documentation('v2.0'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return base, subsets


base, subsets = _init()
from . import medline

_logger = ir_datasets.log.easy()

QREL_DEFS = {
    0: 'not relevant',
    1: 'possibly relevant',
    2: 'definitely relevant'
}

QREL_DEFS_2021 = {
    0: 'Not Relevant',
    1: 'Excluded',
    2: 'Eligible',
}

NAME = 'clinicaltrials'

ct_qmap = {'topic': 'text'}


class ClinicalTrialsDoc(NamedTuple):
    doc_id: str
    title: str
    condition: str
    summary: str
    detailed_description: str
    eligibility: str


class ClinicalTrialsDocs(BaseDocs):
    """Clinical trial documents parsed from tgz/zip archives of per-trial XML records."""
    def __init__(self, name, dlcs, compress_format='tgz', count_hint=None):
        self._name = name
        self._dlcs = dlcs
        self._compress_format = compress_format
        self._count_hint = count_hint

    def docs_iter(self):
        return iter(self.docs_store())

    def _docs_iter(self):
        # Iterates every *.xml member of every archive, in archive order
        for dlc in self._dlcs:
            with dlc.stream() as stream, ExitStack() as stack:
                if self._compress_format == 'tgz':
                    tarf = stack.enter_context(tarfile.open(fileobj=stream, mode='r|gz'))
                    tarf_iter = iter(tarf)
                    extract = tarf.extractfile
                    path_attr = 'path'
                elif self._compress_format == 'zip':
                    tarf = stack.enter_context(zipfile.ZipFile(stream))
                    tarf_iter = tarf.filelist
                    extract = tarf.open
                    path_attr = 'filename'
                else:
                    raise ValueError('unknown compress format')
                for record in tarf_iter:
                    if getattr(record, path_attr).endswith('.xml'):
                        xml = extract(record).read()
                        yield self._parse_doc(xml)

    def _parse_doc(self, xml):
        """Parse one clinical-trial XML record into a ClinicalTrialsDoc."""
        xml = ET.fromstring(xml)
        doc_id = ''.join(xml.find('.//nct_id').itertext())
        # BUG FIX: these checks previously truth-tested Elements (e.g. `if not title:`).
        # An Element with no *child* elements is falsy even when it contains text, so
        # official_title was skipped and the `X if X else ''` guards returned '' for
        # text-only fields. Per the ElementTree docs, compare against None instead.
        title = xml.find('.//official_title')
        if title is None:
            title = xml.find('.//brief_title')
        title = ''.join(title.itertext()) if title is not None else ''
        condition = xml.find('.//condition')
        condition = ''.join(condition.itertext()) if condition is not None else ''
        summary = xml.find('.//brief_summary')
        summary = ''.join(summary.itertext()) if summary is not None else ''
        detailed_description = xml.find('.//detailed_description')
        detailed_description = ''.join(detailed_description.itertext()) if detailed_description is not None else ''
        eligibility = xml.find('.//eligibility/criteria')
        eligibility = ''.join(eligibility.itertext()) if eligibility is not None else ''
        return ClinicalTrialsDoc(doc_id, title, condition, summary, detailed_description, eligibility)

    def docs_path(self, force=True):
        return ir_datasets.util.home_path()/NAME/self._name/'corpus'

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )

    def docs_cls(self):
        return ClinicalTrialsDoc

    def docs_namespace(self):
        return NAME

    def docs_count(self):
        # Count is only known once the docstore has been built
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_lang(self):
        return 'en'


def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    base = Dataset(documentation('_'))

    collection17 = ClinicalTrialsDocs('2017', [dlc['docs/2017']], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    collection19 = ClinicalTrialsDocs('2019', [dlc[f'docs/2019/{i}'] for i in range(4)], count_hint=ir_datasets.util.count_hint(f'{NAME}/2019'))
    # The 2021 snapshot ships as zip archives (parts 1..5)
    collection21 = ClinicalTrialsDocs('2021', [dlc[f'docs/2021/{i}'] for i in range(1, 6)], compress_format='zip', count_hint=ir_datasets.util.count_hint(f'{NAME}/2021'))

    subsets['2017'] = Dataset(collection17, documentation('2017'))
    subsets['2019'] = Dataset(collection19, documentation('2019'))
    subsets['2021'] = Dataset(collection21, documentation('2021'))
    # TREC PM 2017/2018 share their queries with the medline datasets
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        medline.subsets['2017/trec-pm-2017'].queries_handler(),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017')
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        medline.subsets['2017/trec-pm-2018'].queries_handler(),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018')
    )
    subsets['2019/trec-pm-2019'] = Dataset(
        collection19,
        TrecXmlQueries(dlc['trec-pm-2019/queries'], qtype=medline.TrecPmQuery, namespace='trec-pm-2019', lang='en'),
        TrecQrels(dlc['trec-pm-2019/qrels'], QREL_DEFS),
        documentation('trec-pm-2019')
    )
    subsets['2021/trec-ct-2021'] = Dataset(
        collection21,
        TrecXmlQueries(dlc['trec-ct-2021/queries'], qtype=GenericQuery, qtype_map=ct_qmap, namespace='trec-ct-2021', lang='en'),
        TrecQrels(dlc['trec-ct-2021/qrels'], QREL_DEFS_2021),
        documentation('trec-ct-2021'))
    subsets['2021/trec-ct-2022'] = Dataset(
        collection21,
        TrecXmlQueries(dlc['trec-ct-2022/queries'], qtype=GenericQuery, qtype_map=ct_qmap, namespace='trec-ct-2022', lang='en'),
        documentation('trec-ct-2022'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
import json
import contextlib
from pathlib import Path
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import GzipExtract, Lz4Extract, DownloadConfig, _DownloadConfig, MetadataProvider, MetadataComponent
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import TsvDocs, CLIRMatrixQueries, CLIRMatrixQrels

NAME = 'clirmatrix'

_logger = ir_datasets.log.easy()

QRELS_DEFS = {
    6: "Most relevant, based on Jenks-optimized BM25 retrieval scores in the source language",
    5: "Jenks-optimized BM25 retrieval scores in the source language",
    4: "Jenks-optimized BM25 retrieval scores in the source language",
    3: "Jenks-optimized BM25 retrieval scores in the source language",
    2: "Jenks-optimized BM25 retrieval scores in the source language",
    1: "Jenks-optimized BM25 retrieval scores in the source language",
    0: "Document not retrieved in the source language",
}


def _init():
    """Register the CLIRMatrix base dataset and the lazy per-language-pair patterns."""
    LANGS = ('af', 'als', 'am', 'an', 'ar', 'arz', 'ast', 'az', 'azb', 'ba', 'bar', 'be', 'bg', 'bn', 'bpy',
             'br', 'bs', 'bug', 'ca', 'cdo', 'ce', 'ceb', 'ckb', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'el',
             'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'he',
             'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ilo', 'io', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
             'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'li', 'lmo', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk',
             'ml', 'mn', 'mr', 'mrj', 'ms', 'my', 'mzn', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc',
             'or', 'os', 'pa', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'ro', 'ru', 'sa', 'sah', 'scn', 'sco',
             'sd', 'sh', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', 'tg',
             'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vec', 'vi', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi',
             'yo', 'zh')
    LANG_REGEX = '(' + '|'.join(LANGS) + ')'
    MULTI8_LANGS = ('ar', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'zh')
    MULTI8_LANG_REGEX = '(' + '|'.join(MULTI8_LANGS) + ')'

    base_path = ir_datasets.util.home_path()/NAME
    base_dlc = DownloadConfig.context(NAME, base_path)

    # The per-pair download config is itself a download; resolve it lazily
    def _dlc_init():
        with GzipExtract(base_dlc['downloads']).stream() as f:
            return _DownloadConfig(contents=json.load(f))
    _dlc = ir_datasets.util.Lazy(_dlc_init)

    metadata = MetadataProvider(MetadataProvider.json_loader(Lz4Extract(base_dlc['metadata'])))

    # One TsvDocs per document language, shared among all query languages
    _docs_cache = {}
    def _docs_initializer(lang_code):
        if lang_code not in _docs_cache:
            dlc = _dlc().context("clirmatrix_docs", base_path)
            _docs_cache[lang_code] = TsvDocs(GzipExtract(dlc[f'docs/{lang_code}']), namespace=f'{NAME}/{lang_code}', lang=lang_code)
        return _docs_cache[lang_code]

    def _initializer(dsid, args, dlc_context=None):
        docs_lang, queries_lang, split = args
        components = [_docs_initializer(docs_lang)]
        if queries_lang:  # queries & split are optional
            dlc = _dlc().context(dlc_context, base_path)
            qrel_dlc = GzipExtract(dlc[f'queries/{queries_lang}_{docs_lang}/{split}'])
            # Queries and qrels come from the same file
            components += [CLIRMatrixQueries(qrel_dlc, queries_lang), CLIRMatrixQrels(qrel_dlc, QRELS_DEFS)]
        result = Dataset(*components)
        return Dataset(MetadataComponent(dsid, result, metadata), result)

    def _multi8_initializer(dsid, args):
        return _initializer(dsid, args, 'clirmatrix_multi8')

    def _bi139_base_initializer(dsid, args):
        return _initializer(dsid, args, 'clirmatrix_bi139_base')

    def _bi139_full_initializer(dsid, args):
        return _initializer(dsid, args, 'clirmatrix_bi139_full')

    def _corpus_initializer(dsid, args):
        return _initializer(dsid, (args[0], None, None))

    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{LANG_REGEX}$', _corpus_initializer)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{MULTI8_LANG_REGEX}/multi8/{MULTI8_LANG_REGEX}/(train|dev|test1|test2)$', _multi8_initializer)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{LANG_REGEX}/bi139-base/{LANG_REGEX}/(train|dev|test1|test2)$', _bi139_base_initializer)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{LANG_REGEX}/bi139-full/{LANG_REGEX}/(train|dev|test1|test2)$', _bi139_full_initializer)
    return base


collection = _init()
import os
import codecs
from pathlib import Path
from typing import NamedTuple, Tuple
from glob import glob
import ir_datasets
from ir_datasets.util import GzipExtract, Lazy, DownloadConfig, TarExtract, Cache, Bz2Extract, ZipExtract, TarExtractAll
from ir_datasets.formats import TrecQrels, TrecDocs, TrecXmlQueries, WarcDocs, GenericDoc, GenericQuery, TrecQrel, TrecSubQrels, TrecSubQrel, TrecSubtopic, TrecPrel, TrecPrels, TrecColonQueries, BaseQrels
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.indices import Docstore, CacheDocstore

NAME = 'clueweb09'

QREL_DEFS = {
    4: 'Nav: This page represents a home page of an entity directly named by the query; the user may be searching for this specific page or site.',
    3: 'Key: This page or site is dedicated to the topic; authoritative and comprehensive, it is worthy of being a top result in a web search engine.',
    2: 'HRel: The content of this page provides substantial information on the topic.',
    1: 'Rel: The content of this page provides some information on the topic, which may be minimal; the relevant information must be on that page, not just promising-looking anchor text pointing to a possibly useful page.',
    0: 'Non: The content of this page does not provide useful information on the topic, but may provide useful information on other topics, including other interpretations of the same query.',
    -2: 'Junk: This page does not appear to be useful for any reasonable purpose; it may be spam or junk',
}

QREL_DEFS_09 = {
    2: 'highly relevant',
    1: 'relevant',
    0: 'not relevant',
}

SQREL_DEFS_09 = {
    1: 'relevant',
    0: 'not relevant'
}


class TrecWebTrackQuery(NamedTuple):
    query_id: str
    query: str
    description: str
    type: str
    subtopics: Tuple[TrecSubtopic, ...]
    def default_text(self):
        """
        query
        """
        return self.query


class ClueWeb09Docs(WarcDocs):
    """WARC document collection for ClueWeb09, optionally restricted to a set of directories."""
    def __init__(self, docs_dlc, chk_dlc, dirs=None, lang=None):
        super().__init__(warc_cw09=True, lang=lang)
        self.docs_dlc = docs_dlc
        self.chk_dlc = chk_dlc
        # All available languages
        self.dirs = dirs or ['ClueWeb09_Arabic_1', 'ClueWeb09_Chinese_1', 'ClueWeb09_Chinese_2', 'ClueWeb09_Chinese_3', 'ClueWeb09_Chinese_4', 'ClueWeb09_English_1', 'ClueWeb09_English_2', 'ClueWeb09_English_3', 'ClueWeb09_English_4', 'ClueWeb09_English_5', 'ClueWeb09_English_6', 'ClueWeb09_English_7', 'ClueWeb09_English_8', 'ClueWeb09_English_9', 'ClueWeb09_English_10', 'ClueWeb09_French_1', 'ClueWeb09_German_1', 'ClueWeb09_Italian_1', 'ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2', 'ClueWeb09_Korean_1', 'ClueWeb09_Portuguese_1', 'ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2']
        self._docs_warc_file_counts_cache = None

    def docs_path(self, force=True):
        return self.docs_dlc.path(force)

    def _docs_iter_source_files(self):
        subdirs = []
        for d in self.dirs:
            subdirs += sorted(glob(os.path.join(self.docs_dlc.path(), d, '*')))
        for subdir in subdirs:
            yield from sorted(glob(os.path.join(subdir, '*.gz')))

    def _docs_id_to_source_file(self, doc_id):
        parts = doc_id.split('-')
        if len(parts) != 4:
            return None
        dataset, sec, part, doc = parts
        if dataset != 'clueweb09':
            return None
        # The language directory is not encoded in the doc_id, so glob for it
        source_glob = os.path.join(self.docs_dlc.path(), f'ClueWeb09_*', sec, f'{part}.warc.gz')
        source_file = glob(source_glob)
        if len(source_file) == 0:
            return None
        if len(source_file) > 1:
            raise ValueError(f'doc_id {doc_id} found in multiple files: {source_file}')
        return source_file[0]

    def _docs_source_file_to_checkpoint(self, source_file):
        source_prefix = Path(self.docs_dlc.path())
        source_file = Path(source_file)
        index_prefix = Path(self.chk_dlc.path())
        result = index_prefix / source_file.relative_to(source_prefix)
        if result == source_file:
            return None
        return f'{result}.chk.lz4'

    def _docs_warc_file_counts(self):
        if self._docs_warc_file_counts_cache is None:
            counts = {}
            for d in self.dirs:
                counts_file = os.path.join(self.docs_dlc.path(), f'record_counts/{d}_counts.txt')
                with open(counts_file, 'rt') as f:
                    for line in f:
                        file, count = line.strip().split()
                        # Fixing bug in record_counts: en0054 is under ClueWeb09_English_4, not _5
                        if d == 'ClueWeb09_English_5' and 'en0054' in file:
                            file = os.path.join(self.docs_dlc.path(), 'ClueWeb09_English_4', file[3:])
                        else:
                            file = os.path.join(self.docs_dlc.path(), d, file[3:])
                        counts[file] = int(count)
            self._docs_warc_file_counts_cache = counts
        return self._docs_warc_file_counts_cache

    def docs_namespace(self):
        return NAME


class CatBQrelFilter(BaseQrels):
    """Restricts qrels to the Category B portion (first English segment + Wikipedia)."""
    def __init__(self, qrels_handler):
        self._qrels_handler = qrels_handler

    def qrels_iter(self):
        catb_segs = {'en0000','en0001','en0002','en0003','en0004','en0005','en0006','en0007','en0008','en0009','en0010','en0011','enwp00','enwp01','enwp02','enwp03'}
        for qrel in self._qrels_handler.qrels_iter():
            _, seg_id, _, _ = qrel.doc_id.split('-')
            if seg_id in catb_segs:
                yield qrel

    def qrels_defs(self):
        return self._qrels_handler.qrels_defs()

    def qrels_cls(self):
        return self._qrels_handler.qrels_cls()

    def qrels_path(self):
        return self._qrels_handler.qrels_path()


def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')

    # Per-language sub-collections share the same source download & checkpoints
    lang_dirs = {
        'ar': ['ClueWeb09_Arabic_1'],
        'zh': [f'ClueWeb09_Chinese_{i}' for i in range(1, 5)],
        'en': [f'ClueWeb09_English_{i}' for i in range(1, 11)],
        'fr': ['ClueWeb09_French_1'],
        'de': ['ClueWeb09_German_1'],
        'it': ['ClueWeb09_Italian_1'],
        'ja': ['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'],
        'ko': ['ClueWeb09_Korean_1'],
        'pt': ['ClueWeb09_Portuguese_1'],
        'es': ['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'],
    }
    collection = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None)  # multiple langs
    lang_collections = {lang: ClueWeb09Docs(docs_dlc, chk_dlc, dirs=dirs, lang=lang) for lang, dirs in lang_dirs.items()}
    collection_en = lang_collections['en']
    collection_catb = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1'], lang='en')

    base = Dataset(collection, documentation('_'))
    for lang in lang_dirs:
        subsets[lang] = Dataset(lang_collections[lang], documentation(lang))
    subsets['catb'] = Dataset(collection_catb, documentation('catb'))

    def _queries(year):
        return TrecXmlQueries(dlc[f'trec-web-{year}/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en')

    # Web Track subsets over the full English collection and the CatB subset.
    # 2009 differs from 2010-2012: adhoc judgments are prels (gz) and diversity
    # judgments are binary subtopic judgments.
    for prefix, coll, wrap in (('en', collection_en, lambda q: q), ('catb', collection_catb, CatBQrelFilter)):
        subsets[f'{prefix}/trec-web-2009'] = Dataset(
            coll, _queries('2009'),
            wrap(TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09)),
            documentation('trec-web-2009'))
        # NOTE: Contains positive (1) and negative (0) judgements at subtopic level
        subsets[f'{prefix}/trec-web-2009/diversity'] = Dataset(
            coll, _queries('2009'),
            wrap(TrecSubQrels(GzipExtract(dlc['trec-web-2009/qrels.all']), SQREL_DEFS_09)),
            documentation('trec-web-2009'))
        for year in ('2010', '2011', '2012'):
            subsets[f'{prefix}/trec-web-{year}'] = Dataset(
                coll, _queries(year),
                wrap(TrecQrels(dlc[f'trec-web-{year}/qrels.adhoc'], QREL_DEFS)),
                documentation(f'trec-web-{year}'))
            subsets[f'{prefix}/trec-web-{year}/diversity'] = Dataset(
                coll, _queries(year),
                wrap(TrecSubQrels(dlc[f'trec-web-{year}/qrels.all'], QREL_DEFS)),
                documentation(f'trec-web-{year}'))

    subsets['trec-mq-2009'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2009/queries']), encoding='latin1', lang='en'),
        TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09),
        documentation('trec-mq-2009'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
import codecs
import io
import os
import gzip
import contextlib
from typing import NamedTuple, Tuple
from glob import glob
from pathlib import Path
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, TarExtractAll, Cache, Bz2Extract, ZipExtract, IterStream
from ir_datasets.formats import TrecQrels, TrecSubQrels, TrecDocs, TrecXmlQueries, WarcDocs, GenericDoc, GenericQuery, TrecQrel, TrecSubQrel, NtcirQrels, TrecSubtopic
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.indices import Docstore, CacheDocstore

_logger = ir_datasets.log.easy()

NAME = 'clueweb12'

QREL_DEFS = {
    4: 'Nav: This page represents a home page of an entity directly named by the query; the user may be searching for this specific page or site.',
    3: 'Key: This page or site is dedicated to the topic; authoritative and comprehensive, it is worthy of being a top result in a web search engine.',
    2: 'HRel: The content of this page provides substantial information on the topic.',
    1: 'Rel: The content of this page provides some information on the topic, which may be minimal; the relevant information must be on that page, not just promising-looking anchor text pointing to a possibly useful page.',
    0: 'Non: The content of this page does not provide useful information on the topic, but may provide useful information on other topics, including other interpretations of the same query.',
    -2: 'Junk: This page does not appear to be useful for any reasonable purpose; it may be spam or junk',
}

NTCIR_QREL_DEFS = {
    0: 'Two annotators rated as non-relevant',
    1: 'One annotator rated as relevant, one as non-relevant',
    2: 'Two annotators rated as relevant, OR one rates as highly relevant and one as non-relevant',
    3: 'One annotator rated as highly relevant, one as relevant',
    4: 'Two annotators rated as highly relevant',
}

MISINFO_QREL_DEFS = {
    0: 'Not relevant',
    1: 'Relevant',
    2: 'Highly relevant',
}

EHEALTH_QREL_DEFS = {
    0: 'Not relevant',
    1: 'Somewhat relevant',
    2: 'Highly relevant',
}

# XML attribute/tag -> query field mappings
ntcir_map = {'qid': 'query_id', 'content': 'title', 'description': 'description'}
misinfo_map = {'number': 'query_id', 'query': 'title', 'cochranedoi': 'cochranedoi', 'description': 'description', 'narrative': 'narrative'}
ehealth_map = {'id': 'query_id', 'title': 'text'}


class TrecWebTrackQuery(NamedTuple):
    query_id: str
    query: str
    description: str
    type: str
    subtopics: Tuple[TrecSubtopic, ...]
    def default_text(self):
        """
        query
        """
        return self.query


class NtcirQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    def default_text(self):
        """
        title
        """
        return self.title


class MisinfoQuery(NamedTuple):
    query_id: str
    title: str
    cochranedoi: str
    description: str
    narrative: str
    def default_text(self):
        """
        title
        """
        return self.title


class MisinfoQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    effectiveness: int
    # NOTE(review): field name is likely a typo for 'credibility' (qrels_iter reads a
    # 'cred' column into it). Kept as-is: renaming would break callers using this field.
    redibility: int


class EhealthQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    trustworthiness: int
    understandability: int
    iteration: str


# NOTE(review): class name is likely a typo for 'MisinfoQrels'; kept for compatibility.
class MsinfoQrels(TrecQrels):
    """Qrels with relevance, effectiveness, and credibility columns (TREC Misinfo 2019)."""
    def qrels_iter(self):
        with self._qrels_dlc.stream() as f:
            f = codecs.getreader('utf8')(f)
            for line in f:
                if line == '\n':
                    continue  # ignore blank lines
                cols = line.rstrip().split()
                if len(cols) != 6:
                    raise RuntimeError(f'expected 6 columns, got {len(cols)}')
                qid, it, did, rel, eff, cred = cols
                yield MisinfoQrel(qid, did, int(rel), int(eff), int(cred))

    def qrels_cls(self):
        return MisinfoQrel


class EhealthQrels(TrecQrels):
    """Merges parallel relevance, trustworthiness, and understandability qrels files."""
    def __init__(self, qrels_dlcs, qtrust_dlcs, qunder_dlcs, qrels_defs, query_id_suffix=''):
        super().__init__(None, qrels_defs)
        self._qrels_dlcs = qrels_dlcs
        self._qtrust_dlcs = qtrust_dlcs
        self._qunder_dlcs = qunder_dlcs
        self._query_id_suffix = query_id_suffix

    def qrels_iter(self):
        for i, (qrel_dlc, qtrust_dlc, qunder_dlc) in enumerate(zip(self._qrels_dlcs, self._qtrust_dlcs, self._qunder_dlcs)):
            with qrel_dlc.stream() as frel, \
                 qtrust_dlc.stream() as ftrust, \
                 qunder_dlc.stream() as funder:
                frel = codecs.getreader('utf8')(frel)
                ftrust = codecs.getreader('utf8')(ftrust)
                funder = codecs.getreader('utf8')(funder)
                # The three files are line-aligned; verify qid/did agreement per line
                for lrel, ltrust, lunder in zip(frel, ftrust, funder):
                    cols_rel = lrel.rstrip().split()
                    cols_trust = ltrust.rstrip().split()
                    cols_under = lunder.rstrip().split()
                    assert len(cols_rel) == 4 and len(cols_trust) == 4 and len(cols_under) == 4
                    assert cols_rel[0] == cols_trust[0] and cols_trust[0] == cols_under[0]  # qid
                    assert cols_rel[2] == cols_trust[2] and cols_trust[2] == cols_under[2]  # did
                    qid, did = cols_rel[0], cols_rel[2]
                    yield EhealthQrel(qid + self._query_id_suffix, did, int(cols_rel[3]), int(cols_trust[3]), int(cols_under[3]), str(i))

    def qrels_cls(self):
        return EhealthQrel
class FixAmp:
    """Streamer wrapper that XML-escapes bare ampersands in the wrapped byte stream.

    The CLEF eHealth queries file contains raw ' & ' sequences, which are invalid
    in XML; this rewrites them on the fly so TrecXmlQueries can parse the file.
    """
    def __init__(self, streamer):
        self._streamer = streamer

    def stream(self):
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)

    def __iter__(self):
        with self._streamer.stream() as stream:
            for line in stream:
                # BUG FIX: was `line.replace(b' & ', b' & ')` -- both arguments
                # identical, making the replacement a no-op. The purpose of this
                # class is to escape the ampersand as the XML entity '&amp;'.
                yield line.replace(b' & ', b' &amp; ')
class ClueWeb12Docs(WarcDocs):
    """WARC document collection for ClueWeb12 (full or b13 subset)."""
    def __init__(self, docs_dlc, chk_dlc=None):
        super().__init__(lang='en')  # all CW12 are english
        self.docs_dlc = docs_dlc
        self.chk_dlc = chk_dlc
        self._docs_warc_file_counts_cache = None

    def docs_path(self, force=True):
        return self.docs_dlc.path(force)

    def _docs_iter_source_files(self):
        for segment_dir in sorted(glob(os.path.join(self.docs_dlc.path(), 'ClueWeb12_*', '*'))):
            yield from sorted(glob(os.path.join(segment_dir, '*.gz')))

    def _docs_id_to_source_file(self, doc_id):
        parts = doc_id.split('-')
        if len(parts) != 4:
            return None
        dataset, sec, part, doc = parts
        if dataset != 'clueweb12':
            return None
        # Unlike CW09, the directory is derivable from the doc_id (first 2 chars of sec)
        return os.path.join(self.docs_dlc.path(), f'ClueWeb12_{sec[:2]}', sec, f'{sec}-{part}.warc.gz')

    def _docs_source_file_to_checkpoint(self, source_file):
        if self.chk_dlc is None:
            return None
        source_prefix = Path(self.docs_dlc.path())
        source_file = Path(source_file)
        index_prefix = Path(self.chk_dlc.path())
        result = index_prefix / source_file.relative_to(source_prefix)
        if result == source_file:
            return None
        return f'{result}.chk.lz4'

    def _docs_warc_file_counts(self):
        if self._docs_warc_file_counts_cache is None:
            counts = {}
            for counts_file in glob(os.path.join(self.docs_dlc.path(), 'recordcounts', '*.txt')):
                dirname = os.path.basename(counts_file)[:-len('_counts.txt')]
                with open(counts_file, 'rt') as f:
                    for line in f:
                        file, count = line.strip().split()
                        # entries are './xxx'; strip the leading './'
                        file = os.path.join(self.docs_dlc.path(), dirname, file[2:])
                        counts[file] = int(count)
            self._docs_warc_file_counts_cache = counts
        return self._docs_warc_file_counts_cache

    def docs_namespace(self):
        return NAME


class ClueWeb12b13Extractor:
    """Acts as a download for the b13 subset, which the user must build with the official JAR."""
    def __init__(self, docs_dlc, extract_jar_dlc):
        self.docs_dlc = docs_dlc
        self.extract_jar_dlc = extract_jar_dlc

    def path(self, force=True):
        source_path = self.docs_dlc.path()
        path = f'{source_path}-b13'
        if not force:
            return path
        if os.path.exists(path):
            self._create_record_counts_if_needed(path)
            return path
        extract_path = self.extract_jar_dlc.path()
        message = f'''clueweb12-b13 docs not found. Please either:
 (1) Link docs to {path} if b13 subset already built, or
 (2) Run the following command to build the b13 subset:
java -jar {extract_path} {source_path}/ {path}/
'''
        _logger.info(message)
        raise RuntimeError(message)

    def _create_record_counts_if_needed(self, path):
        # The official JAR doesn't build up the recordcounts files that we use for jumping ahead.
        # So we will build them ourselves the first time. Luckily, the header of each WARC file
        # in CW12 contains a warc-number-of-documents header, which we can use (avoids reading)
        # the entire file. It still takes a little time, but not super long.
        rc_dir = os.path.join(path, 'recordcounts')
        # NOTE(review): assumes rc_dir already exists (presumably created by the JAR);
        # os.listdir raises FileNotFoundError otherwise -- confirm against the build output.
        if len(os.listdir(rc_dir)) != 0:
            return
        warc = ir_datasets.lazy_libs.warc()
        with contextlib.ExitStack() as stack, _logger.pbar_raw(desc='building b13 document count cache', unit='file') as pbar:
            for d in glob(os.path.join(path, 'ClueWeb12_??')):
                d = os.path.basename(d)
                out = stack.enter_context(ir_datasets.util.finialized_file(f'{rc_dir}/{d}_counts.txt', 'wt'))
                for file in sorted(glob(os.path.join(path, d, '*', '*.warc.gz'))):
                    shortf = file[-24:]
                    with gzip.open(file, 'rb') as f, warc.WARCFile(fileobj=f) as warcf:
                        # First record's header carries the per-file document count
                        num_docs = next(iter(warcf)).header['warc-number-of-documents']
                        out.write(f'./{shortf} {num_docs}\n')
                    pbar.update(1)

    def stream(self):
        raise NotImplementedError
TrecSubQrels(dlc['trec-web-2013/qrels.all'], QREL_DEFS), documentation('trec-web-2013/diversity')) subsets['trec-web-2014'] = Dataset( collection, TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'), TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS), documentation('trec-web-2014')) subsets['trec-web-2014/diversity'] = Dataset( collection, TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'), TrecSubQrels(dlc['trec-web-2014/qrels.all'], QREL_DEFS), documentation('trec-web-2014/diversity')) subsets['b13/ntcir-www-1'] = Dataset( collection_b13, TrecXmlQueries(Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path/'ntcir-www-1'/'queries.xml'), qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace='ntcir-www', lang='en'), NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS), documentation('ntcir-www-1')) subsets['b13/ntcir-www-2'] = Dataset( collection_b13, TrecXmlQueries(Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path/'ntcir-www-2'/'queries.xml'), qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'), NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS), documentation('ntcir-www-2')) subsets['b13/ntcir-www-3'] = Dataset( collection_b13, TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'), documentation('ntcir-www-3')) subsets['b13/trec-misinfo-2019'] = Dataset( collection_b13, TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo-2019', lang='en'), MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS), documentation('trec-misinfo-2019')) subsets['b13/clef-ehealth'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='en'), EhealthQrels( 
[dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS), documentation('clef-ehealth')) subsets['b13/clef-ehealth/cs'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/cs']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='cs'), EhealthQrels( [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS, query_id_suffix='-cs'), documentation('clef-ehealth/cs')) subsets['b13/clef-ehealth/de'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/de']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='de'), EhealthQrels( [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS, query_id_suffix='-de'), documentation('clef-ehealth/de')) subsets['b13/clef-ehealth/fr'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/fr']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='fr'), EhealthQrels( [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS, query_id_suffix='-fr'), documentation('clef-ehealth/fr')) subsets['b13/clef-ehealth/hu'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/hu']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='hu'), EhealthQrels( [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], 
[dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS, query_id_suffix='-hu'), documentation('clef-ehealth/hu')) subsets['b13/clef-ehealth/pl'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/pl']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='pl'), EhealthQrels( [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS, query_id_suffix='-pl'), documentation('clef-ehealth/pl')) subsets['b13/clef-ehealth/sv'] = Dataset( collection_b13, TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/sv']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='sv'), EhealthQrels( [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']], [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']], [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']], EHEALTH_QREL_DEFS, query_id_suffix='-sv'), documentation('clef-ehealth/sv')) # NOTE: the following datasets are defined in touche.py: # - clueweb12/touche-2020-task-2 # - clueweb12/touche-2021-task-2 # - clueweb12/touche-2022-task-2 ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/codec.py ================================================ import json from typing import NamedTuple import ir_datasets from ir_datasets.util import Lazy from ir_datasets.formats import BaseQueries, TrecQrels, JsonlDocs from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQrels _logger = ir_datasets.log.easy() NAME = 'codec' QREL_DEFS = { 3: 'Very Valuable. 
class CodecQueries(BaseQueries):
    """Queries for the CODEC dataset, read from a single JSON topics file.

    When ``qid_filter`` is given, only queries whose ID starts with that
    prefix (e.g. a domain name) are yielded.
    """
    def __init__(self, streamer, qid_filter=None):
        super().__init__()
        self._streamer = streamer
        self._qid_filter = qid_filter

    def queries_iter(self):
        with self._streamer.stream() as stream:
            topics = json.load(stream)
            prefix = self._qid_filter
            for query_id, topic in topics.items():
                if prefix is not None and not query_id.startswith(prefix):
                    continue
                yield CodecQuery(query_id, topic['Query'], topic['Domain'], topic['Guidelines'])

    def queries_cls(self):
        return CodecQuery

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return 'en'
class CodeSearchNetDoc(NamedTuple):
    """A single function record from the CodeSearchNet corpus.

    Fields mirror the dataset's JSONL records (see ``CodeSearchNetDocs.docs_iter``).
    """
    doc_id: str  # the function's GitHub URL (the 'url' field is used as the unique ID)
    repo: str  # repository name
    path: str  # path of the source file within the repository
    func_name: str  # name of the function/method
    code: str  # source code of the function
    language: str  # programming language (python, java, go, php, ruby, javascript)
class CodeSearchNetQueries(BaseQueries):
    """Queries (function docstrings) for one CodeSearchNet split.

    Reads the gzipped JSONL files under each language archive's directory
    for the requested split (train/valid/test).
    """
    def __init__(self, queries_dlcs, split):
        super().__init__()
        self.queries_dlcs = queries_dlcs
        self.split = split

    def queries_iter(self):
        for dlc in self.queries_dlcs:
            root = Path(dlc.path())
            for gz_file in sorted(root.glob(f'**/{self.split}/*.gz')):
                with gzip.open(gz_file, 'rt') as fh:
                    for raw_line in fh:
                        record = json.loads(raw_line)
                        # query_id = url, text = docstring
                        yield GenericQuery(record['url'], record['docstring'])

    def queries_cls(self):
        return GenericQuery

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return 'en'
queries_handler def qrels_path(self): return self.qrels_dlc.path() def qrels_iter(self): query_map = {q.text: q.query_id for q in self._queries_handler.queries_iter()} with self.qrels_dlc.stream() as stream: stream = io.TextIOWrapper(stream) for data in csv.DictReader(stream): yield CodeSearchNetChallengeQrel( query_id=query_map[data['Query']], doc_id=data['GitHubUrl'], relevance=data['Relevance'], note=data['Notes']) def qrels_cls(self): return CodeSearchNetChallengeQrel def qrels_defs(self): return QREL_DEFS_CHALLENGE def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) subsets = {} langs = ['python', 'java', 'go', 'php', 'ruby', 'javascript'] dlcs = {lang: ZipExtractCache(dlc[lang], base_path/lang) for lang in langs} all_dlcs = [dlcs[lang] for lang in langs] collection = CodeSearchNetDocs(all_dlcs) base = Dataset( collection, documentation('_'), ) subsets['train'] = Dataset( collection, CodeSearchNetQueries(all_dlcs, 'train'), CodeSearchNetQrels(all_dlcs, 'train'), documentation('train'), ) subsets['valid'] = Dataset( collection, CodeSearchNetQueries(all_dlcs, 'valid'), CodeSearchNetQrels(all_dlcs, 'valid'), documentation('valid'), ) subsets['test'] = Dataset( collection, CodeSearchNetQueries(all_dlcs, 'test'), CodeSearchNetQrels(all_dlcs, 'test'), documentation('test'), ) challenge_queries = CodeSearchNetChallengeQueries(dlc['challenge/queries']) subsets['challenge'] = Dataset( collection, challenge_queries, CodeSearchNetChallengeQrels(dlc['challenge/qrels'], challenge_queries), documentation('challenge'), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/cord19.py ================================================ import io import codecs import json import csv 
class Cord19Doc(NamedTuple):
    """A CORD-19 document record built from the metadata CSV (no full text)."""
    doc_id: str
    title: str
    doi: str
    date: str
    abstract: str

    def default_text(self):
        """
        title + abstract
        """
        return ' '.join((self.title, self.abstract))
The article need not contain all information on the topic, but must, on its own, provide an answer to the question.', 1: 'Partially Relevant: the article answers part of the question but would need to be combined with other information to get a complete answer.', 0: 'Not Relevant: everything else.', } QTYPE_MAP = { 'query': 'title', 'question': 'description', 'narrative': 'narrative' } class Cord19Docs(BaseDocs): def __init__(self, streamer, extr_path, date, include_fulltext=False, count_hint=None): self._streamer = streamer self._extr_path = Path(extr_path) self._date = date self._include_fulltext = include_fulltext self._count_hint = count_hint def docs_path(self, force=True): result = self._streamer.path(force) if self._include_fulltext: return f'{result}.fulltext' return result def docs_cls(self): return Cord19FullTextDoc if self._include_fulltext else Cord19Doc def docs_iter(self): return iter(self.docs_store()) def _docs_iter(self): if self._include_fulltext: if not os.path.exists(self._extr_path): try: with self._streamer.stream() as stream: mode = 'r|' if self._streamer.path().endswith('.gz'): mode += 'gz' elif self._streamer.path().endswith('.bz2'): mode += 'bz2' with _logger.duration('extracting tarfile'): with tarfile.open(fileobj=stream, mode=mode) as tarf: tarf.extractall(self._extr_path) except: shutil.rmtree(self._extr_path) raise with contextlib.ExitStack() as ctxt: # Sometiems the document parses are in a single big file, sometimes in separate. 
fulltexts = None if self._include_fulltext: bigfile = self._extr_path/self._date/'document_parses.tar.gz' if bigfile.exists(): fulltexts = tarfile.open(fileobj=ctxt.push(bigfile.open('rb'))) else: fulltexts = { 'biorxiv_medrxiv': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'biorxiv_medrxiv.tar.gz').open('rb'))), 'comm_use_subset': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'comm_use_subset.tar.gz').open('rb'))), 'noncomm_use_subset': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'noncomm_use_subset.tar.gz').open('rb'))), 'custom_license': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'custom_license.tar.gz').open('rb'))), } if self._include_fulltext: csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt')) else: csv_reader = ctxt.enter_context(self._streamer.stream()) csv_reader = codecs.getreader('utf8')(csv_reader) csv_reader = csv.DictReader(csv_reader) for record in csv_reader: did = record['cord_uid'] title = record['title'] doi = record['doi'] abstract = record['abstract'] date = record['publish_time'] if self._include_fulltext: body = None # Sometiems the document parses are in a single big file, sometimes in separate. # The metadata format is also different in these cases. 
if isinstance(fulltexts, dict): if record['has_pmc_xml_parse']: path = os.path.join(record['full_text_file'], 'pmc_json', record['pmcid'] + '.xml.json') body = json.load(fulltexts[record['full_text_file']].extractfile(path)) elif record['has_pdf_parse']: path = os.path.join(record['full_text_file'], 'pdf_json', record['sha'].split(';')[0].strip() + '.json') body = json.load(fulltexts[record['full_text_file']].extractfile(path)) elif fulltexts is not None: if record['pmc_json_files']: body = json.load(fulltexts.extractfile(record['pmc_json_files'].split(';')[0])) elif record['pdf_json_files']: body = json.load(fulltexts.extractfile(record['pdf_json_files'].split(';')[0])) if body is not None: if 'body_text' in body: body = tuple(Cord19FullTextSection(b['section'], b['text']) for b in body['body_text']) else: body = tuple() # no body available else: body = tuple() # no body available yield Cord19FullTextDoc(did, title, doi, date, abstract, body) else: yield Cord19Doc(did, title, doi, date, abstract) def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f'{self.docs_path(force=False)}.pklz4', init_iter_fn=self._docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], count_hint=self._count_hint, options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return NAME def docs_lang(self): return 'en' def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') collection = Cord19Docs(dlc['docs/2020-07-16/metadata'], base_path/'2020-07-16', '2020-07-16', count_hint=ir_datasets.util.count_hint(f'{NAME}')) collection_ft = Cord19Docs(dlc['docs/2020-07-16'], base_path/'2020-07-16.fulltext', '2020-07-16', include_fulltext=True, count_hint=ir_datasets.util.count_hint(f'{NAME}/fulltext')) queries = 
TrecXmlQueries(dlc['trec-covid/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en') qrels = TrecQrels(dlc['trec-covid/qrels'], QRELS_DEFS) base = Dataset(collection, documentation('_')) subsets['trec-covid'] = Dataset(queries, qrels, collection, documentation('trec-covid')) subsets['fulltext'] = Dataset(collection_ft, documentation('fulltext')) subsets['fulltext/trec-covid'] = Dataset(queries, qrels, collection_ft, documentation('fulltext/trec-covid')) subsets['trec-covid/round1'] = Dataset( Cord19Docs(dlc['docs/2020-04-10/metadata'], base_path/'2020-04-10', '2020-04-10', count_hint=ir_datasets.util.count_hint(f'{NAME}/round1')), TrecXmlQueries(dlc['trec-covid/round1/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'), TrecQrels(dlc['trec-covid/round1/qrels'], QRELS_DEFS), documentation('trec-covid/round1')) subsets['trec-covid/round2'] = Dataset( Cord19Docs(dlc['docs/2020-05-01/metadata'], base_path/'2020-05-01', '2020-05-01', count_hint=ir_datasets.util.count_hint(f'{NAME}/round2')), TrecXmlQueries(dlc['trec-covid/round2/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'), TrecQrels(dlc['trec-covid/round2/qrels'], QRELS_DEFS), documentation('trec-covid/round2')) subsets['trec-covid/round3'] = Dataset( Cord19Docs(dlc['docs/2020-05-19/metadata'], base_path/'2020-05-19', '2020-05-19', count_hint=ir_datasets.util.count_hint(f'{NAME}/round3')), TrecXmlQueries(dlc['trec-covid/round3/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'), TrecQrels(dlc['trec-covid/round3/qrels'], QRELS_DEFS), documentation('trec-covid/round3')) subsets['trec-covid/round4'] = Dataset( Cord19Docs(dlc['docs/2020-06-19/metadata'], base_path/'2020-06-19', '2020-06-19', count_hint=ir_datasets.util.count_hint(f'{NAME}/round4')), TrecXmlQueries(dlc['trec-covid/round4/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'), TrecQrels(dlc['trec-covid/round4/qrels'], QRELS_DEFS), documentation('trec-covid/round4')) subsets['trec-covid/round5'] = Dataset( collection, 
def prefix_sentinel_splitter(it, sentinel):
    """Group a line iterator into records that each begin with ``sentinel``.

    Yields one list of lines per record: the first element is the sentinel
    line with the sentinel text removed, followed by the record's remaining
    lines. Lines appearing before the first sentinel are skipped.

    BUG FIX: previously, input that did not start with a sentinel line
    crashed with ``TypeError`` (``None += list``); such lines are now ignored.
    """
    lines = None
    for is_sentinel, group in itertools.groupby(it, lambda l: l.startswith(sentinel)):
        if is_sentinel:
            # Start a new record; keep only the first sentinel line of the
            # group, with the sentinel text stripped out.
            lines = [list(group)[0].replace(sentinel, '')]
        elif lines is not None:
            lines += list(group)
            yield lines
        # else: content before the first sentinel -- skip it
def docs_namespace(self):
    """Return the docs namespace for this collection.

    BUG FIX: was declared without ``self``, so calling it on a
    ``CranfieldDocs`` instance raised ``TypeError``.
    """
    return NAME
self._qrels_dlc.path() def qrels_iter(self): with self._qrels_dlc.stream() as f: f = codecs.getreader('utf8')(f) for line in f: cols = line.rstrip().split() if len(cols) != 3: raise RuntimeError(f'expected 3 columns, got {len(cols)}') qid, did, score = cols yield TrecQrel(qid, did, int(score), '0') def qrels_cls(self): return TrecQrel def qrels_defs(self): return QREL_DEFS def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) subsets = {} main_dlc = dlc['main'] base = Dataset( CranfieldDocs(Cache(TarExtract(main_dlc, 'cran.all.1400'), base_path/'docs.txt')), CranfieldQueries(Cache(TarExtract(main_dlc, 'cran.qry'), base_path/'queries.txt')), CranfieldQrels(Cache(TarExtract(main_dlc, 'cranqrel'), base_path/'qrels.txt')), documentation('_'), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/csl.py ================================================ from typing import List, NamedTuple from enum import Enum import ir_datasets from ir_datasets.util import DownloadConfig, GzipExtract from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.formats.trec import TrecQrels from ir_datasets.formats import JsonlDocs, ExctractedCCQueries, ExctractedCCNoReportQuery from ir_datasets.util.fileio import TarExtract NAME = 'csl' class CslDoc(NamedTuple): doc_id: str title: str abstract: str keywords: List[str] category: str category_eng: str discipline: str discipline_eng: str def default_text(self): return f'{self.title}\n{self.abstract}' QREL_DEFS = { 3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.', 1: 'Somewhat-valuable. 
The most valuable information in the document would be found in the remainder of such a report.', 0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.', } def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') docs = JsonlDocs(GzipExtract(dlc['docs']), doc_cls=CslDoc, namespace=NAME, lang='zh', count_hint=395927) base = Dataset( docs, documentation('_') ) subsets["trec-2023"] = Dataset( docs, ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang='zh', filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME), TrecQrels(TarExtract(dlc['trec-2023/qrels'], 'tech_final_qrels.txt'), QREL_DEFS), documentation('trec-2023'), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/disks45.py ================================================ import ir_datasets from ir_datasets.util import GzipExtract, TarExtract, Lazy, DownloadConfig from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation NAME = 'disks45' QREL_DEFS = { 2: 'highly relevant', 1: 'relevant', 0: 'not relevant', } QREL_DEFS_TREC78 = { 1: 'relevant', 0: 'not relevant', } DUA = ("Please confirm you agree to the TREC data usage agreement found at " "<https://trec.nist.gov/data/cd45/index.html>") # folds from Huston & Croft 2014 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.646.7749> ROBUST04_FOLDS = { 'fold1': {'302', '303', '309', '316', '317', '319', '323', '331', '336', '341', '356', '357', '370', '373', '378', '381', '383', '392', '394', '406', '410', '411', '414', '426', '428', '433', '447', '448', '601', '607', '608', '612', 
'617', '619', '635', '641', '642', '646', '647', '654', '656', '662', '665', '669', '670', '679', '684', '690', '692', '700'}, 'fold2': {'301', '308', '312', '322', '327', '328', '338', '343', '348', '349', '352', '360', '364', '365', '369', '371', '374', '386', '390', '397', '403', '419', '422', '423', '424', '432', '434', '440', '446', '602', '604', '611', '623', '624', '627', '632', '638', '643', '651', '652', '663', '674', '675', '678', '680', '683', '688', '689', '695', '698'}, 'fold3': {'306', '307', '313', '321', '324', '326', '334', '347', '351', '354', '358', '361', '362', '363', '376', '380', '382', '396', '404', '413', '415', '417', '427', '436', '437', '439', '444', '445', '449', '450', '603', '605', '606', '614', '620', '622', '626', '628', '631', '637', '644', '648', '661', '664', '666', '671', '677', '685', '687', '693'}, 'fold4': {'320', '325', '330', '332', '335', '337', '342', '344', '350', '355', '368', '377', '379', '387', '393', '398', '402', '405', '407', '408', '412', '420', '421', '425', '430', '431', '435', '438', '616', '618', '625', '630', '633', '636', '639', '649', '650', '653', '655', '657', '659', '667', '668', '672', '673', '676', '682', '686', '691', '697'}, 'fold5': {'304', '305', '310', '311', '314', '315', '318', '329', '333', '339', '340', '345', '346', '353', '359', '366', '367', '372', '375', '384', '385', '388', '389', '391', '395', '399', '400', '401', '409', '416', '418', '429', '441', '442', '443', '609', '610', '613', '615', '621', '629', '634', '640', '645', '658', '660', '681', '694', '696', '699'} } def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path, dua=DUA) subsets = {} collection_nocr = TrecDocs(dlc['docs'], path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'], namespace=NAME, lang='en', expected_file_count=2295, count_hint=ir_datasets.util.count_hint(NAME), parser='sax', 
docstore_path=base_path/'corpus.nocr.pklz4')
    robust_queries = TrecQueries(GzipExtract(dlc['robust04-queries']), namespace=NAME, lang='en')
    robust_qrels = TrecQrels(dlc['robust04-qrels'], QREL_DEFS)
    base = Dataset(documentation('_'))
    subsets['nocr'] = Dataset(
        collection_nocr,
        documentation('nocr'))
    subsets['nocr/trec-robust-2004'] = Dataset(
        collection_nocr,
        robust_queries,
        robust_qrels,
        documentation('nocr/trec-robust-2004'))
    # One subset per Huston & Croft cross-validation fold: same corpus, with
    # queries and qrels filtered to the fold's query IDs.
    for fold in ROBUST04_FOLDS:
        qid_filter = make_filter(fold)
        subsets[f'nocr/trec-robust-2004/{fold}'] = Dataset(
            collection_nocr,
            FilteredQueries(robust_queries, qid_filter),
            FilteredQrels(robust_qrels, qid_filter),
            documentation(f'nocr/trec-robust-2004/{fold}'))
    subsets['nocr/trec8'] = Dataset(
        collection_nocr,
        TrecQrels(TarExtract(dlc['trec8-qrels'], 'qrels.trec8.adhoc.parts1-5'), QREL_DEFS_TREC78),
        TrecQueries(GzipExtract(dlc['trec8-queries']), namespace=NAME, lang='en'),
        documentation('nocr/trec8'))
    subsets['nocr/trec7'] = Dataset(
        collection_nocr,
        # TREC-7 qrels are distributed as five gzipped parts within one tar.
        TrecQrels([
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part1.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part2.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part3.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part4.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part5.gz')),
        ], QREL_DEFS_TREC78),
        TrecQueries(GzipExtract(dlc['trec7-queries']), namespace=NAME, lang='en'),
        documentation('nocr/trec7'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets


def make_filter(fold):
    # Lazy so the fold's query-ID set is only resolved when the filter is used.
    return Lazy(lambda: ROBUST04_FOLDS[fold])


base, subsets = _init()


================================================
FILE: ir_datasets/datasets/dpr_w100.py
================================================
from typing import NamedTuple, Tuple
import contextlib
import itertools
import ir_datasets
from ir_datasets.util import GzipExtract
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import TsvDocs, BaseQueries, TrecQrels

_logger = ir_datasets.log.easy()

NAME = 'dpr-w100'

QREL_DEFS = {
    2: 'marked by human annotator as containing the answer',
    1: 'contains the answer text and retrieved in the top BM25 results',
    0: '"hard" negative samples',
    -1: 'negative samples'
}


class DprW100Doc(NamedTuple):
    doc_id: str
    text: str
    title: str
    def default_text(self):
        """
        title + text
        """
        return f'{self.title} {self.text}'


class DprW100Query(NamedTuple):
    query_id: str
    text: str
    answers: Tuple[str, ]
    def default_text(self):
        """
        text
        """
        return self.text


class DprW100Manager:
    # Converts the source DPR retriever-training JSON into a queries.tsv file
    # and a TREC-format qrels file under base_path (built once, on demand).
    def __init__(self, dlc, base_path, passage_id_key='passage_id'):
        self._dlc = dlc
        self._base_path = base_path
        self._base_path.mkdir(parents=True, exist_ok=True)
        # NQ files key passages by 'passage_id'; TriviaQA files use 'psg_id'.
        self._passage_id_key = passage_id_key

    def build(self):
        ijson = ir_datasets.lazy_libs.ijson()
        if (self._base_path/'queries.tsv').exists():
            return # already built
        with contextlib.ExitStack() as stack:
            f_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'queries.tsv', 'wt'))
            f_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'qrels', 'wt'))
            stream = stack.enter_context(self._dlc.stream())
            # Query IDs are assigned sequentially in source-file order.
            qid_counter = itertools.count()
            for record in _logger.pbar(ijson.items(stream, 'item'), 'building dpr-w100', unit='record'):
                qid = str(next(qid_counter))
                # TSV row: qid, question, then one column per answer string
                # (tabs inside text are replaced to keep the format intact).
                f_queries.write('\t'.join([
                    qid,
                    record['question'].replace('\t', ' ')
                ] + [
                    a.replace('\t', ' ') for a in record['answers']
                ]) + '\n')
                # Each passage is written at most once per query; positives take
                # precedence over hard negatives, which take precedence over negatives.
                seen = set()
                for ctxt in record['positive_ctxs']:
                    if ctxt[self._passage_id_key] not in seen:
                        seen.add(ctxt[self._passage_id_key])
                        # score == 1000 marks human-annotated positives (rel 2);
                        # others are BM25-retrieved answer matches (rel 1).
                        rel = 2 if ctxt['score'] == 1000 else 1
                        f_qrels.write(f'{qid} 0 {ctxt[self._passage_id_key]} {rel}\n')
                for ctxt in record['hard_negative_ctxs']:
                    if ctxt[self._passage_id_key] not in seen:
                        seen.add(ctxt[self._passage_id_key])
                        f_qrels.write(f'{qid} 0 {ctxt[self._passage_id_key]} 0\n')
                for ctxt in record['negative_ctxs']:
                    if ctxt[self._passage_id_key] not in seen:
                        seen.add(ctxt[self._passage_id_key])
                        f_qrels.write(f'{qid} 0 {ctxt[self._passage_id_key]} -1\n')

    def file_ref(self, path):
        return _ManagedDlc(self, self._base_path/path)


class _ManagedDlc:
    # A download-config-like handle that triggers the manager's build() before
    # exposing the generated file.
    def __init__(self, manager, path):
        self._manager = manager
        self._path = path

    @contextlib.contextmanager
    def stream(self):
        self._manager.build()
        with open(self._path, 'rb') as f:
            yield f

    def path(self, force=True):
        if force:
            self._manager.build()
        return self._path


class DprW100Queries(BaseQueries):
    def __init__(self, dlc):
        self._dlc = dlc

    def queries_iter(self):
        with self._dlc.stream() as stream:
            for line in stream:
                cols = line.decode().strip().split('\t')
                # columns beyond the first two are the answer strings
                yield DprW100Query(cols[0], cols[1], tuple(cols[2:]))

    def queries_cls(self):
        return DprW100Query

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return 'en'


def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TsvDocs(GzipExtract(dlc['docs']), doc_cls=DprW100Doc, namespace=NAME, lang='en', skip_first_line=True, docstore_size_hint=12827215492, count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(
        collection,
        documentation('_'))
    subsets = {}
    nq_dev_manager = DprW100Manager(GzipExtract(dlc['nq-dev']), base_path/'nq-dev')
    subsets['natural-questions/dev'] = Dataset(
        collection,
        DprW100Queries(nq_dev_manager.file_ref('queries.tsv')),
        TrecQrels(nq_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/dev'))
    nq_train_manager = DprW100Manager(GzipExtract(dlc['nq-train']), base_path/'nq-train')
    subsets['natural-questions/train'] = Dataset(
        collection,
        DprW100Queries(nq_train_manager.file_ref('queries.tsv')),
        TrecQrels(nq_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/train'))
    tqa_dev_manager = DprW100Manager(GzipExtract(dlc['tqa-dev']),
base_path/'tqa-dev', passage_id_key='psg_id')
    subsets['trivia-qa/dev'] = Dataset(
        collection,
        DprW100Queries(tqa_dev_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/dev'))
    tqa_train_manager = DprW100Manager(GzipExtract(dlc['tqa-train']), base_path/'tqa-train', passage_id_key='psg_id')
    subsets['trivia-qa/train'] = Dataset(
        collection,
        DprW100Queries(tqa_train_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/train'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets


base, subsets = _init()


================================================
FILE: ir_datasets/datasets/gov.py
================================================
import re
import io
import os
import gzip
import codecs
from collections import Counter
from contextlib import contextmanager, ExitStack
from pathlib import Path
from typing import NamedTuple
from glob import glob
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract
from ir_datasets.formats import TrecQrels, TrecQueries, TrecColonQueries, BaseDocs, GenericQuery, BaseQrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import Docstore, PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS

_logger = ir_datasets.log.easy()

NAME = 'gov'

QREL_DEFS = {
    1: 'Relevant',
    0: 'Not Relevant',
}

NAMED_PAGE_QREL_DEFS = {
    1: 'Name refers to this page',
}

# TREC topic tag -> query field mappings for the various GOV tracks.
NAMED_PAGE_QTYPE_MAP = {
    '<num> *(Number:)? *NP': 'query_id', # Remove NP prefix from QIDs
    '<desc> *(Description:)?': 'text',
}

WEB03_QTYPE_MAP = {
    '<num> *(Number:)? *TD': 'query_id', # Remove TD prefix from QIDs
    '<title>': 'title',
    '<desc> *(Description:)?': 'description',
}

WEB04_QTYPE_MAP = {
    '<num> *(Number:)? *WT04-': 'query_id',
    '<title>': 'text',
}


class GovWeb02Query(NamedTuple):
    query_id: str
    title: str
    description: str
    def default_text(self):
        """
        title
        """
        return self.title


class GovDoc(NamedTuple):
    doc_id: str
    url: str
    http_headers: str
    body: bytes
    body_content_type: str
    def default_text(self):
        # Single combined output field extracted from the title+body tags by
        # the SAX HTML parser (hence the one-element list of a field set).
        return ir_datasets.util.sax_html_parser(self.body, headers=self.http_headers, fields=[{'title', 'body'}])[0]


class GovDocs(BaseDocs):
    # Documents of the TREC GOV corpus, parsed out of the gzipped
    # <DOC>...</DOC> source files under directories named G??.
    def __init__(self, docs_dlc):
        super().__init__()
        self.docs_dlc = docs_dlc

    def docs_path(self, force=True):
        return self.docs_dlc.path(force)

    def docs_iter(self):
        return iter(self.docs_store())

    def _docs_iter(self):
        dirs = sorted(Path(self.docs_dlc.path()).glob('G??'))
        for source_dir in dirs:
            for source_file in sorted(source_dir.glob('*.gz')):
                yield from self._docs_ctxt_iter_gov(source_file)

    def docs_cls(self):
        return GovDoc

    def _docs_ctxt_iter_gov(self, gov2f):
        # Yields one GovDoc per <DOC> block in the (gzipped) source file.
        with ExitStack() as stack:
            if isinstance(gov2f, (str, Path)):
                gov2f = stack.enter_context(gzip.open(gov2f, 'rb'))
            inp = bytearray()
            # incrementally read the input file with read1 -- this ends up being more than twice
            # as fast as reading the input line-by-line and searching for <DOC> and </DOC> lines
            inp.extend(gov2f.read1())
            START, END = b'<DOC>\n', b'</DOC>\n'
            while inp != b'':
                inp, next_doc = self._extract_next_block(inp, START, END)
                while next_doc is not None:
                    yield self._process_gov_doc(next_doc)
                    inp, next_doc = self._extract_next_block(inp, START, END)
                inp.extend(gov2f.read1())

    def _process_gov_doc(self, raw_doc):
        # read the file by exploiting the sequence of blocks in the document -- this ends
        # up being several times faster than reading line-by-line
        raw_doc, doc_id = self._extract_next_block(raw_doc, b'<DOCNO>', b'</DOCNO>\n')
        assert doc_id is not None
        doc_id = doc_id.strip().decode()
        doc_body, doc_hdr = self._extract_next_block(raw_doc, b'<DOCHDR>\n', b'</DOCHDR>\n')
        assert doc_hdr is not None
        # Try progressively more permissive encodings; latin1 never fails, so
        # doc_url/doc_hdr are always bound after this loop.
        for encoding in ['utf8', 'ascii', 'latin1']:
            try:
                # first header line is the request URL; the rest are HTTP headers
                doc_url, doc_hdr = doc_hdr.decode(encoding).split('\n', 1)
                break
            except UnicodeDecodeError:
                continue
        content_type_match = re.search('^content-type:(.*)$', doc_hdr, re.I|re.M)
        content_type = 'text/html' # default to text/html
        if content_type_match:
            content_type = content_type_match.group(1)
            if ';' in content_type:
                # strip parameters such as "; charset=..."
                content_type, _ = content_type.split(';', 1)
        content_type = content_type.strip()
        return GovDoc(doc_id, doc_url, doc_hdr, bytes(doc_body), content_type)

    def _extract_next_block(self, inp, START, END):
        # if START and END appear in inp, then return (everything after END in inp, the content between START and END),
        # or if they don't appear, return (inp, None).
        i_start = inp.find(START)
        i_end = inp.find(END)
        if i_start == -1 or i_end == -1:
            return inp, None
        return inp[i_end+len(END):], inp[i_start+len(START):i_end]

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )

    def docs_count(self):
        # Only known once the docstore has been built (returns None otherwise).
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'


def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))
    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002')
    )
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page',
lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page')
    )
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003')
    )
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page')
    )
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004')
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets


base, subsets = _init()


================================================
FILE: ir_datasets/datasets/gov2.py
================================================
import re
import io
import os
import gzip
import codecs
from collections import Counter
from contextlib import contextmanager, ExitStack
from pathlib import Path
from typing import NamedTuple
from glob import glob
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract
from ir_datasets.formats import TrecQrels, TrecQueries, TrecColonQueries, BaseDocs, GenericQuery, BaseQrels, TrecPrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import Docstore

_logger = ir_datasets.log.easy()

NAME = 'gov2'

QREL_DEFS = {
    2: 'Highly Relevant',
    1: 'Relevant',
    0: 'Not Relevant',
}

NAMED_PAGE_QREL_DEFS = {
    1: 'Relevant',
    0: 'Not Relevant',
}

NAMED_PAGE_QTYPE_MAP = {
    '<num> *(Number:)? *NP': 'query_id', # Remove NP prefix from QIDs
    '<title> *(Topic:)?': 'text',
}

# Maps TREC Terabyte ad-hoc topic IDs to the efficiency-track query IDs used
# in the shared qrels (2005 and 2006 editions); see RewriteQids below.
EFF_MAP_05 = {'751': '1192', '752': '1330', '753': '5956', '754': '6303', '755': '6939', '756': '7553', '757': '8784', '758': '9121', '759': '9266', '760': '10359', '761': '10406', '762': '11597', '763': '12750', '764': '15502', '765': '16895', '766': '17279', '767': '17615', '768': '18050', '769': '18678', '770': '19280', '771': '19963', '772': '20766', '773': '21329', '774': '21513', '775': '23212', '776': '24289', '777': '24781', '778': '24813', '779': '26593', '780': '27428', '781': '28120', '782': '28627', '783': '29561', '784': '33379', '785': '33820', '786': '34135', '787': '35192', '788': '36242', '789': '36530', '790': '36616', '791': '36738', '792': '37111', '793': '41088', '794': '41192', '795': '41506', '796': '44506', '797': '45081', '798': '47993', '799': '48890', '800': '49462'}
EFF_MAP_06 = {'801': '62937', '802': '63569', '803': '63582', '804': '63641', '805': '64227', '806': '64266', '807': '64310', '808': '64642', '809': '64687', '810': '64704', '811': '64723', '812': '64741', '813': '64752', '814': '64938', '815': '65024', '816': '65070', '817': '65222', '818': '65335', '819': '65486', '820': '65504', '821': '65599', '822': '65821', '823': '65826', '824': '65950', '825': '66084', '826': '66409', '827': '66725', '828': '67326', '829': '67531', '830': '67550', '831': '67782', '832': '67961', '833': '68322', '834': '68492', '835': '68967', '836': '69028', '837': '69127', '838': '69401', '839': '69552', '840': '69564', '841': '69935', '842': '70033', '843': '70041', '844': '70285', '845': '70579', '846': '70707', '847': '70751', '848': '70815', '849': '70935', '850': '71136'}


class Gov2Doc(NamedTuple):
    doc_id: str
    url: str
    http_headers: str
    body: bytes
    body_content_type: str
    def default_text(self):
        # Single combined output field extracted from the title+body tags by
        # the SAX HTML parser (hence the one-element list of a field set).
        return ir_datasets.util.sax_html_parser(self.body, headers=self.http_headers, fields=[{'title', 'body'}])[0]


class Gov2DocIter:
    # Sliced iterator over the GOV2 corpus. Uses per-file document counts to
    # skip whole source files when advancing to slice.start.
    def __init__(self, gov2_docs, slice):
        self.gov2_docs = gov2_docs
        self.slice = slice
        self.next_index = 0
        self.file_iter = gov2_docs._docs_iter_source_files()
        self.current_file = None
        self.current_file_start_idx = 0
        self.current_file_end_idx = 0

    def __next__(self):
        if self.slice.start >= self.slice.stop:
            raise StopIteration
        # Advance until the underlying position (next_index) reaches slice.start
        # and the current file actually contains that index.
        while self.next_index != self.slice.start or self.current_file is None or self.current_file_end_idx <= self.slice.start:
            if self.current_file is None or self.current_file_end_idx <= self.slice.start:
                # First iteration or no docs remaining in this file
                if self.current_file is not None:
                    self.current_file.close()
                    self.current_file = None
                # jump ahead to the file that contains the desired index
                first = True
                while first or self.current_file_end_idx < self.slice.start:
                    source_file = next(self.file_iter)
                    self.next_index = self.current_file_end_idx
                    self.current_file_start_idx = self.current_file_end_idx
                    self.current_file_end_idx = self.current_file_start_idx + self.gov2_docs._docs_file_counts()[source_file]
                    first = False
                self.current_file = self.gov2_docs._docs_ctxt_iter_gov2(source_file)
            else:
                for _ in zip(range(self.slice.start - self.next_index), self.current_file):
                    # The zip here will stop at after either as many docs we must advance, or however
                    # many docs remain in the file. In the latter case, we'll just drop out into the
                    # next iteration of the while loop and pick up the next file.
                    self.next_index += 1
        result = next(self.current_file)
        self.next_index += 1
        # advance the slice window by one step for the next __next__ call
        self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
        return result

    def close(self):
        self.file_iter = None

    def __iter__(self):
        return self

    def __del__(self):
        self.close()

    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step]
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return Gov2DocIter(self.gov2_docs, new_slice)
        elif isinstance(key, int):
            # it[index]
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = Gov2DocIter(self.gov2_docs, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError((self.slice, slice(key, key+1), new_slice))
        raise TypeError('key must be int or slice')


class Gov2Docs(BaseDocs):
    # Documents of the TREC GOV2 corpus, parsed from the gzipped
    # <DOC>...</DOC> source files under GOV2_data/GX???/.
    def __init__(self, docs_dlc, doccount_dlc):
        super().__init__()
        self.docs_dlc = docs_dlc
        self._doccount_dlc = doccount_dlc
        self._docs_file_counts_cache = None

    def docs_path(self, force=True):
        return self.docs_dlc.path(force)

    def _docs_iter_source_files(self):
        dirs = sorted((Path(self.docs_dlc.path()) / 'GOV2_data').glob('GX???'))
        for source_dir in dirs:
            for source_file in sorted(source_dir.glob('*.gz')):
                yield str(source_file)

    def docs_iter(self):
        return Gov2DocIter(self, slice(0, self.docs_count()))

    def docs_cls(self):
        return Gov2Doc

    def _docs_ctxt_iter_gov2(self, gov2f):
        # Yields one Gov2Doc per <DOC> block in the (gzipped) source file.
        with ExitStack() as stack:
            if isinstance(gov2f, (str, Path)):
                gov2f = stack.enter_context(gzip.open(gov2f, 'rb'))
            inp = bytearray()
            # incrementally read the input file with read1 -- this ends up being more than twice
            # as fast as reading the input line-by-line and searching for <DOC> and </DOC> lines
            inp.extend(gov2f.read1())
            START, END = b'<DOC>\n', b'</DOC>\n'
            while inp != b'':
                inp, next_doc = self._extract_next_block(inp, START, END)
                while next_doc is not None:
                    yield self._process_gov2_doc(next_doc)
                    inp, next_doc = self._extract_next_block(inp, START, END)
                inp.extend(gov2f.read1())

    def _process_gov2_doc(self, raw_doc):
        # read the file by exploiting the sequence of blocks in the document -- this ends
        # up being several times faster than reading line-by-line
        raw_doc, doc_id = self._extract_next_block(raw_doc, b'<DOCNO>', b'</DOCNO>\n')
        assert doc_id is not None
        doc_id = doc_id.strip().decode()
        doc_body, doc_hdr = self._extract_next_block(raw_doc, b'<DOCHDR>\n', b'</DOCHDR>\n')
        assert doc_hdr is not None
        # Try progressively more permissive encodings; latin1 never fails, so
        # doc_url/doc_hdr are always bound after this loop.
        for encoding in ['utf8', 'ascii', 'latin1']:
            try:
                doc_url, doc_hdr = doc_hdr.decode(encoding).split('\n', 1)
                break
            except UnicodeDecodeError:
                continue
        content_type_match = re.search('^content-type:(.*)$', doc_hdr, re.I|re.M)
        content_type = 'text/html' # default to text/html
        if content_type_match:
            content_type = content_type_match.group(1)
            if ';' in content_type:
                content_type, _ = content_type.split(';', 1)
        content_type = content_type.strip()
        return Gov2Doc(doc_id, doc_url, doc_hdr, bytes(doc_body), content_type)

    def _extract_next_block(self, inp, START, END):
        # if START and END appear in inp, then return (everything after END in inp, the content between START and END),
        # or if they don't appear, return (inp, None).
        i_start = inp.find(START)
        i_end = inp.find(END)
        if i_start == -1 or i_end == -1:
            return inp, None
        return inp[i_end+len(END):], inp[i_start+len(START):i_end]

    def _docs_id_to_source_file(self, doc_id):
        # doc IDs are formatted like GX024-52-0546388: directory-file-docnum
        parts = doc_id.split('-')
        if len(parts) != 3:
            return None
        s_dir, file, doc = parts
        source_file = os.path.join(self.docs_dlc.path(), 'GOV2_data', s_dir, f'{file}.gz')
        return source_file

    def _docs_file_counts(self):
        # Lazily loads (and caches) the per-source-file document counts.
        if self._docs_file_counts_cache is None:
            result = {}
            with self._doccount_dlc.stream() as f:
                f = codecs.getreader('utf8')(f)
                for line in f:
                    path, count = line.strip().split()
                    file = os.path.join(self.docs_dlc.path(), 'GOV2_data', path)
                    result[file] = int(count)
            self._docs_file_counts_cache = result
        return self._docs_file_counts_cache

    def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTIONS):
        docstore = Gov2Docstore(self)
        return ir_datasets.indices.CacheDocstore(docstore, f'{self.docs_path(force=False)}.cache', options=options)

    def docs_count(self):
        return sum(self._docs_file_counts().values())

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'


class Gov2Docstore(Docstore):
    # Looks docs up by scanning only the source file derived from each doc_id.
    def __init__(self, gov2_docs, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(gov2_docs.docs_cls(), 'doc_id', options=options)
        self.gov2_docs = gov2_docs

    def get_many_iter(self, doc_ids):
        result = {}
        # group requested IDs by the file they live in
        files_to_search = {}
        for doc_id in doc_ids:
            source_file = self.gov2_docs._docs_id_to_source_file(doc_id)
            if source_file is not None:
                if source_file not in files_to_search:
                    files_to_search[source_file] = []
                files_to_search[source_file].append(doc_id)
        for source_file, doc_ids in files_to_search.items():
            # sorted IDs match the file's document order, so one pass suffices
            doc_ids = sorted(doc_ids)
            for doc in self.gov2_docs._docs_ctxt_iter_gov2(source_file):
                if doc_ids[0] == doc.doc_id:
                    yield doc
                    doc_ids = doc_ids[1:]
                    if not doc_ids:
                        break # file finished


class RewriteQids(BaseQrels):
    # Wraps a qrels source, mapping query IDs through qid_map (used to expose
    # the ad-hoc qrels under the efficiency-track query IDs).
    def __init__(self, base_qrels, qid_map):
        self._base_qrels = base_qrels
        self._qid_map = qid_map

    def qrels_iter(self):
        cls = self.qrels_cls()
        for qrel in self._base_qrels.qrels_iter():
            if qrel.query_id in self._qid_map:
                qrel = cls(self._qid_map[qrel.query_id], *qrel[1:])
            yield qrel

    def qrels_defs(self):
        return self._base_qrels.qrels_defs()

    def qrels_path(self):
        return self._base_qrels.qrels_path()

    def qrels_cls(self):
        return self._base_qrels.qrels_cls()


class Gov2DocCountFile:
    # Builds (once) and serves a file listing the number of documents in each
    # GOV2 source file, derived from the GOV2_extras/url2id.gz mapping.
    def __init__(self, path, docs_dlc):
        self._path = path
        self._docs_dlc = docs_dlc

    def path(self, force=True):
        if force and not os.path.exists(self._path):
            docs_urls_path = os.path.join(self._docs_dlc.path(), 'GOV2_extras/url2id.gz')
            result = Counter()
            with _logger.pbar_raw(desc='building doccounts file', total=25205179, unit='doc') as pbar:
                with gzip.open(docs_urls_path, 'rt') as fin:
                    for line in fin:
                        url, doc_id = line.rstrip().split()
                        d, f, i = doc_id.split('-') # formatted like: GX024-52-0546388
                        file = f'{d}/{f}.gz'
                        result[file] += 1
                        pbar.update()
            with ir_datasets.util.finialized_file(self._path, 'wt') as fout:
                for file in sorted(result):
                    fout.write(f'{file}\t{result[file]}\n')
        return self._path

    @contextmanager
    def stream(self):
        with open(self.path(), 'rb') as f:
            yield f


def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    doccount_dlc = Gov2DocCountFile(os.path.join(base_path, 'corpus.doccounts'), docs_dlc)
    collection = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(collection, documentation('_'))
    subsets['trec-tb-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2004/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004')
    )
    subsets['trec-tb-2005'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005')
    )
    subsets['trec-tb-2005/named-page'] = Dataset(
        collection,
TrecQueries(dlc['trec-tb-2005/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page')
    )
    subsets['trec-tb-2005/efficiency'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-tb-2005/efficiency/queries']), encoding='latin1', namespace=NAME, lang='en'),
        # efficiency-track query IDs map onto the 2005 ad-hoc qrels
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency')
    )
    subsets['trec-tb-2006'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006')
    )
    subsets['trec-tb-2006/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page')
    )
    subsets['trec-tb-2006/efficiency'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.all'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency')
    )
    # The 10k sample and streams 1/2/4 have no qrels; only stream 3 overlaps
    # with the judged ad-hoc topics.
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.10k'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/10k')
    )
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-1'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream1')
    )
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-2'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream2')
    )
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-3'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency/stream3')
    )
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-4'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream4')
    )
    subsets['trec-mq-2007'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2007/queries']), encoding='latin1'),
        TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS),
        documentation('trec-mq-2007')
    )
    subsets['trec-mq-2008'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2008/queries']), encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'), QREL_DEFS),
        documentation('trec-mq-2008')
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets


base, subsets = _init()


================================================
FILE: ir_datasets/datasets/hc4.py
================================================
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries

NAME = 'hc4'

# Known document counts per language subset.
DOC_COUNTS = {
    'zh': 646305,
    'fa': 486486,
    'ru': 4721064
}

QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}


def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_')) # dummy top level ds
    # One docs dataset per language, each with train/dev/test splits.
    for lang in ['zh', 'fa', 'ru']:
        lang_docs = ExctractedCCDocs(dlc[f'{lang}/docs'], subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
        subsets[lang] = Dataset(
            lang_docs,
            documentation(lang)
        )
        for sep in ['train', 'dev', 'test']:
            subsets[f'{lang}/{sep}'] = Dataset(
                lang_docs,
                ExctractedCCQueries(dlc[f'{sep}/topics'], subset_lang=lang, namespace=NAME),
                TrecQrels(dlc[f'{lang}/{sep}/qrels'], QREL_DEFS),
                documentation(f'{lang}/{sep}'),
            )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets


base, subsets = _init()


================================================
FILE: ir_datasets/datasets/highwire.py
================================================
import codecs
from typing import NamedTuple, Tuple
from zipfile import ZipFile
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, BaseQrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS

_logger = ir_datasets.log.easy()

# Journal archive names that make up the Highwire collection; one zip each.
SOURCES = ['ajepidem', 'ajpcell', 'ajpendometa', 'ajpgastro', 'ajpheart', 'ajplung', 'ajprenal', 'alcohol', 'andrology', 'annonc', 'bjanast', 'bjp', 'blood', 'carcinogenesis', 'cercor', 'development', 'diabetes', 'endocrinology', 'euroheartj', 'glycobiology', 'humanrep',
'humolgen', 'ijepidem', 'intimm', 'jantichemo', 'jappliedphysio', 'jbc-1995', 'jbc-1996', 'jbc-1997', 'jbc-1998', 'jbc-1999', 'jbc-2000', 'jbc-2001', 'jbc-2002', 'jbc-2003', 'jbc-2004', 'jbc-2005', 'jcb', 'jclinicalendometa', 'jcs', 'jexpbio', 'jexpmed', 'jgenphysio', 'jgenviro', 'jhistocyto', 'jnci', 'jneuro', 'mcp', 'microbio', 'molbiolevol', 'molendo', 'molhumanrep', 'nar', 'nephrodiatransp', 'peds', 'physiogenomics', 'rheumatolgy', 'rna', 'toxsci']

QREL_DEFS_06 = {
    0: 'NOT',
    1: 'POSSIBLY',
    2: 'DEFINITELY'
}

QREL_DEFS_07 = {
    0: 'NOT_RELEVANT',
    1: 'RELEVANT',
}

NAME = 'highwire'


class HighwireSpan(NamedTuple):
    start: int
    length: int
    text: str


class HighwireDoc(NamedTuple):
    doc_id: str
    journal: str
    title: str
    spans: Tuple[HighwireSpan, ...]
    def default_text(self):
        """
        title + spans
        """
        return self.title + ' ' + ' '.join(s.text for s in self.spans)


class TrecGenomicsQrel(NamedTuple):
    query_id: str
    doc_id: str
    span_start: int
    span_len: int
    relevance: int


class HighwireQrel(NamedTuple):
    query_id: str
    doc_id: str
    start: int
    length: int
    relevance: int


class HighwireDocs(BaseDocs):
    # Documents are read from one zip per journal; the separate "legal spans"
    # file gives the (start, length) extents kept for each document.
    def __init__(self, dlcs, legalspans_dlc):
        self._dlcs = dlcs
        self._legalspans_dlc = legalspans_dlc

    def docs_iter(self):
        return iter(self.docs_store())

    def _docs_iter(self):
        lxml_html = ir_datasets.lazy_libs.lxml_html()
        def _legalspans_iter():
            # Yields (doc_id, [(start, length), ...]) groups; the file is
            # assumed to list all spans for a doc contiguously, in the same
            # order that documents appear in the zips.
            with self._legalspans_dlc.stream() as f:
                prev_did, spans = None, None
                for line in codecs.getreader('utf8')(f):
                    doc_id, start_idx, length = line.split()
                    if prev_did != doc_id:
                        if prev_did is not None:
                            yield prev_did, spans
                        prev_did, spans = doc_id, []
                    spans.append((int(start_idx), int(length)))
                yield prev_did, spans
        legalspans_iter = _legalspans_iter()
        for source in SOURCES:
            with ZipFile(self._dlcs[source].path(), 'r') as zipf:
                for record in zipf.filelist:
                    doc_id = record.filename.split('/')[-1].split('.')[0]
                    doc_raw = zipf.open(record, 'r').read()
                    # NOTE(review): if the spans file runs out, next() returns
                    # None here and the unpack raises TypeError before the
                    # assert fires -- presumably never happens in practice.
                    legalspans_did, legalspans = next(legalspans_iter, None)
                    assert legalspans_did == doc_id
                    spans = tuple(HighwireSpan(s, l, doc_raw[s:s+l]) for s, l in legalspans)
                    # the title should be in the first span inside a <h2> element
                    title = lxml_html.document_fromstring(b'<OUTER>' + spans[0].text + b'</OUTER>')
                    title = title.xpath("//h2")
                    title = title[0].text_content() if title else ''
                    # keep just the text content within each spans
                    spans = tuple(HighwireSpan(s, l, lxml_html.document_fromstring(b'<OUTER>' + t + b'</OUTER>').text_content()) for s, l, t in spans)
                    yield HighwireDoc(doc_id, source, title, spans)

    def docs_path(self, force=True):
        return ir_datasets.util.home_path()/NAME/'corpus'

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )

    def docs_cls(self):
        return HighwireDoc

    def docs_namespace(self):
        return NAME

    def docs_count(self):
        # Only known once the docstore has been built (returns None otherwise).
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_lang(self):
        return 'en'


class TrecGenomicsQueries(BaseQueries):
    def __init__(self, queries_dlc):
        self._queries_dlc = queries_dlc

    def queries_iter(self):
        with self._queries_dlc.stream() as f:
            for line in codecs.getreader('cp1252')(f):
                if line.strip() == '':
                    continue
                # fixed-width prefix: chars 1-3 are the topic ID, 5+ the text
                doc_id, text = line[1:4], line[5:].rstrip()
                # replace the bracketed entity-type placeholders with plain text
                text = text.replace('[ANTIBODIES]', 'antibodies').replace('[BIOLOGICAL SUBSTANCES]', 'biological substances').replace('[CELL OR TISSUE TYPES]', 'cell or tissue types').replace('[DISEASES]', 'diseases').replace('[DRUGS]', 'drugs').replace('[GENES]', 'genes').replace('[MOLECULAR FUNCTIONS]', 'molecular functions').replace('[MUTATIONS]', 'mutations').replace('[PATHWAYS]', 'pathways').replace('[PROTEINS]', 'proteins').replace('[SIGNS OR SYMPTOMS]', 'signs or symptoms').replace('[STRAINS]', 'strains').replace('[TOXICITIES]', 'toxicities').replace('[TUMOR TYPES]', 'tumor types')
                yield GenericQuery(doc_id, text)

    def
class HighwireQrels(BaseQrels):
    """
    Qrels for the TREC Genomics 2006/2007 passage-level judgments.

    Each qrel carries a character span (start, length) within the document in
    addition to the usual query/doc/relevance fields. Relevance labels appear
    as strings in the source file and are mapped back to integer grades via
    the qrel_defs mapping supplied at construction time.
    """
    def __init__(self, qrels_dlc, qrel_defs):
        self._qrels_dlc = qrels_dlc
        self._qrel_defs = qrel_defs
    def qrels_iter(self):
        # reverse mapping: label string (e.g., 'RELEVANT') -> integer grade
        rev_devs = dict((v, k) for k, v in self._qrel_defs.items())
        with self._qrels_dlc.stream() as f:
            for line in codecs.getreader('utf8')(f):
                # skip comment lines and blank lines
                if line.startswith('#') or line.strip() == '':
                    continue
                cols = line.split()
                if len(cols) == 6: # 2006 format (includes an extra column, discarded below)
                    query_id, doc_id, start, length, _, rel_str = cols
                elif len(cols) == 5: # 2007 format (five columns; prior '# 2006' label here was a copy-paste error)
                    query_id, doc_id, start, length, rel_str = cols
                else:
                    raise RuntimeError('error parsing file')
                yield HighwireQrel(query_id, doc_id, int(start), int(length), rev_devs[rel_str])
    def qrels_defs(self):
        # mapping of integer grade -> human-readable definition
        return self._qrel_defs
    def qrels_path(self):
        return self._qrels_dlc.path()
    def qrels_cls(self):
        return HighwireQrel
class Istella22Doc(NamedTuple):
    """A document from the Istella22 collection, including its extra crawled text."""
    doc_id: str
    title: str
    url: str
    text: str
    extra_text: str
    lang: str
    lang_pct: int
    def default_text(self):
        """
        title + text + extra_text
        """
        return ' '.join((self.title, self.text, self.extra_text))
def fold_qids_factory(fold, base_dlc):
    """Return a zero-argument callable that loads the query ids for one test fold.

    The callable is deferred (used with Lazy) so the folds archive is only
    extracted when a fold subset is actually accessed.
    """
    def load_fold_qids():
        archive = RelativePath(base_dlc, 'istella22/queries.test.folds.tar.gz')
        qids = []
        with TarExtract(archive, f'./test.queries.{fold}').stream() as stream:
            for raw_line in stream:
                # ids are zero-padded in the fold files; strip the padding so
                # they match the ids used by the queries/qrels
                qids.append(raw_line.decode().strip().lstrip('0'))
        return qids
    return load_fold_qids
def strip_markup(text):
    """Convert KILT's wiki-extraction markers into plain text.

    'Section::::' heading markers are dropped and the remaining colon
    separators become spaces; 'BULLET::::-' list markers become a plain dash.
    Any other text is returned unchanged.
    """
    if text.startswith('BULLET::::-'):
        return text.replace('BULLET::::-', '-')
    if not text.startswith('Section::::'):
        return text
    without_marker = text.replace('Section::::', '')
    return without_marker.replace(':', ' ')
class LotteQrels(BaseQrels):
    """
    Qrels adapter for LoTTE's qas.*.jsonl files.

    Each JSON record lists a question id and the passage ids of its upvoted or
    accepted answers; every such pairing is emitted as a relevance-1 qrel.
    """
    def __init__(self, qrels_dlc):
        self._qrels_dlc = qrels_dlc
    def qrels_iter(self):
        with self._qrels_dlc.stream() as stream:
            for raw_line in stream:
                record = json.loads(raw_line)
                query_id = str(record['qid'])
                for answer_pid in record['answer_pids']:
                    yield TrecQrel(query_id, str(answer_pid), 1, "0")
    def qrels_cls(self):
        return TrecQrel
    def qrels_defs(self):
        return QRELS_DEFS
    def qrels_path(self):
        return self._qrels_dlc.path()
class MedlineDoc(NamedTuple):
    """A MEDLINE citation: PMID plus its title and abstract text."""
    doc_id: str
    title: str
    abstract: str
    def default_text(self):
        """
        title + abstract
        """
        return ' '.join((self.title, self.abstract))
class ConcatFile:
    """
    Read-only file-like object that presents a sequence of file-like objects
    as if they had been concatenated (like ``cat``). Only ``read`` is
    supported.

    Fixes over the prior version: accepts any iterable of files (not only an
    iterator), and no longer raises StopIteration when the sequence is empty
    -- an empty sequence simply behaves as an already-exhausted file.
    """
    def __init__(self, files):
        self.file_iter = iter(files)
        # Advance to the first file; None signals end-of-stream (this also
        # cleanly handles an empty sequence of files).
        self.file = next(self.file_iter, None)
    def read(self, count=None):
        """Read up to ``count`` bytes (or all remaining bytes of the current
        file if ``count`` is None), transparently advancing to the next file
        when the current one is exhausted. Returns b'' at end-of-stream."""
        result = b''
        while not result and self.file is not None:
            result = self.file.read(count)
            if not result:
                self.file = next(self.file_iter, None)
        return result
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # Parses the AACR/ASCO "extra" abstracts: a .tar.gz archive containing
        # one plain-text file per document. Each file is laid out as:
        #   <meeting line>
        #   Title: <title, possibly wrapped over multiple lines>
        #   <blank line>
        #   <abstract body>
        # NOTE(review): layout inferred from the parsing below -- confirm
        # against the source archive.
        with self._dlc.stream() as f, tarfile.open(fileobj=f, mode=f'r|gz') as tarf:
            for file in tarf:
                if not file.isfile():
                    continue  # skip directory entries in the archive
                file_reader = tarf.extractfile(file)
                file_reader = codecs.getreader('utf8')(file_reader)
                # doc_id is the file's base name without its extension
                doc_id = file.name.split('/')[-1].split('.')[0]
                meeting = next(file_reader)  # first line (meeting name) is read and discarded
                # Accumulate lines until the blank line that terminates the title block
                title = ''
                for line in file_reader:
                    title += line
                    if title.endswith('\n\n'):
                        break
                assert title.startswith('Title:')
                title = title[len('Title:'):].strip()
                # Everything after the title block is the abstract
                abstract = file_reader.read().strip()
                yield MedlineDoc(doc_id, title, abstract)
class ConcatDocs(BaseDocs):
    """
    Exposes several docs collections as a single, concatenated collection.

    Iteration yields every document of each source collection, in order.
    Ancillary attributes (doc class, namespace, language, base path) are
    delegated to the first source collection.
    """
    def __init__(self, docs, count_hint=None):
        self._docs = docs
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # Fix: a previous, un-decorated docs_iter (returning
        # iter(self.docs_store())) was dead code -- it was immediately
        # shadowed by this definition. It has been removed; use_docstore
        # already serves iteration from the docstore once built.
        for docs in self._docs:
            yield from docs.docs_iter()
    def docs_path(self, force=True):
        return f'{self._docs[0].docs_path(force)}.concat'
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )
    def docs_cls(self):
        return self._docs[0].docs_cls()
    def docs_namespace(self):
        return self._docs[0].docs_namespace()
    def docs_lang(self):
        return self._docs[0].docs_lang()
    def docs_count(self):
        # Count is only reported once the docstore has been materialized.
        if self.docs_store().built():
            return self.docs_store().count()
TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics', lang='en'), TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS), documentation('trec-genomics-2004'), ) subsets['2004/trec-genomics-2005'] = Dataset( collection04, TrecGenomicsQueries(dlc['trec-genomics-2005/queries']), TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS), documentation('trec-genomics-2005'), ) collection17 = ConcatDocs([ AacrAscoDocs(dlc['2017/aacr_asco_extra']), MedlineDocs('2017', [dlc['2017/part1'], dlc['2017/part2'], dlc['2017/part3'], dlc['2017/part4'], dlc['2017/part5']]), ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017')) subsets['2017'] = Dataset(collection17, documentation('2017')) subsets['2017/trec-pm-2017'] = Dataset( collection17, TrecXmlQueries(dlc['trec-pm-2017/queries'], qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en'), TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS), documentation('trec-pm-2017'), ) subsets['2017/trec-pm-2018'] = Dataset( collection17, TrecXmlQueries(dlc['trec-pm-2018/queries'], qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en'), TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS), documentation('trec-pm-2018'), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/miracl.py ================================================ import ir_datasets from typing import NamedTuple from ir_datasets.util import DownloadConfig, GzipExtract from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.formats import JsonlDocs, TsvQueries, TrecQrels, TrecScoredDocs NAME = 'miracl' _logger = ir_datasets.log.easy() QRELS_DEFS = { 0: 'Not Relevant', 1: 'Relevant', } class MiraclDoc(NamedTuple): doc_id: str title: str text: str def 
    def default_text(self):
        """
        title + text
        """
        return f'{self.title} {self.text}'
subsets[f'{lang}/test-a'] = Dataset( collection, TsvQueries(dlc[f'v1.0/{lang}/test-a/topics'], namespace=f'{NAME}/{lang}', lang=lang), documentation(f'{lang}/test-a')) if 'test-b' in topic_sets: subsets[f'{lang}/test-b'] = Dataset( collection, TsvQueries(dlc[f'v1.0/{lang}/test-b/topics'], namespace=f'{NAME}/{lang}', lang=lang), documentation(f'{lang}/test-b')) ir_datasets.registry.register(NAME, Dataset(documentation('_'))) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return collection, subsets collection, subsets = _init() ================================================ FILE: ir_datasets/datasets/mmarco.py ================================================ import io import codecs import re import ir_datasets from ir_datasets.util import DownloadConfig, Lazy from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries from ir_datasets.datasets import msmarco_passage from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TsvDocPairs, TrecScoredDocs NAME = 'mmarco' _logger = ir_datasets.log.easy() QRELS_DEFS = { 1: 'Labeled by crowd worker as relevant' } def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) subsets = {} train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler() train_docparis = TsvDocPairs(dlc['train/triples']) dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS) dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS) small_dev_qids = Lazy(lambda: {q.query_id for q in dev_small_qrels.qrels_iter()}) for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']: collection = TsvDocs(dlc[f'{lang}/docs'], namespace=f'mmarco/{lang}', lang=lang, count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}')) subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}')) subsets[f'{lang}/train'] = Dataset( collection, TsvQueries(dlc[f'{lang}/queries/train'], 
namespace=f'mmarco/{lang}', lang=lang), train_qrels, train_docparis, documentation(f'{lang}/train')) subsets[f'{lang}/dev'] = Dataset( collection, TsvQueries(dlc[f'{lang}/queries/dev'], namespace=f'mmarco/{lang}', lang=lang), dev_qrels, documentation(f'{lang}/dev')) subsets[f'{lang}/dev/small'] = Dataset( collection, FilteredQueries(subsets[f'{lang}/dev'].queries_handler(), small_dev_qids, mode='include'), dev_small_qrels, TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev']) if lang not in ('zh', 'pt') else None, documentation(f'{lang}/dev/small')) if lang in ('zh', 'pt'): subsets[f'{lang}/dev/v1.1'] = Dataset( collection, TsvQueries(dlc[f'{lang}/queries/dev/v1.1'], namespace=f'mmarco/{lang}', lang=lang), dev_qrels, documentation(f'{lang}/dev/v1.1')) subsets[f'{lang}/dev/small/v1.1'] = Dataset( collection, FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(), small_dev_qids, mode='include'), dev_small_qrels, TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']), documentation(f'{lang}/dev/v1.1')) if lang in ('pt',): subsets[f'{lang}/train/v1.1'] = Dataset( collection, TsvQueries(dlc[f'{lang}/queries/train/v1.1'], namespace=f'mmarco/{lang}', lang=lang), train_qrels, train_docparis, documentation(f'{lang}/train/v1.1')) for lang in ['ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru', 'es', 'vi']: collection = TsvDocs(dlc[f'v2/{lang}/docs'], namespace=f'mmarco/{lang}', lang=lang, count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}')) subsets[f'v2/{lang}'] = Dataset(collection, documentation(f'v2/{lang}')) subsets[f'v2/{lang}/train'] = Dataset( collection, TsvQueries(dlc[f'v2/{lang}/queries/train'], namespace=f'mmarco/v2/{lang}', lang=lang), train_qrels, train_docparis, documentation(f'v2/{lang}/train')) subsets[f'v2/{lang}/dev'] = Dataset( collection, TsvQueries(dlc[f'v2/{lang}/queries/dev'], namespace=f'v2/mmarco/{lang}', lang=lang), dev_qrels, documentation(f'v2/{lang}/dev')) subsets[f'v2/{lang}/dev/small'] = Dataset( collection, 
class MrTydiDocs(BaseDocs):
    """
    Mr. TyDi per-language document collection: JSONL records with 'id' and
    'contents' fields, served as GenericDoc tuples and backed by a docstore.
    """
    def __init__(self, dlc, lang, count_hint=None):
        super().__init__()
        self._dlc = dlc
        self._lang = lang
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        with self._dlc.stream() as stream:
            for raw_line in stream:
                record = json.loads(raw_line)
                yield GenericDoc(record['id'], record['contents'])
    def docs_cls(self):
        return GenericDoc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        store_path = f'{ir_datasets.util.home_path()/NAME/self._lang}.pklz4'
        return PickleLz4FullStore(
            path=store_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options,
        )
    def docs_count(self):
        # Only report a count once the docstore has been materialized.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return f'{NAME}/{self._lang}'
    def docs_lang(self):
        return self._lang
ir_datasets.util.home_path()/NAME dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') base = Dataset(documentation('_')) subsets = {} langs = { 'ar': 'mrtydi-v1.0-arabic', 'bn': 'mrtydi-v1.0-bengali', 'en': 'mrtydi-v1.0-english', 'fi': 'mrtydi-v1.0-finnish', 'id': 'mrtydi-v1.0-indonesian', 'ja': 'mrtydi-v1.0-japanese', 'ko': 'mrtydi-v1.0-korean', 'ru': 'mrtydi-v1.0-russian', 'sw': 'mrtydi-v1.0-swahili', 'te': 'mrtydi-v1.0-telugu', 'th': 'mrtydi-v1.0-thai', } migrator = Migrator(base_path/'irds_version.txt', 'v2', affected_files=[base_path/lang for lang in langs], message='Migrating mr-tydi (restructuring directory)') for lang, file_name in langs.items(): dlc_ds = TarExtractAll(dlc[lang], f'{base_path/lang}.data') docs = MrTydiDocs(GzipExtract(RelativePath(dlc_ds, f'{file_name}/collection/docs.jsonl.gz')), lang, count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}')) docs = migrator(docs) subsets[lang] = Dataset( docs, TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.tsv'), lang=lang), TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.txt'), QREL_DEFS), documentation(lang) ) subsets[f'{lang}/train'] = Dataset( docs, TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.train.tsv'), lang=lang), TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.train.txt'), QREL_DEFS), documentation(f'{lang}/train') ) subsets[f'{lang}/dev'] = Dataset( docs, TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.dev.tsv'), lang=lang), TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.dev.txt'), QREL_DEFS), documentation(f'{lang}/dev') ) subsets[f'{lang}/test'] = Dataset( docs, TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.test.tsv'), lang=lang), TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.test.txt'), QREL_DEFS), documentation(f'{lang}/test') ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = 
class MsMarcoDocument(NamedTuple):
    """An MS MARCO web document: its URL plus the extracted title and body."""
    doc_id: str
    url: str
    title: str
    body: str
    def default_text(self):
        """
        title + body
        """
        return ' '.join((self.title, self.body))
Do a little more reformatting: # The first two lines are the URL and page title url, title, *body = doc.text.lstrip('\n').split('\n', 2) body = body[0] if body else '' yield MsMarcoDocument(doc.doc_id, url, title, body) def docs_cls(self): return MsMarcoDocument def docs_namespace(self): return NAME class MsMarcoAnchorTextDocument(NamedTuple): doc_id: str text: str anchors: List[str] def default_text(self): """ text + anchors """ return f'{self.text} ' + ' '.join(self.anchors) class MsMarcoAnchorTextDocs(BaseDocs): def __init__(self, dlc, count_hint): super().__init__() self._dlc = dlc self._count_hint = count_hint @ir_datasets.util.use_docstore def docs_iter(self): with self._dlc.stream() as stream: for line in stream: data = json.loads(line) yield MsMarcoAnchorTextDocument(data['id'], ' '.join(data['anchors']), data['anchors']) def docs_cls(self): return MsMarcoAnchorTextDocument def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f'{ir_datasets.util.home_path()}/{NAME}/anchor-text.pklz4', init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], count_hint=self._count_hint, options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return f'{NAME}/anchor-text' def docs_lang(self): return 'en' def _init(): base_path = ir_datasets.util.home_path()/NAME documentation = YamlDocumentation(f'docs/{NAME}.yaml') dlc = DownloadConfig.context(NAME, base_path, dua=DUA) subsets = {} collection = MsMarcoTrecDocs(GzipExtract(dlc['docs'])) subsets['train'] = Dataset( collection, TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'), TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS), TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])), ) subsets['dev'] = Dataset( collection, TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'), 
TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS), TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])), ) subsets['eval'] = Dataset( collection, TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'), TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])), ) subsets['trec-dl-2019'] = Dataset( collection, TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'), TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS), TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])), ) subsets['trec-dl-2020'] = Dataset( collection, TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'), TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS), TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])), ) subsets['orcas'] = Dataset( collection, TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'), TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS), TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])), ) dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()}) subsets['trec-dl-2019/judged'] = Dataset( FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged), FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged), subsets['trec-dl-2019'], ) dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()}) subsets['trec-dl-2020/judged'] = Dataset( FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged), FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged), subsets['trec-dl-2020'], ) # DL-Hard dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2', affected_files=[base_path/'trec-dl-hard'/'qrels'], message='Updating trec-dl-hard qrels') hard_qids = Lazy(lambda: DL_HARD_QIDS) dl_hard_base_queries = TsvQueries([ Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'), 
Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en') subsets['trec-dl-hard'] = Dataset( collection, FilteredQueries(dl_hard_base_queries, hard_qids), dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)), documentation('trec-dl-hard') ) hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1']) subsets['trec-dl-hard/fold1'] = Dataset( collection, FilteredQueries(dl_hard_base_queries, hard_qids), FilteredQrels(subsets['trec-dl-hard'], hard_qids), documentation('trec-dl-hard/fold1') ) hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2']) subsets['trec-dl-hard/fold2'] = Dataset( collection, FilteredQueries(dl_hard_base_queries, hard_qids), FilteredQrels(subsets['trec-dl-hard'], hard_qids), documentation('trec-dl-hard/fold2') ) hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3']) subsets['trec-dl-hard/fold3'] = Dataset( collection, FilteredQueries(dl_hard_base_queries, hard_qids), FilteredQrels(subsets['trec-dl-hard'], hard_qids), documentation('trec-dl-hard/fold3') ) hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4']) subsets['trec-dl-hard/fold4'] = Dataset( collection, FilteredQueries(dl_hard_base_queries, hard_qids), FilteredQrels(subsets['trec-dl-hard'], hard_qids), documentation('trec-dl-hard/fold4') ) hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5']) subsets['trec-dl-hard/fold5'] = Dataset( collection, FilteredQueries(dl_hard_base_queries, hard_qids), FilteredQrels(subsets['trec-dl-hard'], hard_qids), documentation('trec-dl-hard/fold5') ) subsets['anchor-text'] = Dataset( MsMarcoAnchorTextDocs( Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"), count_hint=1703834 ), documentation('anchor-text') ) ir_datasets.registry.register(NAME, Dataset(collection, documentation("_"))) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s))) return collection, subsets collection, subsets = _init() 
================================================ FILE: ir_datasets/datasets/msmarco_document_v2.py ================================================
import contextlib
import gzip
import io
from pathlib import Path
import json
from typing import NamedTuple, Tuple, List
import tarfile
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
from ir_datasets.datasets.msmarco_document import TREC_DL_QRELS_DEFS

_logger = ir_datasets.log.easy()

# Dataset identifier; also the directory name under the ir_datasets home path.
NAME = 'msmarco-document-v2'

# Binary relevance labels for the sparse v2 qrels.
QRELS_DEFS = {
    1: 'Document contains a passage labeled as relevant in msmarco-passage'
}


class MsMarcoV2Document(NamedTuple):
    """A single document from the MS MARCO document v2 collection."""
    doc_id: str
    url: str
    title: str
    headings: str
    body: str
    def default_text(self):
        """ title + headings + body """
        return f'{self.title} {self.headings} {self.body}'


class MsMarcoV2Docs(BaseDocs):
    """Docs handler that streams JSONL records out of the .gz members of the source tar."""
    def __init__(self, dlc):
        super().__init__()
        self._dlc = dlc

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # mode='r|' streams the tar sequentially without seeking, so it works on a pipe.
        with self._dlc.stream() as stream, \
             tarfile.open(fileobj=stream, mode='r|') as tarf:
            for record in tarf:
                if not record.name.endswith('.gz'):
                    continue
                file = tarf.extractfile(record)
                with gzip.open(file) as file:
                    for line in file:
                        data = json.loads(line)
                        yield MsMarcoV2Document(
                            data['docid'],
                            data['url'],
                            data['title'],
                            data['headings'],
                            data['body'])

    def docs_cls(self):
        return MsMarcoV2Document

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        # NOTE: the MS MARCO v2 documents have this really neat quality that they contain the offset
        # position in the source file: <https://microsoft.github.io/msmarco/TREC-Deep-Learning.html>.
        # Unfortunately, it points to the position in the *uncompressed* file, so for this to work, we'd
        # need to decompress the source files, inflating the size ~3.3x. The options would be to:
        #  1) Always de-compress the source files, costing everybody ~3.3x the storage. Ouch.
        #  2) De-compress the source files the first time that the docstore is requested. This would
        #     only cost the users who use the docstore 3.3x, but increases the complexity of the
        #     iteration code to handle both compressed and non-compressed versions. Would also need code
        #     to handle stuff like fancy slicing, which wouldn't be trivial. Would we also keep
        #     the original source file around? If so, it actually ends up being 4.3x.
        #  3) Build a PickleLz4FullStore on demand, as normal. This would only cost the users who use
        #     the docstore ~2.7x (accounting for worse lz4 compression rate and keeping around original
        #     copy of the data), but is also slightly slower because of the O(log n) position lookups and
        #     decompression. (This may be offset because pickle parsing is faster than json though.)
        #     It also reduces the complexity of the code, as it does not require a new docstore
        #     implementation for this dataset, and is just doing the normal procedure.
        return PickleLz4FullStore(
            path=f'{self._dlc.path(force=False)}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            key_field_prefix='msmarco_doc_', # cut down on storage by removing prefix in lookup structure
            size_hint=66500029281,
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )

    def docs_count(self):
        # Only known once the docstore has been built; returns None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'


class MsMarcoV2AnchorTextDocument(NamedTuple):
    """A document's anchor text: the concatenated text plus the individual anchors."""
    doc_id: str
    text: str
    anchors: List[str]
    def default_text(self):
        """ text + anchors """
        return f'{self.text} ' + ' '.join(self.anchors)


class MsMarcoV2AnchorTextDocs(BaseDocs):
    """Anchor-text corpus parsed from a JSONL source (one {'id', 'anchors'} object per line)."""
    def __init__(self, dlc, count_hint):
        super().__init__()
        self._dlc = dlc
        self._count_hint = count_hint

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        with self._dlc.stream() as stream:
            for line in stream:
                data = json.loads(line)
                yield MsMarcoV2AnchorTextDocument(data['id'], ' '.join(data['anchors']), data['anchors'])

    def docs_cls(self):
        return MsMarcoV2AnchorTextDocument

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()}/{NAME}/anchor-text.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )

    def docs_count(self):
        # Only known once the docstore has been built; returns None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return f'{NAME}/anchor-text'

    def docs_lang(self):
        return 'en'


def _init():
    """Build the shared collection and all subsets, and register them with ir_datasets."""
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])
    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    # 2019/2020 queries are shared with v1; the qrels here are the v2 re-judgments.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    # "/judged" variants restrict queries to the query_ids that appear in the qrels.
    dl19_v2_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )
    subsets['trec-dl-2022'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2022/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2022/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2022/scoreddocs'])),
    )
    dl22_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2022'].qrels_iter()})
    subsets['trec-dl-2022/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2022'].queries_handler(), dl22_judged),
        FilteredScoredDocs(subsets['trec-dl-2022'].scoreddocs_handler(), dl22_judged),
        subsets['trec-dl-2022'],
    )
    subsets['trec-dl-2023'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2023/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2023/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2023/scoreddocs'])),
    )
    dl23_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2023'].qrels_iter()})
    subsets['trec-dl-2023/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2023'].queries_handler(), dl23_judged),
        FilteredScoredDocs(subsets['trec-dl-2023'].scoreddocs_handler(), dl23_judged),
        subsets['trec-dl-2023'],
    )
    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=4821244
        ),
        documentation('anchor-text')
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets


collection, subsets = _init()
================================================ FILE: ir_datasets/datasets/msmarco_passage.py ================================================
import hashlib
import io
import codecs
import re
import ir_datasets
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig, Migrator
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TrecScoredDocs, TsvDocPairs

_logger = ir_datasets.log.easy()

# Dataset identifier; also the directory name under the ir_datasets home path.
NAME
= 'msmarco-passage'

# Data usage agreement shown to users before downloading.
DUA = ("Please confirm you agree to the MSMARCO data usage agreement found at "
       "<http://www.msmarco.org/dataset.aspx>")

# Binary relevance labels for the sparse MS MARCO passage qrels.
QRELS_DEFS = {
    1: 'Labeled by crowd worker as relevant'
}

# Graded (0-3) relevance labels used by the TREC Deep Learning qrels.
TREC_DL_QRELS_DEFS = {
    3: "Perfectly relevant: The passage is dedicated to the query and contains the exact answer.",
    2: "Highly relevant: The passage has some answer for the query, but the answer may be a bit "
       "unclear, or hidden amongst extraneous information.",
    1: "Related: The passage seems related to the query but does not answer it.",
    0: "Irrelevant: The passage has nothing to do with the query.",
}

# 200 training query_ids held out for validation (see train/split200-train and train/split200-valid).
SPLIT200_QIDS = {'484694', '836399', '683975', '428803', '1035062', '723895', '267447', '325379', '582244', '148817', '44209',
    '1180950', '424238', '683835', '701002', '1076878', '289809', '161771', '807419', '530982', '600298', '33974', '673484',
    '1039805', '610697', '465983', '171424', '1143723', '811440', '230149', '23861', '96621', '266814', '48946', '906755',
    '1142254', '813639', '302427', '1183962', '889417', '252956', '245327', '822507', '627304', '835624', '1147010', '818560',
    '1054229', '598875', '725206', '811871', '454136', '47069', '390042', '982640', '1174500', '816213', '1011280', '368335',
    '674542', '839790', '270629', '777692', '906062', '543764', '829102', '417947', '318166', '84031', '45682', '1160562',
    '626816', '181315', '451331', '337653', '156190', '365221', '117722', '908661', '611484', '144656', '728947', '350999',
    '812153', '149680', '648435', '274580', '867810', '101999', '890661', '17316', '763438', '685333', '210018', '600923',
    '1143316', '445800', '951737', '1155651', '304696', '958626', '1043094', '798480', '548097', '828870', '241538', '337392',
    '594253', '1047678', '237264', '538851', '126690', '979598', '707766', '1160366', '123055', '499590', '866943', '18892',
    '93927', '456604', '560884', '370753', '424562', '912736', '155244', '797512', '584995', '540814', '200926', '286184',
    '905213', '380420', '81305', '749773', '850038', '942745', '68689', '823104', '723061', '107110', '951412', '1157093',
    '218549', '929871', '728549', '30937', '910837', '622378', '1150980', '806991', '247142', '55840', '37575', '99395',
    '231236', '409162', '629357', '1158250', '686443', '1017755', '1024864', '1185054', '1170117', '267344', '971695',
    '503706', '981588', '709783', '147180', '309550', '315643', '836817', '14509', '56157', '490796', '743569', '695967',
    '1169364', '113187', '293255', '859268', '782494', '381815', '865665', '791137', '105299', '737381', '479590', '1162915',
    '655989', '292309', '948017', '1183237', '542489', '933450', '782052', '45084', '377501', '708154'}

# from <https://github.com/grill-lab/DL-Hard/blob/main/dataset/folds.json> on 30 April 2021
DL_HARD_QIDS_BYFOLD = {
    "1": {'915593', '451602', '966413', '1056204', '182539', '655914', '67316', '883915', '1049519', '174463'},
    "2": {'794429', '588587', '1114646', '537817', '1065636', '144862', '443396', '332593', '1103812', '19335'},
    "3": {'177604', '1108939', '264403', '86606', '1133485', '1117817', '705609', '315637', '673670', '1105792'},
    "4": {'801118', '507445', '87452', '88495', '554515', '166046', '730539', '1108100', '1109707', '1056416'},
    "5": {'190044', '527433', '489204', '877809', '1106007', '47923', '1136769', '1112341', '1103153', '273695'},
}
# All DL-Hard query_ids (union of the five folds above).
DL_HARD_QIDS = set.union(*DL_HARD_QIDS_BYFOLD.values())


# Converts "top1000" MS run files "QID DID QText DText" to "QID DID" to remove tons of redundant
# storage with query and document files.
class ExtractQidPid:
    """Streaming filter: keeps only the first two tab-separated fields of each line."""
    def __init__(self, streamer):
        self._streamer = streamer

    def stream(self):
        # Expose the filtered output as a buffered binary stream, like other streamers.
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)

    def __iter__(self):
        with self._streamer.stream() as stream:
            for line in _logger.pbar(stream, desc='extracting QID/PID pairs', unit='pair'):
                qid, did, _, _ = line.split(b'\t')
                yield qid + b'\t' + did + b'\n'


# The encoding of the MS MARCO passage collection is... weird...
# Some characters are properly utf8-encoded, while others are not, even within the same passage. # So, thi cutom-built streaming class aims to fix that. What it does is finds "suspicious" # characters, basically anything in the 128-255 range. Once found, it will pick 2-4 characters # around it and try to encode them as latin-1 and decode them at utf8. class FixEncoding: def __init__(self, streamer): self._streamer = streamer def stream(self): return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE) def __iter__(self): SUS = '[\x80-\xff]' # Find sequences of up to 4 characters that contain a suspicious character. # We'll attempt to interpret these as latin1 characters and then decode them back to UTF8. # With this technique, we get 100% matches with MS MARCO QnA passages (which do not have this encoding issue) # This approach is more than twice as fast as using ftfy regexes = [ re.compile(f'(...{SUS}|..{SUS}.|.{SUS}..|{SUS}...)'), re.compile(f'(..{SUS}|.{SUS}.|{SUS}..)'), re.compile(f'(.{SUS}|{SUS}.)'), ] with self._streamer.stream() as stream, \ _logger.pbar_raw(desc='fixing encoding', unit='B', unit_scale=True) as pbar: # NOTE: codecs.getreader is subtly broken here; it sometimes splits lines between special characters (and it's unclear why) for line in stream: pbar.update(len(line)) line = line.decode('utf8') for regex in regexes: pos = 0 while pos < len(line): match = regex.search(line, pos=pos) if not match: break try: fixed = match.group().encode('latin1').decode('utf8') if len(fixed) == 1: line = line[:match.start()] + fixed + line[match.end():] except UnicodeError: pass pos = match.start() + 1 yield line.encode() # Converts "small triples" MS files to "qid/pos_did/neg_did" format to remove tons of redundant storage. 
class MapSmallTriplesQidPid:
    """Streaming filter that maps the text-based "small triples" file to ID-based triples.

    The source file repeats full query and passage text; this class hashes the text and
    looks the hashes up in maps built from the corpus and queries to recover the IDs.
    """
    def __init__(self, streamer, corpus_stream, queries_handler):
        self._streamer = streamer
        self._corpus_stream = corpus_stream
        # note: must use raw topics here because this file also includes the broken text found in the corpus file
        self._queries_handler = queries_handler

    def stream(self):
        # Expose the mapped output as a buffered binary stream, like other streamers.
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)

    def __iter__(self):
        # Strangely, in this file, the query text is mangled, even though the query source file isn't.
        # So we need to apply the encoding fix that's normally applied to the docs to the queries here.
        SUS = '[\x80-\xff]'
        regexes = [
            re.compile(f'(...{SUS}|..{SUS}.|.{SUS}..|{SUS}...)'),
            re.compile(f'(..{SUS}|.{SUS}.|{SUS}..)'),
            re.compile(f'(.{SUS}|{SUS}.)'),
        ]
        # Step 1: hash of passage text -> int doc_id.
        passagehash_did_map = {}
        with self._corpus_stream.stream() as fin:
            for line in _logger.pbar(fin, desc='build d text lookup (step 1 of 3)', total=8841823):
                did, contents = line.rstrip(b'\n').split(b'\t')
                content_hash = hashlib.md5(contents).digest()[:7] # 7 byte version results in no collisions & reduces memory
                assert content_hash not in passagehash_did_map
                passagehash_did_map[bytes(content_hash)] = int(did) # int did reduces memory
        # Step 2: hash of query text -> int query_id.
        queryhash_qid_map = {}
        for query in _logger.pbar(self._queries_handler.queries_iter(), desc='build q text lookup (step 2 of 3)', total=808731):
            query_hash = hashlib.md5(query.text.encode()).digest()[:6] # 6 byte version results in no collisions & reduces memory
            assert query_hash not in queryhash_qid_map
            queryhash_qid_map[bytes(query_hash)] = int(query.query_id) # int qid reduces memory
        # Step 3: re-hash each triple's texts (after encoding repair on the query) and emit IDs.
        with self._streamer.stream() as fin:
            for line in _logger.pbar(fin, desc='map d/q text to IDs (step 3 of 3)', total=39780811):
                query, doc1, doc2 = line.rstrip(b'\n').split(b'\t')
                query = query.decode('utf8')
                for regex in regexes:
                    pos = 0
                    while pos < len(query):
                        match = regex.search(query, pos=pos)
                        if not match:
                            break
                        try:
                            fixed = match.group().encode('latin1').decode('utf8')
                            if len(fixed) == 1:
                                query = query[:match.start()] + fixed + query[match.end():]
                        except UnicodeError:
                            pass
                        pos = match.start() + 1
                query_hash = hashlib.md5(query.encode()).digest()[:6]
                doc1_hash = hashlib.md5(doc1).digest()[:7]
                doc2_hash = hashlib.md5(doc2).digest()[:7]
                yield f'{queryhash_qid_map[query_hash]}\t{passagehash_did_map[doc1_hash]}\t{passagehash_did_map[doc2_hash]}\n'.encode()


def _init():
    """Build the shared collection and all subsets, and register them with ir_datasets."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    # v2 migration re-extracts the collection with the encoding fix applied.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'collection.tsv', base_path/'collection.tsv.pklz4'],
        message=f'Migrating {NAME} (fixing passage encoding)')
    collection = TsvDocs(Cache(FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')), base_path/'collection.tsv'), namespace='msmarco', lang='en', docstore_size_hint=14373971970, count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path/'train/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')), base_path/'train/ms.run')),
    )
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(Cache(MapSmallTriplesQidPid(TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'), TarExtract(dlc['collectionandqueries'], 'collection.tsv'), subsets['train'].queries_handler()), base_path/'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path/'dev/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'), base_path/'dev/small/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'), base_path/'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')), base_path/'dev/ms.run')),
    )
    # eval has no qrels (held-out leaderboard split).
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path/'eval/queries.tsv'), namespace='msmarco', lang='en'),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'), base_path/'eval/small/queries.tsv'), namespace='msmarco', lang='en'),
        TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')), base_path/'eval/ms.run')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'), namespace='msmarco', lang='en'),
        TrecScoredDocs(Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])), base_path/'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])), base_path/'trec-dl-2020/ms.run')),
    )
    # A few subsets that are constrained to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )
    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
        subsets['train'],
    )
    # dev/2 re-uses the v2 dev2 query_ids to carve a second dev split out of v1's dev.
    dev2_qids = Lazy(lambda: {q.query_id for q in ir_datasets.load('msmarco-passage-v2/dev2').queries})
    subsets['dev/2'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev2_qids),
        FilteredQrels(subsets['dev'].qrels_handler(), dev2_qids),
        subsets['dev'],
    )
    # Medical subset
    def train_med():
        # Reads the MedMARCO query_id list (one id per line).
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )
    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v3',
        affected_files=[base_path/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard queries are drawn from the union of the 2019 and 2020 TREC DL query files.
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')],
        namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )
    # Five query folds, per the DL-Hard paper's released splits.
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5')
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets


collection, subsets = _init()
================================================ FILE: ir_datasets/datasets/msmarco_passage_v2.py ================================================
import re
import os
import contextlib
import gzip
import io
from pathlib import Path
import json
from typing import NamedTuple, Tuple
import tarfile
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS, FileAccess
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS, TREC_DL_QRELS_DEFS

# Binary relevance labels for the sparse v2 passage qrels.
QRELS_DEFS = {
    1: 'Based on mapping from v1 of MS MARCO'
}

_logger = ir_datasets.log.easy()

# Dataset identifier; also the directory name under the ir_datasets home path.
NAME = 'msmarco-passage-v2'


class MsMarcoV2Passage(NamedTuple):
    """A single passage from the MS MARCO passage v2 collection."""
    doc_id: str
    text: str
    spans: Tuple[Tuple[int, int], ...]
msmarco_document_id: str def default_text(self): """ text """ return self.text def parse_msmarco_passage(line): data = json.loads(line) # extract spans in the format of "(123,456),(789,101123)" spans = tuple((int(a), int(b)) for a, b in re.findall(r'\((\d+),(\d+)\)', data['spans'])) return MsMarcoV2Passage( data['pid'], data['passage'], spans, data['docid']) class MsMarcoV2Passages(BaseDocs): def __init__(self, dlc, pos_dlc=None): super().__init__() self._dlc = dlc self._pos_dlc = pos_dlc @ir_datasets.util.use_docstore def docs_iter(self): if self._pos_dlc is not None: # the shortcut only applies if the default pos # files are used (i.e., no filtering is applied) yield from self.docs_store() else: with self._dlc.stream() as stream, \ tarfile.open(fileobj=stream, mode='r|') as tarf: for record in tarf: if not record.name.endswith('.gz'): continue file = tarf.extractfile(record) with gzip.open(file) as file: for line in file: yield parse_msmarco_passage(line) def docs_cls(self): return MsMarcoV2Passage def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): assert field == 'doc_id' # Unlike for msmarco-document-v2, using the docstore actually hurts performance. 
class MsMarcoV2DocStore(ir_datasets.indices.Docstore):
    """Docstore over the MS MARCO v2 passage bundles.

    build() extracts the 70 gzipped bundle files from the source tar into
    `<docs_path>.extracted/` alongside a `.pos` file of little-endian uint32
    byte offsets per bundle. Lookups then decode a doc_id of the form
    "msmarco_passage_<bundle>_<offset>" and seek directly to that offset.
    """
    def __init__(self, docs_handler, options=DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(docs_handler.docs_cls(), 'doc_id', options=options)
        self.np = ir_datasets.lazy_libs.numpy()  # numpy loaded lazily to keep import cheap
        self.docs_handler = docs_handler
        self.dlc = docs_handler._dlc
        self.pos_dlc = docs_handler._pos_dlc  # optional external .pos files (dedup subset); None for full collection
        self.base_path = docs_handler.docs_path(force=False) + '.extracted'
        if not os.path.exists(self.base_path):
            os.makedirs(self.base_path)
        # disk space required for the extracted bundles -- presumably measured from a full extraction; TODO confirm
        self.size_hint = 60880127751
        if options.file_access != FileAccess.FILE:
            _logger.warning(f"MsMarcoV2 passage only allows FILE access (requested {options.file_access})")

    def get_many_iter(self, keys):
        """Yield passages for the given doc_ids (order not guaranteed; invalid IDs skipped)."""
        self.build()
        # adapted from <https://microsoft.github.io/msmarco/TREC-Deep-Learning.html>
        # group requested byte offsets by bundle so each bundle file is opened once
        bundles = {}
        for key in keys:
            if not key.count('_') == 3:
                continue  # malformed doc_id
            (string1, string2, bundlenum, position) = key.split('_')
            assert string1 == 'msmarco' and string2 == 'passage'
            if bundlenum not in bundles:
                bundles[bundlenum] = []
            bundles[bundlenum].append(int(position))
        for bundlenum, positions in bundles.items():
            positions = sorted(positions)  # sequential seeks within the file
            file = f'{self.base_path}/msmarco_passage_{bundlenum}'
            if not os.path.exists(file):
                # invalid doc_id -- doesn't point to a real bundle
                continue
            if self.docs_handler._pos_dlc is not None:
                # check the positions are valid for these doc_ids -- only return valid ones
                mmp = self.np.memmap(os.path.join(self.pos_dlc.path(), f'msmarco_passage_{bundlenum}.pos'), dtype='<u4')
                positions = self.np.array(positions, dtype='<u4')
                positions = positions[self.np.isin(positions, mmp)].tolist()
                del mmp
            with open(file, 'rt', encoding='utf8') as in_fh:
                for position in positions:
                    in_fh.seek(position)
                    try:
                        yield parse_msmarco_passage(in_fh.readline())
                    except json.JSONDecodeError:
                        # invalid doc_id -- pointed to a wrong position
                        pass

    def build(self):
        """Extract all bundles from the source tar (one pass, streaming) and write .pos offset files."""
        if self.built():
            return
        np = ir_datasets.lazy_libs.numpy()
        ir_datasets.util.check_disk_free(self.base_path, self.size_hint)
        # mode='r|' streams the tar sequentially (no random access needed)
        with _logger.pbar_raw('extracting source documents', total=70, unit='file') as pbar, \
                self.dlc.stream() as stream, \
                tarfile.open(fileobj=stream, mode='r|') as tarf:
            for record in tarf:
                if not record.name.endswith('.gz'):
                    continue
                file = tarf.extractfile(record)
                fname = record.name.split('/')[-1][:-len('.gz')]
                positions = []
                with gzip.open(file) as fin, \
                        open(os.path.join(self.base_path, fname), 'wb') as fout:
                    for line in fin:
                        positions.append(fout.tell())
                        fout.write(line)
                # keep track of the positions for efficient slicing
                with open(os.path.join(self.base_path, f'{fname}.pos'), 'wb') as posout:
                    posout.write(np.array(positions, dtype='<u4').tobytes())
                pbar.update(1)
        # marker file signals a completed extraction
        (Path(self.base_path) / '_built').touch()

    def built(self):
        return (Path(self.base_path) / '_built').exists()

    def __iter__(self):
        self.build()
        return MsMarcoV2PassageIter(self, slice(0, self.count()))

    def _iter_source_files(self):
        # bundle files are numbered 00..69
        for i in range(70):
            yield os.path.join(self.base_path, f'msmarco_passage_{i:02d}')

    def count(self):
        if self.docs_handler._pos_dlc is not None:
            # dedup subset: count = total uint32 offsets across the provided .pos files
            base_path = self.pos_dlc.path()
            return sum(os.path.getsize(os.path.join(base_path, f)) for f in os.listdir(base_path)) // 4
        # full collection size (hard-coded)
        return 138_364_198
class MsMarcoV2PassageIter:
    """Sliceable iterator over extracted MS MARCO v2 passage bundle files.

    Tracks a `slice` of global passage indices and lazily opens the bundle
    file containing the next requested index, using the per-bundle `.pos`
    offset files to seek. Supports it[int] and it[slice] like other
    ir_datasets doc iterators.
    """
    def __init__(self, docstore, slice):
        # NOTE: the `slice` parameter shadows the builtin within __init__ only.
        self.np = ir_datasets.lazy_libs.numpy()
        self.docstore = docstore
        self.slice = slice  # half-open range (with optional step) of global indices still to yield
        self.next_index = 0  # global index the file cursor currently points at
        self.file_iter = docstore._iter_source_files()
        self.current_file = None
        self.current_pos_mmap = None  # memmap of uint32 byte offsets for current_file
        self.current_file_start_idx = 0  # global index of current_file's first passage
        self.current_file_end_idx = 0  # global index one past current_file's last passage
    def __next__(self):
        if self.slice.start >= self.slice.stop:
            raise StopIteration
        # Re-sync loop: advance files and/or seek until the file cursor sits exactly
        # at the byte offset recorded for global index slice.start.
        while self.next_index != self.slice.start or self.current_file is None or self.current_file_end_idx <= self.slice.start or self.current_pos_mmap[self.slice.start - self.current_file_start_idx] != self.current_file.tell():
            if self.current_file is None or self.current_file_end_idx <= self.slice.start:
                # First iteration or no docs remaining in this file
                if self.current_file is not None:
                    self.current_file.close()
                    self.current_file = None
                # jump ahead to the file that contains the desired index
                first = True
                while first or self.current_file_end_idx < self.slice.start:
                    source_file = next(self.file_iter)
                    self.next_index = self.current_file_end_idx
                    self.current_file_start_idx = self.current_file_end_idx
                    pos_file = source_file + '.pos'
                    if self.docstore.pos_dlc is not None:
                        # dedup subset: offsets come from the external pos files
                        pos_file = os.path.join(self.docstore.pos_dlc.path(), source_file.split('/')[-1] + '.pos')
                    # each offset is a uint32, so count = file size / 4
                    self.current_file_end_idx = self.current_file_start_idx + (os.path.getsize(pos_file) // 4)
                    first = False
                self.current_file = open(source_file, 'rb')
                self.current_pos_mmap = self.np.memmap(pos_file, dtype='<u4')
            else:
                # jump to the position of the next document
                pos = self.current_pos_mmap[self.slice.start - self.current_file_start_idx]
                self.current_file.seek(pos)
                self.next_index = self.slice.start
        result = parse_msmarco_passage(self.current_file.readline())
        self.next_index += 1
        # consume one element from the front of the slice
        self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
        return result
    def close(self):
        # release file handle and memmap; safe to call repeatedly
        self.file_iter = None
        if self.current_file is not None:
            self.current_file.close()
        self.current_file = None
        self.current_pos_mmap = None
    def __iter__(self):
        return self
    def __del__(self):
        self.close()
    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step] -- compose with the current slice
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return MsMarcoV2PassageIter(self.docstore, new_slice)
        elif isinstance(key, int):
            # it[index] -- single-element slice, then take its first item
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = MsMarcoV2PassageIter(self.docstore, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError((self.slice, slice(key, key+1), new_slice))
        raise TypeError('key must be int or slice')
def _init():
    """Register the msmarco-passage-v2 dataset and all its subsets.

    Returns (collection, subsets) where collection is the full passage
    corpus and subsets maps subset name -> Dataset.
    """
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    # migrate away from the old pklz4 docstore to ID-based (offset) lookups
    migrator = Migrator(base_path/'irds_version.txt', 'v2', affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'], message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = MsMarcoV2Passages(dlc['passages'])
    collection = migrator(collection)
    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2', affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'], message='Updating qrels (task organizers removed duplicates)')
    # de-duplicated view of the corpus: position files restrict which passages are exposed
    subsets['dedup'] = Dataset(
        MsMarcoV2Passages(dlc['passages'], TarExtractAll(dlc['dedup_positions'], base_path/'dedup_positions'))
    )
    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['train/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev1/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev1/scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev2/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev2/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    # */judged variants keep only queries that have at least one qrel
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )
    subsets['trec-dl-2022'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2022/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2022/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2022/scoreddocs'])),
    )
    dl22_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2022'].qrels_iter()})
    subsets['trec-dl-2022/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2022'].queries_handler(), dl22_judged),
        FilteredScoredDocs(subsets['trec-dl-2022'].scoreddocs_handler(), dl22_judged),
        subsets['trec-dl-2022'],
    )
    subsets['trec-dl-2023'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2023/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2023/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2023/scoreddocs'])),
    )
    dl23_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2023'].qrels_iter()})
    subsets['trec-dl-2023/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2023'].queries_handler(), dl23_judged),
        FilteredScoredDocs(subsets['trec-dl-2023'].scoreddocs_handler(), dl23_judged),
        subsets['trec-dl-2023'],
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets


collection, subsets = _init()
_logger = ir_datasets.log.easy()

NAME = 'msmarco-qna'

DUA = ("Please confirm you agree to the MSMARCO data usage agreement found at "
       "<http://www.msmarco.org/dataset.aspx>")

QRELS_DEFS = {
    1: 'Marked by annotator as a contribution to their answer',
    0: 'Not marked by annotator as a contribution to their answer',
}

# Answer string used in the source data to indicate "no answer"; stripped out
# when the per-split files are merged (see MsMarcoQnAManager).
NO_ANSWER_PLACEHOLDER = 'No Answer Present.'


class MsMarcoQnAQuery(NamedTuple):
    # Query for the train/dev splits (which include reference answers).
    query_id: str
    text: str
    type: str  # question type from the source data ('query_type' field)
    answers: Tuple[str, ...]  # reference answers (placeholder answers removed)
    def default_text(self):
        """
text
        """
        return self.text


class MsMarcoQnAEvalQuery(NamedTuple):
    # Query for the eval split (no answers distributed).
    query_id: str
    text: str
    type: str
    def default_text(self):
        """
text
        """
        return self.text


class MsMarcoQnADoc(NamedTuple):
    doc_id: str  # '<msmarco_passage_id>-<url index>' assigned by MsMarcoQnAManager
    text: str
    url: str
    msmarco_passage_id: str  # doc_id of the matching msmarco-passage document (matched by text hash)
    msmarco_document_id: str  # doc_id of the matching msmarco-document document (matched by URL hash), or None if no match
    def default_text(self):
        """
text
        """
        return self.text
class MsMarcoQnAManager:
    """Builds the msmarco-qna docstore and per-split query/qrel/run files.

    Streams the large source JSON files with ijson (never loading them fully
    into memory), links each passage back to msmarco-passage (by text hash)
    and msmarco-document (by URL hash), and writes intermediate per-field
    files that are then merged into TSV queries, qrels, and run files.
    """
    def __init__(self, train_dlc, dev_dlc, eval_dlc, base_path):
        self._train_dlc = train_dlc
        self._dev_dlc = dev_dlc
        self._eval_dlc = eval_dlc
        self._docs_store = None
        self._base_path = base_path

    def docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS):
        self.build()
        return self._internal_docs_store(options)

    def _internal_docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS):
        # NOTE(review): `options` is accepted but not forwarded to PickleLz4FullStore -- confirm intended
        if self._docs_store is None:
            self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, MsMarcoQnADoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME))
        return self._docs_store

    def build(self):
        """One-time construction of the docstore and all split files (no-op if already built)."""
        ijson = ir_datasets.lazy_libs.ijson()
        docs_store = self._internal_docs_store()
        if docs_store.built():
            return # already built
        # text-hash -> (msmarco-passage id, {url-hash -> url index}); 8-byte md5 prefixes
        # are used to keep memory down (see comment above this class)
        dochash_lookup = {}
        for doc in _logger.pbar(ir_datasets.load('msmarco-passage').docs_iter(), desc='building msmarco-passage lookup', total=ir_datasets.load('msmarco-passage').docs_count(), unit='doc'):
            dochash = bytes(hashlib.md5(doc.text.encode()).digest()[:8])
            assert dochash not in dochash_lookup
            dochash_lookup[dochash] = (int(doc.doc_id), {})
        # url-hash -> msmarco-document id
        urlhash_lookup = {}
        for doc in _logger.pbar(ir_datasets.load('msmarco-document').docs_iter(), desc='building msmarco-document lookup', total=ir_datasets.load('msmarco-document').docs_count(), unit='doc'):
            urlhash = bytes(hashlib.md5(doc.url.encode()).digest()[:8])
            assert urlhash not in urlhash_lookup
            urlhash_lookup[urlhash] = doc.doc_id
        nil_doc = MsMarcoQnADoc(None, None, None, None, None)
        current_doc = nil_doc
        # ijson prefixes identifying which part of the JSON structure an event belongs to
        prefix_passages = re.compile(r'^passages\.\d+\.item$')
        prefix_answers = re.compile(r'^answers\.\d+\.item$')
        prefix_type = re.compile(r'^query_type\.\d+$')
        prefix_text = re.compile(r'^query\.\d+$')
        prefix_id = re.compile(r'^query_id\.\d+$')
        pbar_postfix = {'file': None, 'missing_urls': 0, 'key': None}
        with contextlib.ExitStack() as outer_stack:
            docs_trans = outer_stack.enter_context(docs_store.lookup.transaction())
            pbar = outer_stack.enter_context(_logger.pbar_raw(desc='processing qna', postfix=pbar_postfix, unit='item'))
            for dlc, file_str in [(self._train_dlc, 'train'), (self._dev_dlc, 'dev'), (self._eval_dlc, 'eval')]:
                pbar_postfix['file'] = file_str
                last_ans_prefix = None
                last_psg_prefix = None
                is_selected = None
                with contextlib.ExitStack() as inner_stack:
                    stream = inner_stack.enter_context(dlc.stream())
                    parser = ijson.parse(stream)
                    # one intermediate file per field; merged below
                    out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt'))
                    out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt'))
                    out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt'))
                    if file_str != 'eval':
                        out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt'))
                        out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+'))
                        out_seq = None
                    else:
                        # eval has no answers/selections; only the passage sequence per query
                        out_qrels, out_answer = None, None
                        out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt'))
                    for prefix, event, data in parser:
                        pbar_postfix['key'] = prefix
                        pbar.set_postfix(pbar_postfix, refresh=False)
                        pbar.update()
                        if prefix_passages.match(prefix):
                            if event == 'end_map':
                                # a passage object just completed: link it and emit records
                                assert current_doc.text is not None and current_doc.url is not None
                                dochash = bytes(hashlib.md5(current_doc.text.encode()).digest()[:8])
                                assert dochash in dochash_lookup, "doc_id lookup failed; passage text not found in msmarco-passage"
                                pid = dochash_lookup[dochash][0]
                                urlhash = bytes(hashlib.md5(current_doc.url.encode()).digest()[:8])
                                add = False
                                if urlhash not in dochash_lookup[dochash][1]:
                                    # first time this (text, url) pair is seen -> new doc
                                    urlidx = len(dochash_lookup[dochash][1])
                                    dochash_lookup[dochash][1][urlhash] = urlidx
                                    add = True
                                else:
                                    urlidx = dochash_lookup[dochash][1][urlhash]
                                msm_doc_id = urlhash_lookup.get(urlhash)
                                if msm_doc_id is None:
                                    pbar_postfix['missing_urls'] += 1
                                did = f'{pid}-{urlidx}'
                                current_doc = current_doc._replace(doc_id=did, msmarco_passage_id=str(pid), msmarco_document_id=msm_doc_id)
                                if add:
                                    docs_trans.add(current_doc)
                                # records for one query are tab-separated on a single line;
                                # a prefix change signals the start of a new query
                                if out_qrels is not None:
                                    if last_psg_prefix == prefix:
                                        out_qrels.write(f'\t{did} {is_selected}')
                                    elif last_psg_prefix is None:
                                        out_qrels.write(f'{did} {is_selected}')
                                    else:
                                        out_qrels.write(f'\n{did} {is_selected}')
                                    last_psg_prefix = prefix
                                if out_seq is not None:
                                    if last_psg_prefix == prefix:
                                        out_seq.write(f'\t{did}')
                                    elif last_psg_prefix is None:
                                        out_seq.write(f'{did}')
                                    else:
                                        out_seq.write(f'\n{did}')
                                    last_psg_prefix = prefix
                                is_selected = None
                                current_doc = nil_doc
                            elif event == 'map_key':
                                # pull the value event that immediately follows the key
                                key = data
                                value = next(parser)[2]
                                if key == 'is_selected':
                                    is_selected = str(value)
                                elif key == 'passage_text':
                                    current_doc = current_doc._replace(text=value)
                                elif key == 'url':
                                    current_doc = current_doc._replace(url=value)
                        elif prefix_answers.match(prefix):
                            # a little more annoying because there can be multiple answers (but there's always at least 1)
                            text = str(data).replace("\n", " ").replace("\t", " ")
                            if last_ans_prefix == prefix:
                                out_answer.write(f'\t{text}')
                            elif last_ans_prefix is None:
                                out_answer.write(text)
                            else:
                                out_answer.write(f'\n{text}')
                            last_ans_prefix = prefix
                        elif prefix_text.match(prefix):
                            text = str(data).replace("\n", " ")
                            out_text.write(f'{text}\n')
                        elif prefix_id.match(prefix):
                            text = str(data).replace("\n", " ")
                            out_id.write(f'{text}\n')
                        elif prefix_type.match(prefix):
                            text = str(data).replace("\n", " ")
                            out_type.write(f'{text}\n')
                    # terminate the final record of the line-per-query files
                    if file_str != 'eval':
                        out_answer.write('\n')
                        out_qrels.write('\n')
                    else:
                        out_seq.write('\n')
        # Merge files
        for file_str in ['train', 'dev', 'eval']:
            with contextlib.ExitStack() as stack:
                f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt'))
                f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt'))
                f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt'))
                f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt'))
                f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt'))
                in_files = [f_qid, f_type, f_text]
                if file_str != 'eval':
                    f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt'))
                    f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt'))
                    f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt'))
                    in_files += [f_selections, f_answers]
                else:
                    f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt'))
                    in_files += [f_seq]
                # the per-field files are line-aligned, so zip them together
                for columns in _logger.pbar(zip(*in_files), desc=f'merging {file_str} files', unit='doc'):
                    columns = [x.strip() for x in columns]
                    qid, typ, text = columns[:3]
                    if file_str != 'eval':
                        selections, answers = columns[3:]
                        # Remove the "no answer" placeholder
                        answers = answers.replace(NO_ANSWER_PLACEHOLDER, '')
                        if answers:
                            answers = f'\t{answers}'
                        f_queries.write(f'{qid}\t{text}\t{typ}{answers}\n')
                        for i, qrel in enumerate(selections.split('\t')):
                            did, label = qrel.split()
                            f_qrels.write(f'{qid} 0 {did} {label}\n')
                            f_run.write(f'{qid} Q0 {did} {i} {-i} qna\n')
                    else:
                        seq, = columns[3:]
                        f_queries.write(f'{qid}\t{text}\t{typ}\n')
                        for i, did in enumerate(seq.split('\t')):
                            f_run.write(f'{qid} Q0 {did} {i} {-i} qna\n')
            # clean up temp files
            (self._base_path/f'{file_str}.query_id').unlink()
            (self._base_path/f'{file_str}.query_type').unlink()
            (self._base_path/f'{file_str}.query_text').unlink()
            if file_str != 'eval':
                (self._base_path/f'{file_str}.selections').unlink()
                (self._base_path/f'{file_str}.query_answer').unlink()

    def file_ref(self, path):
        # lazily-built reference to one of the files this manager produces
        return _ManagedDlc(self, self._base_path/path)
class _ManagedDlc:
    """Download-config-like wrapper over a file produced by MsMarcoQnAManager.

    Triggers manager.build() on first access so the file is guaranteed to
    exist before it is streamed or its path is returned.
    """
    def __init__(self, manager, path):
        self._manager = manager
        self._path = path

    @contextlib.contextmanager
    def stream(self):
        self._manager.build()
        with open(self._path, 'rb') as f:
            yield f

    def path(self, force=True):
        if force:
            self._manager.build()
        return self._path


def _init():
    """Register the msmarco-qna dataset and its train/dev/eval subsets."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    migrator = Migrator(base_path/'irds_version.txt', 'v2', affected_files=[
        base_path/'docs.pklz4',
        base_path/'train.run', base_path/'train.qrels',
        base_path/'dev.run', base_path/'dev.qrels',
        base_path/'eval.run',
    ], message='Migrating msmarco-qna (correcting doc_ids)')
    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('train.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('train.run'))),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('dev.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('dev.run'))),
    )
    # eval has no qrels (answers are withheld)
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets


collection, subsets = _init()
import ir_datasets from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.formats import ( BaseDocs, BaseQrels, BaseQueries, GenericDoc, GenericQuery, TrecQrel, ) from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS _logger = ir_datasets.log.easy() NAME = "nano-beir" def _map_field(field, data): if field in ("doc_id", "query_id"): return data["_id"] if field == "text": return data["text"] raise ValueError(f"unknown field: {field}") def parquet_iter(path): pq = ir_datasets.lazy_libs.pyarrow_parquet() # https://stackoverflow.com/a/77150113 batch_size = 64 with pq.ParquetFile(path) as parquet_file: for record_batch in parquet_file.iter_batches(batch_size=batch_size): for d in record_batch.to_pylist(): yield d class NanoBeirDocs(BaseDocs): def __init__(self, name, dlc, doc_type): super().__init__() self._name = name self._dlc = dlc self._doc_type = doc_type def docs_iter(self): return iter(self.docs_store()) def _docs_iter(self): for d in parquet_iter(self._dlc.path()): yield self._doc_type(*(_map_field(f, d) for f in self._doc_type._fields)) def docs_cls(self): return self._doc_type def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f"{ir_datasets.util.home_path()/NAME/self._name}/docs.pklz4", init_iter_fn=self._docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=["doc_id"], count_hint=ir_datasets.util.count_hint(f"{NAME}/{self._name}"), options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return f"{NAME}/{self._name}" def docs_lang(self): return "en" class NanoBeirQueries(BaseQueries): def __init__(self, name, dlc, query_type): super().__init__() self._name = name self._dlc = dlc self._query_type = query_type def queries_iter(self): for d in parquet_iter(self._dlc.path()): yield self._query_type(*(_map_field(f, d) for f in self._query_type._fields)) def queries_cls(self): 
class NanoBeirQrels(BaseQrels):
    """Qrels read from a NanoBEIR parquet file (query-id / corpus-id pairs)."""
    def __init__(self, qrels_dlc, qrels_defs):
        self._qrels_dlc = qrels_dlc
        self._qrels_defs = qrels_defs

    def qrels_path(self):
        return self._qrels_dlc.path()

    def qrels_iter(self):
        # only positive pairs are distributed: relevance fixed to 1, iteration "0"
        for d in parquet_iter(self.qrels_path()):
            yield TrecQrel(d["query-id"], d["corpus-id"], 1, "0")

    def qrels_cls(self):
        return TrecQrel

    def qrels_defs(self):
        return self._qrels_defs


def _init():
    """Register the nano-beir dataset and one subset per benchmark."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    base = Dataset(documentation("_"))
    subsets = {}
    benchmarks = [
        "climate-fever",
        "dbpedia-entity",
        "fever",
        "fiqa",
        "hotpotqa",
        "msmarco",
        "nfcorpus",
        "nq",
        "quora",
        "scidocs",
        "arguana",
        "scifact",
        "webis-touche2020",
    ]
    for ds in benchmarks:
        docs = NanoBeirDocs(ds, dlc[f"{ds}/docs"], GenericDoc)
        queries = NanoBeirQueries(ds, dlc[f"{ds}/queries"], GenericQuery)
        qrels = NanoBeirQrels(dlc[f"{ds}/qrels"], qrels_defs={1: 'relevant'})
        subsets[ds] = Dataset(
            docs,
            queries,
            qrels,
            documentation(ds),
        )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f"{NAME}/{s}", subsets[s])
    return base, subsets


base, subsets = _init()
class NqManager:
    """Builds the natural-questions docstore and per-split query/qrel/scoreddoc files.

    Streams the gzipped source JSONL files, assigns each unique document URL
    a sequential doc ID, splits documents into their `long_answer_candidates`
    passages, and writes queries/qrels/scoreddocs for the train and dev splits.
    """
    def __init__(self, dlcs, base_path):
        self._dlcs = dlcs
        self._docs_store = None
        self._base_path = base_path

    def docs_store(self, options: DocstoreOptions = DEFAULT_DOCSTORE_OPTIONS):
        self.build()
        return self._internal_docs_store(options)

    def _internal_docs_store(self, options: DocstoreOptions = DEFAULT_DOCSTORE_OPTIONS):
        # NOTE(review): `options` is accepted but not forwarded to PickleLz4FullStore -- confirm intended
        if self._docs_store is None:
            self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, NqPassageDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME))
        return self._docs_store

    def build(self):
        """One-time construction of the docstore and split files (no-op if already built)."""
        docs_store = self._internal_docs_store()
        if docs_store.built():
            return # already built
        pbar_postfix = {'file': None}
        doc_url_to_id = {}  # document URL -> sequentially-assigned doc ID
        with contextlib.ExitStack() as stack:
            docs_trans = stack.enter_context(docs_store.lookup.transaction())
            pbar = stack.enter_context(_logger.pbar_raw(desc='processing nq', postfix=pbar_postfix, unit='question'))
            train_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.queries.tsv', 'wt'))
            train_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.qrels.jsonl', 'wt'))
            train_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.scoreddocs.tsv', 'wt'))
            dev_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.queries.tsv', 'wt'))
            dev_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.qrels.jsonl', 'wt'))
            dev_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.scoreddocs.tsv', 'wt'))
            for file_name in sorted(self._dlcs.contents().keys()):
                pbar_postfix['file'] = file_name
                pbar.set_postfix(pbar_postfix)
                # route output by split, determined from the source file name
                if 'train' in file_name:
                    f_queries, f_qrels, f_scoreddocs = train_queries, train_qrels, train_scoreddocs
                elif 'dev' in file_name:
                    f_queries, f_qrels, f_scoreddocs = dev_queries, dev_qrels, dev_scoreddocs
                with ir_datasets.util.GzipExtract(self._dlcs[file_name]).stream() as stream:
                    for line in stream:
                        data = json.loads(line)
                        qid = str(data['example_id'])
                        # docs: only emit passages the first time a document URL is seen
                        if data['document_url'] not in doc_url_to_id:
                            did = str(len(doc_url_to_id))
                            doc_url_to_id[data['document_url']] = did
                            last_end_idx, last_did = -1, None
                            for idx, cand in enumerate(data['long_answer_candidates']):
                                text = ' '.join(t['token'] for t in data['document_tokens'][cand['start_token']:cand['end_token']] if not t['html_token'])
                                html = ' '.join(t['token'] for t in data['document_tokens'][cand['start_token']:cand['end_token']])
                                # a candidate nested inside the previous top-level one gets that passage as its parent
                                parent_doc_id = last_did if cand['start_token'] < last_end_idx else None
                                doc = NqPassageDoc(
                                    f'{did}-{idx}',
                                    text,
                                    html,
                                    cand['start_byte'], cand['end_byte'],
                                    cand['start_token'], cand['end_token'],
                                    data['document_title'],
                                    data['document_url'],
                                    parent_doc_id,
                                )
                                docs_trans.add(doc)
                                if parent_doc_id is None:
                                    last_end_idx, last_did = cand['end_token'], doc.doc_id
                        else:
                            did = doc_url_to_id[data['document_url']]
                        # queries
                        f_queries.write('{}\t{}\n'.format(qid, data['question_text'].replace('\t', ' ')))
                        # qrels: one NqQrel per annotated passage, merging short answers
                        # from multiple annotations of the same passage
                        qrels = {}
                        for ann in data['annotations']:
                            if ann['long_answer']['candidate_index'] == -1:
                                continue
                            passage_id = '{}-{}'.format(did, ann['long_answer']['candidate_index'])
                            short_answers = [' '.join(t['token'] for t in data['document_tokens'][s['start_token']:s['end_token']] if not t['html_token']) for s in ann['short_answers']]
                            if passage_id in qrels:
                                qrel = qrels[passage_id]
                                # BUG FIX: previously filtered short_answers against itself
                                # ([s for s in short_answers if s not in short_answers] is always
                                # empty), silently dropping answers from duplicate annotations.
                                # Dedupe against the answers already recorded for this passage.
                                new_answers = [s for s in short_answers if s not in qrel.short_answers]
                                qrel.short_answers.extend(new_answers)
                            else:
                                qrel = NqQrel(
                                    qid,
                                    passage_id,
                                    1,
                                    short_answers,
                                    ann['yes_no_answer'],
                                )
                                qrels[passage_id] = qrel
                        for qrel in qrels.values():
                            json.dump(qrel._asdict(), f_qrels)
                            f_qrels.write('\n')
                        # scoreddocs: every candidate passage of the document, score 0
                        count = len(data['long_answer_candidates'])
                        f_scoreddocs.write(f'{qid}\t{did}\t{count}\n')
                        pbar.update(1)

    def file_ref(self, path):
        # lazily-built reference to one of the files this manager produces
        return _ManagedDlc(self, self._base_path/path)
def scoreddocs_cls(self): return GenericScoredDoc def _init(): base_path = ir_datasets.util.home_path()/NAME dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path) manager = NqManager(dlc, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') collection = DocstoreBackedDocs(manager.docs_store, docs_cls=NqPassageDoc, namespace=NAME, lang='en') base = Dataset( collection, documentation('_')) subsets = {} subsets['train'] = Dataset( collection, TsvQueries(manager.file_ref('train.queries.tsv'), namespace=NAME, lang='en'), NqQrels(manager.file_ref('train.qrels.jsonl')), NqScoredDocs(manager.file_ref('train.scoreddocs.tsv')), documentation('train'), ) subsets['dev'] = Dataset( collection, TsvQueries(manager.file_ref('dev.queries.tsv'), namespace=NAME, lang='en'), NqQrels(manager.file_ref('dev.qrels.jsonl')), NqScoredDocs(manager.file_ref('dev.scoreddocs.tsv')), documentation('dev'), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/neuclir.py ================================================ import gzip import json from functools import lru_cache import ir_datasets from ir_datasets.util import DownloadConfig, Lazy from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries from ir_datasets.formats.trec import TrecQrels from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries, ExctractedCCNoReportQuery, ExctractedCCNoReportNoHtNarQuery, ExctractedCCMultiMtQuery from ir_datasets.datasets.hc4 import NAME as HC4_NAME from ir_datasets.util.fileio import GzipExtract, TarExtract NAME = 'neuclir' DOC_COUNTS = { 'zh': 3179209, 'fa': 2232016, 'ru': 4627543 } @lru_cache(maxsize=3) # three languages def get_ids(dlcs): dlcs = dlcs if isinstance(dlcs, (list, tuple)) else [dlcs] ids = [] for dlc in dlcs: with GzipExtract(dlc).stream() as 
f:  # (completes the `with GzipExtract(dlc).stream() as` opened just above)
            ids += [
                json.loads(line)['id']
                for line in f
            ]
    return set(ids)


class FilteredExctractedCCDocs(ExctractedCCDocs):
    """ExctractedCCDocs restricted to an allow-list of document ids
    (the ids come from a separate download, read via get_ids)."""
    def __init__(self, docs_dlc, subset_lang, include_doc_id_dlc, filter_name=None, namespace=None, count=None):
        super().__init__(docs_dlc, subset_lang, namespace, count)
        self._filter_name = filter_name or "filtered"
        self._include_doc_id_dlc = include_doc_id_dlc

    def _doc_store_path(self):
        # Separate docstore path so the filtered corpus does not collide
        # with the unfiltered one.
        return self.docs_path(force=False) + f".{self._filter_name}"

    def _internal_docs_iter(self):
        include_doc_id = get_ids(self._include_doc_id_dlc)
        for doc in super()._internal_docs_iter():
            if doc.doc_id in include_doc_id:
                yield doc


class FilteredTrecQrels(TrecQrels):
    """TrecQrels restricted to qrels whose doc_id is in an allow-list."""
    def __init__(self, qrels_dlc, qrels_defs, include_doc_id_dlc, format_3col=False):
        super().__init__(qrels_dlc, qrels_defs, format_3col)
        self._include_doc_id_dlc = include_doc_id_dlc

    def qrels_iter(self):
        include_doc_id = get_ids(self._include_doc_id_dlc)
        for qrel in super().qrels_iter():
            if qrel.doc_id in include_doc_id:
                yield qrel


class LangFilteredTrecQrels(TrecQrels):
    """TrecQrels filtered by language tag; the shared TREC 2022 qrels file
    stores the language in the 'iteration' column."""
    def __init__(self, qrels_dlc, qrels_defs, lang, format_3col=False):
        super().__init__(qrels_dlc, qrels_defs, format_3col)
        self._lang = lang

    def qrels_iter(self):
        for qrel in super().qrels_iter():
            if qrel.iteration == self._lang:
                yield qrel


QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}


def _init():
    """Build and register the neuclir datasets: per-language corpora for
    NeuCLIR Collection 1 plus TREC 2022/2023 and HC4-filtered subsets."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    # HC4 downloads are reused for the hc4-filtered subsets.
    hc4_dlc = DownloadConfig.context(HC4_NAME, ir_datasets.util.home_path()/HC4_NAME)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_')) # dummy top level ds
    subsets["1"] = Dataset(documentation('1')) # dummy year level ds
    qrels2022 = dlc['trec-2022/qrels']
    # For NeuCLIR Collection 1
    for lang in ['zh', 'fa', 'ru']:
        lang3 = {'fa': 'fas', 'zh': 'zho', 'ru': 'rus'}[lang]
        lang_docs = ExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
        subsets[f"1/{lang}"] = Dataset(
            lang_docs,
            documentation(f"1/{lang}")
        )
        # 2022 qrels are one shared file; filter to this language.
        qrels = LangFilteredTrecQrels(qrels2022, QREL_DEFS, lang3)
        subsets[f"1/{lang}/trec-2022"] = Dataset(
            lang_docs,
            FilteredQueries(ExctractedCCQueries(dlc['trec-2022/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
            qrels,
            documentation(f"1/{lang}/trec-2022"),
        )
        subsets[f"1/{lang}/trec-2023"] = Dataset(
            lang_docs,
            # NOTE(review): the 2023 queries are filtered by `qrels`, which is
            # the *2022* qrels object built above — presumably this should be
            # the 2023 qrels (judged-query filtering per year); confirm intent.
            FilteredQueries(ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportNoHtNarQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
            TrecQrels(TarExtract(dlc['trec-2023/qrels'], f'qrels.final.gains.{lang3}'), QREL_DEFS),
            documentation(f"1/{lang}/trec-2023"),
        )
        # Russian doc-id lists are sharded into 8 files; others are single files.
        include_doc_id_dlc = hc4_dlc[f'{lang}/docs/ids'] if lang != 'ru' else tuple([
            hc4_dlc[f'{lang}/docs/ids/{i}'] for i in range(8)
        ])
        subsets[f"1/{lang}/hc4-filtered"] = Dataset(
            FilteredExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, include_doc_id_dlc=include_doc_id_dlc),
            ExctractedCCQueries([hc4_dlc['dev/topics'], hc4_dlc['test/topics']], subset_lang=lang, namespace=NAME),
            FilteredTrecQrels([
                hc4_dlc[f'{lang}/dev/qrels'],
                hc4_dlc[f'{lang}/test/qrels']
            ], QREL_DEFS, include_doc_id_dlc=include_doc_id_dlc),
            documentation(f"1/{lang}/hc4-filtered")
        )
    # Combined three-language corpus with its own docstore path.
    multi_docs = ExctractedCCDocs([GzipExtract(dlc[f'1/{lang}/docs']) for lang in ['zh', 'fa', 'ru']], namespace=NAME, count=sum(DOC_COUNTS.values()), docstore_path=base_path/'1'/'multi')
    subsets['1/multi'] = Dataset(
        multi_docs,
        documentation("1/multi")
    )
    subsets['1/multi/trec-2023'] = Dataset(
        multi_docs,
        ExctractedCCQueries(dlc['trec-2023/queries'], filter_lwq=False, cls=ExctractedCCMultiMtQuery, namespace=NAME),
        TrecQrels(TarExtract(dlc['trec-2023/qrels'], 'qrels.final.gains'), QREL_DEFS),
        documentation("1/multi/trec-2023")
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets


def _lazy_qids_set(qrels):
    # Deferred so qrels are only parsed when the query filter is evaluated.
    return Lazy(lambda: {qrel.query_id for qrel in qrels.qrels_iter()})


base, subsets = _init()

================================================ FILE: ir_datasets/datasets/neumarco.py ================================================

import io
import codecs
import re
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, Cache
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.datasets import msmarco_passage
from ir_datasets.formats import TsvDocs

NAME = 'neumarco'


def _init():
    """Register the neumarco datasets: machine-translated MS MARCO passage
    corpora (fa/zh/ru) that reuse the English msmarco-passage queries,
    qrels, and docpairs handlers."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    # Handlers borrowed from the already-registered msmarco-passage subsets;
    # doc ids line up, so only the collection differs per language.
    subsets_from_msmarco = {
        'train': [
            ir_datasets.registry['msmarco-passage/train'].queries_handler(),
            ir_datasets.registry['msmarco-passage/train'].qrels_handler(),
            ir_datasets.registry['msmarco-passage/train'].docpairs_handler(),
        ],
        'train/judged': [
            ir_datasets.registry['msmarco-passage/train/judged'].queries_handler(),
            ir_datasets.registry['msmarco-passage/train/judged'].qrels_handler(),
            ir_datasets.registry['msmarco-passage/train/judged'].docpairs_handler(),
        ],
        'dev': [
            ir_datasets.registry['msmarco-passage/dev'].queries_handler(),
            ir_datasets.registry['msmarco-passage/dev'].qrels_handler(),
        ],
        'dev/small': [
            ir_datasets.registry['msmarco-passage/dev/small'].queries_handler(),
            ir_datasets.registry['msmarco-passage/dev/small'].qrels_handler(),
        ],
        'dev/judged': [
            ir_datasets.registry['msmarco-passage/dev/judged'].queries_handler(),
            ir_datasets.registry['msmarco-passage/dev/judged'].qrels_handler(),
        ]
    }
    base_dlc = dlc['main']
    # One translated collection per language, extracted from the shared tarball.
    for lang3, lang2 in [('fas', 'fa'), ('zho', 'zh'), ('rus', 'ru')]:
        corpus_dlc = Cache(TarExtract(base_dlc, f'eng-{lang3}/msmarco.collection.20210731-scale21-sockeye2-tm1.tsv'), base_path/f'{lang2}.tsv')
        collection = TsvDocs(corpus_dlc, namespace=f'{NAME}/{lang2}', lang=lang2, count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang2}'))
        subsets[f'{lang2}'] = Dataset(collection, documentation(f'{lang2}'))
        for s, items in subsets_from_msmarco.items():
            subsets[f'{lang2}/{s}'] = Dataset(
                collection,
                *items,
                documentation(f'{lang2}/{s}'))
    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    # NOTE(review): `collection` here is whichever language the loop ended on
    # ('ru') — callers of this module-level value get only that one collection.
    return collection, subsets


collection, subsets = _init()

================================================ FILE: ir_datasets/datasets/nfcorpus.py ================================================

import io
import codecs
import re
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TrecScoredDocs, TsvDocPairs, BaseQueries

NAME = 'nfcorpus'

_logger = ir_datasets.log.easy()

QRELS_DEFS = {
    2: "A direct link from the query to the document the cited sources section of a page.",
    1: "A link exists from the query to another query that directly links to the document.",
    0: "Marginally relevant, based on topic containment.",
}


class NfCorpusDoc(NamedTuple):
    doc_id: str
    url: str
    title: str
    abstract: str

    def default_text(self):
        """
        title and abstract
        """
        return f'{self.title} {self.abstract}'


class NfCorpusQuery(NamedTuple):
    query_id: str
    title: str
    all: str

    def default_text(self):
        """
        title
        """
        return self.title


class NfCorpusVideoQuery(NamedTuple):
    query_id: str
    title: str
    desc: str

    def default_text(self):
        """
        title
        """
        return self.title


class ZipQueries(BaseQueries):
    """Merge several parallel query files (same query ids, same order) into a
    single query type; `idxs` is a list of (source, field) pairs selecting
    which field of which source fills each slot of `qtype`."""
    def __init__(self, queries, idxs, qtype):
        self._queries = queries
        self._idxs = idxs
        self._qtype = qtype

    def queries_iter(self):
        for qs in zip(*(q.queries_iter() for q in self._queries)):
            assert len({q.query_id for q in qs}) == 1 # all query IDs should be the same
            yield self._qtype(*(qs[i][j] for i, j in self._idxs))

    def queries_cls(self):
        return self._qtype

    def queries_path(self):
        return self._queries[0].queries_path()

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return self._queries[0].queries_lang()


def _init():
    """Build and register the nfcorpus dataset and its train/dev/test
    (plus nontopic/video) subsets."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']
    collection = TsvDocs(Cache(TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'), base_path/'collection.tsv'), doc_cls=NfCorpusDoc, namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    subsets = {}

    def read_lines(file):
        # Extract an id-list file from the tarball and read it as a set of lines.
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path/file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    # Lazily-loaded query-id allow-lists for the nontopic/video subsets.
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    subsets['train'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.titles.queries'), base_path/'train/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.all.queries'), base_path/'train/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),  # (titles file: qid+title, all file: all-text)
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'), base_path/'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )
    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path/'train/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        # Reuse the full train qrels, restricted to the nontopic query ids.
        FilteredQrels(subsets['train'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('train/nontopic'),
    )
    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'), base_path/'train/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'), base_path/'train/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['train'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('train/video'),
    )
    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'), base_path/'dev/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'), base_path/'dev/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'), base_path/'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )
    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path/'dev/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['dev'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('dev/nontopic'),
    )
    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'), base_path/'dev/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'), base_path/'dev/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['dev'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('dev/video'),
    )
    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.titles.queries'), base_path/'test/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'), base_path/'test/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'), base_path/'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )
    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path/'test/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['test'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('test/nontopic'),
    )
    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'), base_path/'test/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'), base_path/'test/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['test'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('test/video'),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return collection, subsets


collection, subsets = _init()
================================================ FILE: ir_datasets/datasets/nyt.py ================================================ import io import tarfile from typing import NamedTuple import ir_datasets from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS from ir_datasets.util import Lazy, DownloadConfig, Migrator from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels, GenericQuery, GenericQrel, TrecQueries, TrecQrels NAME = 'nyt' QREL_DEFS = { 1: 'title is associated with article body', } CORE_QREL_DEFS = { 0: "not relevant", 1: "relevant", 2: "highly relevant", } VALID_IDS = {'1206388', '46335', '1223589', '1642970', '144845', '420493', '1186325', '564166', '1092844', '1232733', '243508', '946470', '1147459', '84957', '87385', '1298633', '1327402', '1482333', '1069716', '1575477', '1110091', '655579', '1562062', '541298', '1571257', '639395', '1341710', '663400', '1174700', '1406944', '1368755', '1315376', '1609162', '1746895', '1447812', '193348', '882027', '213652', '126658', '799474', '1677212', '1254313', '43743', '250901', '426439', '1803638', '1111630', '1220244', '1142672', '944176', '860862', '342011', '1556809', '1574691', '292048', '855559', '1473717', '157893', '252570', '305646', '198014', '1444467', '1842149', '161276', '455333', '146910', '1414339', '1413851', '1352725', '509114', '563685', '1738087', '1115555', '639541', '427073', '1435887', '862324', '476212', '870108', '315852', '144389', '684154', '845724', '117999', '35935', '716125', '1818546', '551762', '687923', '1817616', '135841', '618338', '1597113', '1549790', '1292666', '147051', '1778945', '1347630', '1337511', '299371', '1384273', '388274', '938995', '263847', '195638', '303927', '646946', '1620311', '1455534', '325463', '1380230', '1038853', '1040633', '1831119', '363686', '260491', '1611855', '147526', '542544', '581106', '1766627', '899656', 
'236785', '1408409', '300748', '742732', '986023', '1662861', '1083296', '152722', '1458233', '1203328', '1810235', '996231', '1226680', '427277', '517560', '1230947', '185677', '1524891', '492603', '1023515', '334223', '1219069', '1021319', '152336', '1227959', '1501876', '765819', '395940', '524179', '1494335', '66871', '105130', '1660760', '744794', '1616161', '876120', '714837', '35529', '42617', '198139', '1811671', '147293', '1041065', '841417', '1346509', '200467', '850536', '1235945', '184078', '1269259', '1314141', '1368414', '387436', '896464', '84650', '375608', '423014', '1201696', '883245', '137547', '1376881', '1207160', '280170', '968570', '1438840', '626732', '1085071', '632127', '1206647', '399973', '1316303', '1187122', '805546', '1727291', '570037', '1178896', '555992', '977573', '1340396', '632958', '63542', '1280664', '977205', '1567169', '783676', '814977', '1668678', '1735184', '1074278', '1652858', '1108702', '955404', '1784962', '1185130', '250831', '818408', '623624', '134405', '104342', '965709', '956076', '1260229', '27255', '1500603', '1127679', '1722973', '1734641', '309555', '1681934', '695555', '48767', '433808', '995051', '180797', '123367', '378006', '1216681', '324683', '1711346', '211935', '1801492', '103678', '446767', '594334', '860460', '660793', '1393998', '266826', '876460', '994066', '1282229', '1587147', '815344', '1103826', '343997', '1200405', '179480', '742314', '1780439', '1066709', '1330760', '1368900', '1549318', '1110897', '619788', '188464', '173770', '34154', '578909', '645650', '1157537', '62836', '700552', '1388063', '408649', '848686', '1694615', '1617883', '1765655', '1466678', '155464', '1445513', '1303273', '231804', '581627', '742052', '1212886', '1405769', '481040', '1855639', '54259', '111905', '1313586', '387001', '1185491', '1670617', '906527', '69825', '499522', '1819890', '164762', '970999', '1179216', '993221', '372699', '296270', '1185999', '792835', '1037962', '1740374', '1624046', '954664', 
'368818', '1087747', '1026355', '812422', '1544110', '1226870', '155570', '1190376', '869921', '296349', '595907', '614301', '1241703', '442373', '995807', '1369864', '1709789', '114305', '184927', '1120202', '584073', '828184', '1473187', '1521230', '440704', '1013610', '1830313', '721770', '1658974', '313921', '692325', '368461', '985252', '290240', '1251117', '1538562', '422046', '1630032', '1181653', '125066', '1837263', '1656997', '441', '490006', '1643057', '165954', '69049', '1199388', '1507218', '1329673', '509136', '1466695', '16687', '508419', '268880', '969961', '340902', '253378', '256155', '863620', '1683671', '1560798', '675553', '1748098', '458865', '1665924', '1055150', '66385', '215071', '13148', '986080', '236365', '517825', '873311', '441741', '720189', '572737', '1225926', '624119', '997868', '515426', '691257', '419206', '1130476', '100471', '6461', '1807548', '1544601', '407787', '380030', '1152266', '1065150', '694778', '811554', '1854529', '444117', '1099590', '922315', '1217477', '1779802', '369061', '775743', '72992', '144419', '552889', '1181556', '1292830', '1778514', '1489202', '914269', '1706337', '1196929', '184181', '314027', '1227737', '559948', '784834', '1704396', '1256508', '1508836', '317087', '96486', '747998', '1632274', '950708', '1649807', '446890', '593993', '814566', '1292672', '560408', '1077779', '978883', '393982', '844217', '398230', '183055', '53060', '1210135', '916178', '1532407', '1139738', '1518821', '728959', '1304148', '491724', '1568275', '712403', '1728481', '660217', '821176', '1222683', '1778005', '1195123', '1817074', '974513', '426701', '1111638', '1240027', '1664639', '1464379', '521007', '1199739', '578456', '1439699', '284928', '494919', '491912', '232568', '923474', '99386', '1643092', '1790124', '1061993', '621986', '1122877', '100662', '1473138', '1030173', '71586', '1096287', '1138157', '262640', '602945', '1300130', '1338721', '1270177', '39801', '1692635', '56624', '211659', '1646283', '324374', 
'255385', '1255526', '1786203', '1406143', '1788514', '289251', '672936', '452286', '137862', '185683', '1430', '1380422', '845912', '775802', '647375', '145796', '355527', '146542', '1410218', '345442', '190717', '371036', '1797336', '120994', '1718571', '1054043', '4558', '428059', '1396897', '1201117', '1158485', '1089656', '519981', '43015', '520964', '1494349', '1094063', '1392684', '978574', '1052143', '1118795', '1687088', '1314160', '162771', '911024', '1820168', '1192318', '91766', '143489', '1004985', '518421', '166275', '370104', '974150', '546915', '1323563', '1798085', '938123', '182313', '1364401', '9506', '557187', '112370', '611777', '1159485', '1403348', '683930', '797900', '1383582', '114608', '350383', '1604331', '568871', '1047323', '394651', '165898', '283949', '810556', '105425', '1013875', '1464119', '1312394', '1695169', '58536', '1169598', '1125874', '1665958', '769476', '594319', '683707', '882361', '1302321', '450679', '254550', '1033539', '1301128', '1320428', '41154', '1657029', '1227578', '171871', '1792745', '288902', '453868', '271254', '409591', '143722', '535764', '1830350', '578047', '230266', '111402', '773754', '1245031', '1350576', '1624207', '1807992', '1015799', '1794740', '511024', '789525', '319777', '1132669', '1327710', '1272568', '1390168', '1533260', '617767', '638910', '496086', '1205039', '1626665', '191596', '1810513', '1556267', '1100153', '207238', '1501543', '834402', '279588', '568816', '1632682', '822260', '343317', '430137', '1768788', '545282', '279954', '165473', '828347', '1470816', '1327112', '1529515', '1016007', '270386', '1702078', '286404', '1088273', '1322387', '1643857', '489043', '380855', '1083556', '1619528', '583350', '132853', '546862', '1253587', '535138', '264437', '943235', '1620828', '1006607', '553760', '828792', '1624460', '1434951', '833541', '212690', '200229', '1064862', '220330', '1579543', '363926', '1258350', '1184051', '720391', '1459592', '457690', '38548', '81369', '1679222', 
'390074', '286007', '378270', '816642', '283001', '372084', '411601', '910971', '1590440', '135775', '1112005', '75424', '213834', '689492', '1005355', '1139329', '808335', '720425', '1267233', '263546', '1222854', '258056', '837513', '940506', '1103175', '1378900', '1385626', '237112', '730612', '301649', '273771', '497029', '736059', '1193481', '797044', '1144902', '1030001', '719277', '1119289', '1337197', '942773', '982474', '584235', '1707268', '1754255', '1104478', '1534921', '128481', '470969', '347013', '509587', '408644', '772685', '1733430', '1317735', '848134', '404829', '267884', '953680', '1303696', '884333', '968388', '1201708', '1112434', '303328', '1304264', '1133757', '1724836', '1334405', '1829066', '925761', '946016', '552534', '943383', '1100246', '1846843', '1088146', '544438', '1753939', '74810', '1807078', '100915', '1236323', '803592', '429972', '393687', '1378937', '456043', '1613185', '613184', '417913', '1563559', '1339387', '1502489', '656071', '365604', '1151482', '1259752', '277596', '673808', '161493', '873580', '832327', '260612', '924572', '1064547', '1125330', '1641045', '1151695', '256879', '394244', '556588', '1305678', '1263185', '136826', '1399892', '557148', '1358190', '1776190', '249236', '1492533', '1303288', '521017', '1066272', '541133', '1623539', '137859', '687241', '237814', '1369332', '371264', '24081', '1552898', '1502059', '1047404', '1023221', '177279', '1267817', '1411135', '191656', '980600', '951516', '499404', '1695509', '811244', '238763', '1284303', '585143', '1033260', '942257', '1349353', '1429932', '140492', '1044892', '418808', '698145', '1796223', '59227', '194957', '269275', '730734', '1145222', '253742', '581098', '45351', '66070', '426605', '1050966', '529688', '1801056', '1718077', '1266182', '129555', '1531233', '74473', '302447', '215843', '792070', '1104761', '1573381', '202553', '60314', '1503921', '280964', '711987', '136821', '832921', '1419515', '1662966', '1819530', '716942', '219736', 
'436016', '1735969', '713752', '60858', '121707', '689812', '193395', '1624062', '1330056', '563645', '1492653', '1449544', '376209', '1750188', '1478352', '410699', '777880', '1029514', '108914', '720269', '1448513', '74549', '972109', '215002', '404357', '1647764', '550693', '1255375', '1293865', '1264570', '896848', '789563', '826347', '903589', '1018558', '277290', '1683375', '1496790', '1112399', '860557', '127350', '1015623', '312660', '233953', '1565217', '1639977', '1607902', '397905', '490534', '1513419', '174443', '1215224', '66269', '275494', '209655', '516500', '1675849', '836893', '947869', '789401', '1553981', '155710', '496679', '821652', '1139493', '286234', '128146', '1207153', '1199733', '1778364', '1704065', '326315', '317132', '1824346', '319345', '1219375', '99297', '1850878', '755324', '1737932', '1556261', '1389561', '128767', '24850', '1105008', '1046487', '390245', '899371', '623036', '1190883', '1218126', '334762', '1496567', '1228970', '540795', '689403', '1465965', '1585171', '734591', '1257610', '685476', '784313', '1178416', '1468942', '883627', '1000719', '952670', '51709', '933442'} class NytDoc(NamedTuple): doc_id: str headline: str body: str source_xml: str def default_text(self): """ headline and body """ return f'{self.headline} {self.body}' class NytDocs(BaseDocs): def __init__(self, dlc): self._dlc = dlc def docs_path(self, force=True): return self._dlc.path(force) def docs_cls(self): return NytDoc def docs_iter(self): return iter(self.docs_store()) def _docs_iter(self): BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup with self._dlc.stream() as stream: with tarfile.open(fileobj=stream, mode='r|gz') as tgz_outer: for member_o in tgz_outer: if not member_o.isfile() or not (member_o.name.endswith('.tar') or member_o.name.endswith('.tgz')): continue file = tgz_outer.extractfile(member_o) with tarfile.open(fileobj=file, mode='r|gz' if member_o.name.endswith('.tgz') else 'r|') as tgz_inner: for member_i in tgz_inner: if not 
member_i.isfile():  # (completes `if not` from the nested-tar loop opened above)
                            continue
                        full_xml = tgz_inner.extractfile(member_i).read()
                        soup = BeautifulSoup(full_xml, 'lxml-xml')
                        # doc id comes from the <doc-id id-string=...> attribute.
                        did = soup.find('doc-id')
                        did = did['id-string'] if did else ''
                        headline = soup.find('hl1') # 'headline' element can contain multiple (e.g. hl2 for online)
                        headline = headline.get_text() if headline else ''
                        # Body text is the block tagged class="full_text".
                        full_text = soup.find('block', {'class': 'full_text'})
                        full_text = full_text.get_text().strip() if full_text else ''
                        yield NytDoc(did, headline, full_text, full_xml)

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path()}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )

    def docs_count(self):
        # Only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'


class NytQueries(BaseQueries):
    """Weak-supervision queries: each article's headline, keyed by its doc id."""
    def __init__(self, collection):
        self._collection = collection

    def queries_iter(self):
        for doc in self._collection.docs_iter():
            yield GenericQuery(doc.doc_id, doc.headline)

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return 'en'


class NytQrels(BaseQrels):
    """Weak-supervision qrels: each headline-query is relevant to its own article."""
    def __init__(self, collection):
        self._collection = collection

    def qrels_iter(self):
        for doc in self._collection.docs_iter():
            yield GenericQrel(doc.doc_id, doc.doc_id, 1)

    def qrels_defs(self):
        return QREL_DEFS


def _init():
    """Build and register the nyt dataset, the trec-core-2017 subset, and the
    weak-supervision (wksup) train/valid splits."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # v2 docstore migration: body text extraction was added after v1.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')
    collection = migrator(NytDocs(dlc['source']))
    base = Dataset(collection, documentation('_'))

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup: headline->article weak supervision; VALID_IDS holds out the validation split.
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        # NOTE(review): the full 'wksup' subset reuses the 'wksup/train'
        # documentation key — presumably a copy/paste; confirm the yaml keys.
        documentation('wksup/train'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets


base, subsets = _init()

================================================ FILE: ir_datasets/datasets/pmc.py ================================================

import codecs
import tarfile
import itertools
from typing import NamedTuple, Tuple
from zipfile import ZipFile
import xml.etree.ElementTree as ET
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
from ir_datasets.formats import BaseDocs, GenericQuery, TrecQrels, TrecXmlQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS

_logger = ir_datasets.log.easy()

QREL_DEFS = {
    0: 'not relevant',
    1: 'possibly relevant',
    2: 'definitely relevant'
}

# Maps TREC CDS topic XML tag names to query-tuple field names.
QUERY_FILE_MAP = {
    'number': 'query_id',
    'type': 'type',
    'description': 'description',
    'summary': 'summary',
    'note': 'note',
}

NAME = 'pmc'


class PmcDoc(NamedTuple):
    doc_id: str
    journal: str
    title: str
    abstract: str
    body: str

    def default_text(self):
        """
        title, abstract, and body
        """
        return f'{self.title} {self.abstract} {self.body}'


class TrecCdsQuery(NamedTuple):
    query_id: str
    type: str
    description: str
    summary: str

    def
class TrecCds2016Query(NamedTuple):
    """Topic for the TREC CDS 2016 track; extends earlier topics with an EHR `note` field."""
    query_id: str
    type: str
    note: str
    description: str
    summary: str

    def default_text(self):
        """
        description
        """
        return self.description


# Some source files are missing the space between a tag name and its first
# attribute. These byte-level patches repair the known cases before parsing.
_XML_TAG_FIXES = (
    (b'<xrefref-type=', b'<xref ref-type='),
    (b'<tex-mathid=', b'<tex-math id='),
    (b'<graphicxlink:href=', b'<graphic xlink:href='),
    (b'<ext-linkext-link-type=', b'<ext-link ext-link-type='),
    (b'<pub-idpub-id-type=', b'<pub-id pub-id-type='),
)


def _first_element_text(root, xpath):
    """Concatenated text of the first element matching `xpath`, or '' if absent."""
    node = root.find(xpath)
    return '\n'.join(node.itertext()) if node is not None else ''


class PmcDocs(BaseDocs):
    """PubMed Central full-text documents, streamed from .tar.gz source archives.

    Args:
        dlcs: download contexts for the source archives.
        path: base path for the local docstore.
        duplicate_dlcs: download contexts listing names of known duplicate
            files to skip. FIX: previously defaulted to a shared mutable `[]`
            (mutable-default-argument pitfall); now a None sentinel with
            identical behavior for all existing callers.
        count_hint: expected number of documents (docstore sizing hint).
    """
    def __init__(self, dlcs, path, duplicate_dlcs=None, count_hint=None):
        self._dlcs = dlcs
        self._path = path
        self._duplicate_dlcs = duplicate_dlcs if duplicate_dlcs is not None else []
        self._count_hint = count_hint

    def docs_iter(self):
        return iter(self.docs_store())

    def _docs_iter(self):
        # There's a set of known "duplicate" files, which are not considered
        # for scoring. Skip them.
        duplicate_file_names = set()
        for dlc in self._duplicate_dlcs:
            with dlc.stream() as f:
                for line in codecs.getreader('utf8')(f):
                    duplicate_file_names.update(line.split())
        for dlc in self._dlcs:
            # mode='r|gz' reads the tar as a forward-only stream (the
            # underlying download stream is not seekable).
            with dlc.stream() as f, tarfile.open(fileobj=f, mode='r|gz') as tarf:
                for member in tarf:
                    if not member.isfile() or member.name in duplicate_file_names:
                        continue
                    raw = tarf.extractfile(member).read()
                    # Some files have a problem where spaces are missing
                    # between tag and attributes. Fix those here.
                    for broken, fixed in _XML_TAG_FIXES:
                        raw = raw.replace(broken, fixed)
                    root = ET.fromstring(raw)
                    # e.g. "dir/12345.nxml" -> "12345"
                    doc_id = member.name.split('/')[-1].split('.')[0]
                    yield PmcDoc(
                        doc_id,
                        _first_element_text(root, './/journal-title'),
                        _first_element_text(root, './/article-title'),
                        _first_element_text(root, './/abstract'),
                        _first_element_text(root, './/body'))

    def docs_path(self, force=True):
        return self._path

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path()}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )

    def docs_cls(self):
        return PmcDoc

    def docs_namespace(self):
        return NAME

    def docs_count(self):
        # Only known once the docstore has been built; implicitly None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_lang(self):
        return 'en'
def _init():
    """Build and register the pmc datasets (v1/v2 corpora + TREC CDS tracks)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    def build_collection(version, duplicate_dlcs):
        # Each corpus version ships as four tar.gz source archives.
        sources = [dlc[f'{version}/source{i}'] for i in range(4)]
        return PmcDocs(
            sources,
            ir_datasets.util.home_path()/NAME/version/'corpus',
            duplicate_dlcs=duplicate_dlcs,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{version}'))

    # v1 has a known-duplicates list; v2 does not.
    v1_collection = build_collection('v1', [dlc['v1/dup1'], dlc['v1/dup2']])
    v2_collection = build_collection('v2', [])

    base = Dataset(documentation('_'))
    subsets['v1'] = Dataset(v1_collection, documentation('v1'))
    subsets['v2'] = Dataset(v2_collection, documentation('v2'))

    # (corpus version, track year, query class) for each TREC CDS edition.
    cds_editions = [
        ('v1', 2014, TrecCdsQuery),
        ('v1', 2015, TrecCdsQuery),
        ('v2', 2016, TrecCds2016Query),
    ]
    for version, year, query_cls in cds_editions:
        track = f'trec-cds-{year}'
        key = f'{version}/{track}'
        collection = v1_collection if version == 'v1' else v2_collection
        subsets[key] = Dataset(
            collection,
            TrecXmlQueries(dlc[f'{track}/queries'], query_cls, QUERY_FILE_MAP, namespace=track, lang='en'),
            TrecQrels(dlc[f'{track}/qrels'], QREL_DEFS),
            documentation(key),
        )

    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])

    return base, subsets


base, subsets = _init()
# What do the relevance levels in qrels mean?
QREL_DEFS = {
    2: 'highly relevant',
    1: 'partially relevant',
    0: 'not relevant',
}


class SaraDoc(NamedTuple):
    """A single SARA document, read from the source CSV."""
    doc_id: str
    text: str
    sensitivity: int  # sensitivity label from the CSV; value range not validated here

    def default_text(self):
        return self.text


class SaraDocs(BaseDocs):
    """Documents read from a zipped CSV with columns docno/text/sensitivity."""

    def __init__(self, dlc):
        super().__init__()
        self._dlc = dlc

    def docs_iter(self):
        return iter(self.docs_store())

    def _docs_iter(self):
        # Some document bodies exceed csv's default field-size limit.
        # FIX: this was previously hard-coded to 229739, which raises
        # "field larger than field limit" for any bigger document. Raise the
        # limit as high as the platform allows instead (standard back-off,
        # since some platforms reject sys.maxsize with OverflowError).
        import sys
        limit = sys.maxsize
        while True:
            try:
                csv.field_size_limit(limit)
                break
            except OverflowError:
                limit = int(limit / 10)
        with self._dlc.stream() as stream:
            with zipfile.ZipFile(stream) as zf:
                # Adjust this if the filename inside differs
                with zf.open(zf.namelist()[0]) as f:
                    # utf-8-sig strips a potential BOM; errors="replace"
                    # tolerates stray undecodable bytes.
                    text_stream = io.TextIOWrapper(
                        f, encoding="utf-8-sig", errors="replace", newline=""
                    )
                    reader = csv.DictReader(text_stream)
                    for row in reader:
                        yield SaraDoc(
                            doc_id=row["docno"],
                            text=row["text"],
                            sensitivity=int(row["sensitivity"])
                        )

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )

    def docs_count(self):
        # Only known once the docstore has been built; implicitly None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'

    def docs_cls(self):
        return SaraDoc


# An initialization function is used to keep the namespace clean
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    # Load an object that is used for providing the documentation
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path)

    docs = SaraDocs(dlc["docs"])
    queries = TsvQueries(dlc['queries'], namespace=NAME, lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)

    # Package the docs, queries, qrels, and documentation into a Dataset object
    dataset = Dataset(docs, queries, qrels, documentation('_'))

    # Register the dataset in ir_datasets
    ir_datasets.registry.register(NAME, dataset)
    return dataset

# used for exposing dataset to the namespace
# Build and register the sara dataset at import time.
_init()


================================================
FILE: ir_datasets/datasets/touche.py
================================================
from typing import Dict
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ToucheQueries, ToucheTitleQueries, \
    ToucheComparativeQueries, ToucheQrels, ToucheQualityQrels, \
    ToucheQualityComparativeStanceQrels, ToucheControversialStanceQrels, \
    ToucheQualityCoherenceQrels, TouchePassageDocs
from ir_datasets.util import DownloadConfig, home_path, Cache, ZipExtract, GzipExtract

NAME = "touche"

# Relevance scales differ per year/task; 2020 task 1 uses a graded 1-5 scale
# with -2 marking spam/non-arguments.
QRELS_DEFS_2020_TASK_1: Dict[int, str] = {
    -2: "spam, non-argument",
    1: "very low relevance",
    2: "low relevance",
    3: "moderate relevance",
    4: "high relevance",
    5: "very high relevance",
}
QRELS_DEFS_2020_TASK_2: Dict[int, str] = {
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
QRELS_DEFS_2021_TASK_1: Dict[int, str] = {
    -2: "spam",
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
QRELS_DEFS_2021_TASK_2: Dict[int, str] = {
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
QRELS_DEFS_2022_TASK_1: Dict[int, str] = {
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
QRELS_DEFS_2022_TASK_2: Dict[int, str] = {
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
QRELS_DEFS_2022_TASK_3: Dict[int, str] = {
    0: "not relevant",
    1: "relevant",
}


def _init():
    """Build and register all Touché task datasets (2020-2022)."""
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)

    # Helpers that cache each (possibly extracted) download under a
    # predictable "<name>.<extension>" file below base_path.
    def cached_download(name: str, extension: str) -> Cache:
        return Cache(
            download_config[name],
            base_path / f"{name}.{extension}"
        )

    def cached_zip_download(name: str, zip_path: str, extension: str) -> Cache:
        # Extracts a single member (zip_path) from a downloaded zip.
        return Cache(
            ZipExtract(
                download_config[name],
                zip_path
            ),
            base_path / f"{name}.{extension}"
        )

    def cached_gzip_download(name: str, extension: str) -> Cache:
        return Cache(
            GzipExtract(download_config[name]),
            base_path / f"{name}.{extension}"
        )

    # Define and create task datasets.
    # Keys are registry IDs; docs are borrowed from the host corpora
    # (argsme / clueweb12 / touche-image) already present in the registry.
    task_base_datasets = {
        f"argsme/2020-04-01/{NAME}-2020-task-1": Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2020/task-1/queries", "topics-task-1.xml", "xml"),
                namespace=f"argsme/2020-04-01/{NAME}-2020-task-1",
                language="en",
            ),
            ToucheQrels(
                cached_download("2020/task-1/qrels", "qrels"),
                QRELS_DEFS_2020_TASK_1,
            ),
            documentation("2020/task-1"),
        ),
        f"clueweb12/{NAME}-2020-task-2": Dataset(
            registry["clueweb12"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2020/task-2/queries", "topics-task-2.xml", "xml"),
                namespace=f"clueweb12/{NAME}-2020-task-2",
                language="en",
            ),
            ToucheQrels(
                cached_download("2020/task-2/qrels", "qrels"),
                QRELS_DEFS_2020_TASK_2,
            ),
            documentation("2020/task-2"),
        ),
        f"argsme/2020-04-01/{NAME}-2021-task-1": Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            ToucheTitleQueries(
                cached_zip_download("2021/task-1/queries", "topics-task-1-only-titles.xml", "xml"),
                namespace=f"argsme/2020-04-01/{NAME}-2021-task-1",
                language="en",
            ),
            # 2021 tasks carry a second, quality, judgment file.
            ToucheQualityQrels(
                cached_download("2021/task-1/qrels-relevance", "qrels"),
                cached_download("2021/task-1/qrels-quality", "qrels"),
                QRELS_DEFS_2021_TASK_1,
            ),
            documentation("2021/task-1"),
        ),
        f"clueweb12/{NAME}-2021-task-2": Dataset(
            registry["clueweb12"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2021/task-2/queries", "topics-task2-51-100.xml", "xml"),
                namespace=f"clueweb12/{NAME}-2021-task-2",
                language="en",
            ),
            ToucheQualityQrels(
                cached_download("2021/task-2/qrels-relevance", "qrels"),
                cached_download("2021/task-2/qrels-quality", "qrels"),
                QRELS_DEFS_2021_TASK_2,
            ),
            documentation("2021/task-2"),
        ),
        f"argsme/2020-04-01/processed/{NAME}-2022-task-1": Dataset(
            registry["argsme/2020-04-01/processed"].docs_handler(),
            ToucheQueries(
                cached_download("2022/task-1/queries", "xml"),
                # NOTE(review): namespace says "2020-04-01-processed" while the
                # registry key uses "2020-04-01/processed" — verify intentional.
                namespace=f"argsme/2020-04-01-processed/{NAME}-2022-task-1",
                language="en",
            ),
            # 2022 task 1 adds a third, coherence, judgment file.
            ToucheQualityCoherenceQrels(
                cached_download("2022/task-1/qrels-relevance", "qrels"),
                cached_download("2022/task-1/qrels-quality", "qrels"),
                cached_download("2022/task-1/qrels-coherence", "qrels"),
                QRELS_DEFS_2022_TASK_1,
            ),
            documentation("2022/task-1"),
        ),
        f"clueweb12/{NAME}-2022-task-2": Dataset(
            # 2022 task 2 ships its own passage corpus rather than reusing
            # the full clueweb12 docs.
            TouchePassageDocs(
                cached_gzip_download("2022/task-2/passages", "jsonl"),
                namespace=f"clueweb12/{NAME}-2022-task-2",
                language="en",
                count_hint=868655,
            ),
            ToucheComparativeQueries(
                cached_zip_download("2022/task-2/queries", "topics-task2.xml", "xml"),
                namespace=f"clueweb12/{NAME}-2022-task-2",
                language="en",
            ),
            ToucheQualityComparativeStanceQrels(
                cached_download("2022/task-2/qrels-relevance", "qrels"),
                cached_download("2022/task-2/qrels-quality", "qrels"),
                cached_download("2022/task-2/qrels-stance", "qrels"),
                QRELS_DEFS_2022_TASK_2,
            ),
            documentation("2022/task-2"),
        ),
        f"touche-image/2022-06-13/{NAME}-2022-task-3": Dataset(
            registry["touche-image/2022-06-13"].docs_handler(),
            ToucheQueries(
                cached_download("2022/task-3/queries", "xml"),
                # NOTE(review): namespace "touche/touche-2022-task-3" does not
                # mirror the registry key's "touche-image/..." prefix — verify.
                namespace=f"{NAME}/{NAME}-2022-task-3",
                language="en",
            ),
            ToucheControversialStanceQrels(
                cached_download("2022/task-3/qrels", "qrels"),
                QRELS_DEFS_2022_TASK_3,
            ),
            documentation("2022/task-3"),
        ),
    }
    for name, dataset in task_base_datasets.items():
        registry.register(name, dataset)

    # Define and create task sub-datasets.
    # These reuse queries/qrels handlers from the base datasets above.
    task_sub_datasets = {
        f"argsme/1.0/{NAME}-2020-task-1/uncorrected": Dataset(
            registry["argsme/1.0"].docs_handler(),
            registry[f"argsme/2020-04-01/{NAME}-2020-task-1"].queries_handler(),
            ToucheQrels(
                cached_download("2020/task-1/qrels-argsme-1.0-uncorrected", "qrels"),
                QRELS_DEFS_2020_TASK_1,
                # Uncorrected judgments contain fractional scores.
                allow_float_score=True,
            ),
            documentation("2020/task-1/argsme-1.0/uncorrected"),
        ),
        f"argsme/2020-04-01/{NAME}-2020-task-1/uncorrected": Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            registry[f"argsme/2020-04-01/{NAME}-2020-task-1"].queries_handler(),
            ToucheQrels(
                cached_download("2020/task-1/qrels-argsme-2020-04-01-uncorrected", "qrels"),
                QRELS_DEFS_2020_TASK_1,
                allow_float_score=True,
            ),
            documentation("2020/task-1/argsme-2020-04-01/uncorrected"),
        ),
        f"clueweb12/{NAME}-2022-task-2/expanded-doc-t5-query": Dataset(
            # Same queries/qrels as task 2, but a doc-T5-query-expanded corpus.
            TouchePassageDocs(
                cached_gzip_download("2022/task-2/passages-expanded-doc-t5-query", "jsonl"),
                namespace=f"clueweb12/{NAME}-2022-task-2",
                language="en",
                count_hint=868655
            ),
            registry[f"clueweb12/{NAME}-2022-task-2"].queries_handler(),
            registry[f"clueweb12/{NAME}-2022-task-2"].qrels_handler(),
            documentation("2022/task-2/expanded-doc-t5-query"),
        ),
    }
    for name, dataset in task_sub_datasets.items():
        registry.register(name, dataset)

    return task_base_datasets, task_sub_datasets


# Build and register at import time.
_init()


================================================
FILE: ir_datasets/datasets/touche_image.py
================================================
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ToucheImageDocs
from ir_datasets.util import DownloadConfig, home_path, Cache

NAME = "touche-image"
datasets = { f"2022-06-13": Dataset( ToucheImageDocs( cached_download("2022-06-13/images-main", "zip"), cached_download("2022-06-13/images-nodes", "zip"), cached_download("2022-06-13/images-png", "zip"), namespace=f"{NAME}/2022-06-13", language="en", count_hint=23841, ), documentation("2022-06-13"), ) } # NOTE: the following datasets are defined in touche.py: # - touche-image/2022-06-13/touche-2022-task-3 # Register datasets. registry.register(NAME, base) for name, images in datasets.items(): registry.register(f'{NAME}/{name}', images) return base, datasets dataset = _init() ================================================ FILE: ir_datasets/datasets/trec_arabic.py ================================================ import ir_datasets from ir_datasets.util import DownloadConfig from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries from ir_datasets.datasets.base import Dataset, YamlDocumentation NAME = 'trec-arabic' QREL_DEFS = { 1: 'relevant', 0: 'not relevant', } QTYPE_MAP = { '<num> *(Number:)? 
QTYPE_MAP = {
    '<num> *(Number:)? *AR': 'query_id',  # Remove AR prefix from QIDs
    '<title> *(Topic:)?': 'title',
    '<desc> *(Description:)?': 'description',
    '<narr> *(Narrative:)?': 'narrative'
}


def _init():
    """Build and register the trec-arabic collection and its yearly subsets."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME,
        lang='ar',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(docs, documentation('_'))

    # The 2001 and 2002 editions differ only in their query/qrel files.
    subsets = {}
    for year in ('ar2001', 'ar2002'):
        subsets[year] = Dataset(
            TrecQueries(dlc[f'{year}/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME, lang='ar'),
            TrecQrels(dlc[f'{year}/qrels'], QREL_DEFS),
            docs,
            documentation(year))

    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])

    return base, subsets


base, subsets = _init()
================================================
FILE: ir_datasets/datasets/trec_cast.py
================================================
import gzip
from hashlib import md5
import os
from functools import cached_property, lru_cache, partial
from collections import defaultdict
import re
import json
import itertools
from typing import List, NamedTuple, Optional, Tuple
import ir_datasets
from ir_datasets.util import BaseDownload, DownloadConfig, Lazy
from ir_datasets.formats import (
    TrecQrels,
    TrecScoredDocs,
    BaseDocs,
    BaseQueries,
    GenericDoc,
)
from ir_datasets.datasets.base import (
    Dataset,
    YamlDocumentation,
    FilteredQueries,
    FilteredScoredDocs,
)
from ir_datasets.util.docs.lazy import (
    IRDSDocuments,
    BaseTransformedDocs,
    IterDocs,
    LazyDocs,
    TransformedDocs,
)
from ir_datasets.util.docs.multiple import PrefixedDocs, PrefixedDocsSpec
import numpy as np
from ir_datasets.util.docs.subset import ColonCommaDupes, DocsSubset, Dupes
from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS

_logger = ir_datasets.log.easy()

NAME = "trec-cast"

# Graded relevance scale used for the evaluation qrels (2019 onward).
QRELS_DEFS = {
    4: "Fully meets. The passage is a perfect answer for the turn. It includes all of the information needed to fully answer the turn in the conversation context. It focuses only on the subject and contains little extra information.",
    3: "Highly meets. The passage answers the question and is focused on the turn. It would be a satisfactory answer if Google Assistant or Alexa returned this passage in response to the query. It may contain limited extraneous information.",
    2: "Moderately meets. The passage answers the turn, but is focused on other information that is unrelated to the question. The passage may contain the answer, but users will need extra effort to pick the correct portion. The passage may be relevant, but it may only partially answer the turn, missing a small aspect of the context.",
    1: "Slightly meets. The passage includes some information about the turn, but does not directly answer it. Users will find some useful information in the passage that may lead to the correct answer, perhaps after additional rounds of conversation (better than nothing).",
    0: "Fails to meet. The passage is not relevant to the question. The passage is unrelated to the target query.",
}

# Coarser scale used for the 2019 training qrels.
QRELS_DEFS_TRAIN = {
    2: "very relevant",
    1: "relevant",
    0: "not relevant",
}


class CastPassage(NamedTuple):
    # A single passage of a segmented document.
    passage_id: str
    text: str
    marked_up_text: str


class CastDoc(NamedTuple):
    # A full document; `passages` holds its segmented text pieces.
    doc_id: str
    title: str
    url: str
    passages: Tuple[CastPassage, ...]

    def default_text(self):
        """
        Combines the title and text of constituent passages.
        """
        return "\n".join([self.title] + [p.text for p in self.passages])


class CastPassageDoc(NamedTuple):
    # A passage exposed as its own document ("<doc_id>-<passage_index>").
    doc_id: str
    title: str
    url: str
    text: str

    def default_text(self):
        """
        Combines the title from the source document with the text of this passage.
        """
        return f"{self.title}\n{self.text}"


# One query class per CAsT year; field sets follow each year's topic JSON.
class Cast2019Query(NamedTuple):
    query_id: str
    raw_utterance: str
    topic_number: int
    turn_number: int
    topic_title: str
    topic_description: str

    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance


class Cast2020Query(NamedTuple):
    query_id: str
    raw_utterance: str
    automatic_rewritten_utterance: str
    manual_rewritten_utterance: str
    manual_canonical_result_id: str
    topic_number: int
    turn_number: int

    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance


class Cast2021Query(NamedTuple):
    query_id: str
    raw_utterance: str
    automatic_rewritten_utterance: str
    manual_rewritten_utterance: str
    canonical_result_id: str
    topic_number: int
    turn_number: int

    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance


class Cast2022Query(NamedTuple):
    query_id: str
    parent_id: str
    participant: str
    raw_utterance: str
    manual_rewritten_utterance: str
    response: str
    provenance: List[str]
    topic_number: int
    turn_number: int

    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance


class CastPassageIter:
    """Iterates passages (as CastPassageDoc) over a docstore of CastDoc,
    honoring a global passage-index `slice`. `doc_psg_offsets` is a callable
    returning cumulative passage counts per document, used to seek the first
    document when the slice does not start at 0."""
    def __init__(self, docstore, doc_psg_offsets, slice):
        self.next_psg_index = 0
        self.docstore = docstore
        self.doc_iter = iter(docstore)
        self.doc = None
        self.slice = slice
        if self.slice.start != 0:
            # Binary-search the offsets to find the doc containing slice.start.
            start_doc_idx = (
                int(np.searchsorted(doc_psg_offsets(), self.slice.start, side="right")) - 1
            )
            self.doc_iter = self.doc_iter[start_doc_idx:]
            self.next_psg_index = self.slice.start - doc_psg_offsets()[start_doc_idx]
        self.doc_psg_offsets = doc_psg_offsets

    def __next__(self):
        if self.slice.start >= self.slice.stop or self.doc is StopIteration:
            raise StopIteration
        if self.doc is None:
            self.doc = next(self.doc_iter, StopIteration)
        # Skip forward whole documents until the target passage index falls
        # within the current document.
        while self.next_psg_index >= len(self.doc.passages):
            self.next_psg_index -= len(self.doc.passages)
            self.doc = next(self.doc_iter, StopIteration)
            if self.doc is StopIteration:
                raise StopIteration
        result = self.doc.passages[self.next_psg_index]
        result = CastPassageDoc(
            f"{self.doc.doc_id}-{self.next_psg_index+1}",
            self.doc.title,
            self.doc.url,
            self.doc.passages[self.next_psg_index],
        )
        # Advance by the slice step; the slice itself tracks global position.
        self.next_psg_index += self.slice.step or 1
        self.slice = slice(
            self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step
        )
        return result

    def __iter__(self):
        return self

    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step]
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return CastPassageIter(self.docstore, self.doc_psg_offsets, new_slice)
        elif isinstance(key, int):
            # it[index]
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = CastPassageIter(self.docstore, self.doc_psg_offsets, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError(e)
        raise TypeError("key must be int or slice")
class CastPassageDocstore(ir_datasets.indices.Docstore):
    """Docstore that serves individual passages ("<doc_id>-<idx>") by looking
    up the parent documents in the underlying CastDoc docstore."""
    def __init__(self, docs_docstore, options=DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(GenericDoc, "doc_id", options=options)
        self._docs_docstore = docs_docstore

    def get_many_iter(self, doc_ids):
        passage_ids = list(doc_ids)
        # Group requested passage indexes (0-based) by parent document ID.
        did2pids = defaultdict(set)
        for pid in passage_ids:
            if pid.count("-") >= 1:
                did, idx = pid.rsplit("-", 1)
                if idx.isnumeric():
                    did2pids[did].add(int(idx) - 1)
        for doc in self._docs_docstore.get_many_iter(did2pids.keys()):
            for idx in did2pids[doc.doc_id]:
                # Silently skip out-of-range passage indexes.
                if len(doc.passages) > idx:
                    passage = doc.passages[idx]
                    yield CastPassageDoc(
                        f"{doc.doc_id}-{idx+1}", doc.title, doc.url, passage
                    )


class LazyCastPassageIter:
    """Iterates passages of a CastPassageDocs collection without requiring
    the docstore to be built, except for random access (__getitem__)."""
    def __init__(self, docs: "CastPassageDocs"):
        self._docs = docs
        self._doc_iter = docs._docs.docs_iter()
        self._doc = None
        self._passage_ix = None

    def __iter__(self):
        return self

    def __next__(self):
        # Move to the next document once the current one is exhausted.
        while (self._doc is None) or (len(self._doc.passages) <= self._passage_ix):
            self._doc = next(self._doc_iter)
            self._passage_ix = 0
        self._passage_ix += 1
        # Passage IDs are 1-based: "<doc_id>-<passage_number>".
        return CastPassageDoc(
            f"{self._doc.doc_id}-{self._passage_ix}",
            self._doc.title,
            self._doc.url,
            self._doc.passages[self._passage_ix - 1],
        )

    def __getitem__(self, key):
        docstore = self._docs._docs.docs_store()

        @lru_cache()
        def offsets_fn():
            """Stores the number of passages for each document of the initial collection"""
            offsets_path = f"{str(docstore.path)}.psg_offsets.np"
            if not os.path.exists(offsets_path):
                # Build a cumulative passage-count array (one pass over the
                # docstore) and persist it for future random access.
                offsets = np.empty(docstore.count() + 1, dtype=np.uint32)
                count = 0
                for i, doc in enumerate(
                    _logger.pbar(
                        iter(docstore),
                        total=docstore.count(),
                        desc="building passage offset file",
                    )
                ):
                    offsets[i] = count
                    count += len(doc.passages)
                offsets[-1] = count
                with ir_datasets.util.finialized_file(offsets_path, "wb") as fout:
                    fout.write(offsets.tobytes())
                return offsets
            else:
                # Memory-map the previously built offsets file.
                return np.memmap(offsets_path, dtype=np.uint32, mode="r")

        passage_iter = CastPassageIter(
            docstore, offsets_fn, slice(0, self._docs._count, 1)
        )
        return passage_iter[key]
class CastPassageDocs(BaseDocs):
    """Exposes a segmented CastDoc collection as a passage-level collection."""
    def __init__(self, docs, count):
        super().__init__()
        self._docs = docs
        # Total passage count (known a priori; see the counts in _init).
        self._count = count

    def docs_iter(self):
        return LazyCastPassageIter(self)

    def docs_cls(self):
        return CastPassageDoc

    def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
        # Delegates storage to the document-level docstore; passages are
        # reconstructed on lookup.
        return CastPassageDocstore(self._docs.docs_store(field, options=options), options=options)

    def docs_count(self):
        return self._count

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return "en"


class SegmentedDocs(BaseTransformedDocs):
    """Segmented document collection based on pre-computed offsets

    segments_dl points to a compressed JSONL file where the ranges refer to the
    original document text, e.g.:
    {"id":"MARCO_00_1454834","ranges":[[[0,917]],[[918,2082]],[[2083,3220]],[[3221,3763]]],"md5":"f0577db28de265852932224525710486"}
    """
    def __init__(self, docs, segments_dl: BaseDownload, store_name: str):
        super().__init__(docs, CastDoc, store_name)
        self._segments_dl = segments_dl

    def docs_iter(self):
        # Process files
        with self._segments_dl.stream() as fin, gzip.open(fin) as offsets_stream:
            # Documents and offset records are aligned line-by-line.
            for doc, data_json in zip(self._docs, offsets_stream):
                data = json.loads(data_json)
                assert (
                    doc.doc_id == data["id"]
                ), f"Error in processing offsets, docids differ: expected {data['id']} (offset), got {doc.doc_id} (document)"
                body: str = doc.passages[0]
                # The md5 over a delimited concatenation of the extracted
                # ranges guards against mismatched offsets/document versions.
                computer = md5()
                passages = []
                for ranges in data["ranges"]:
                    texts = []
                    computer.update(b"\x00")
                    for start, end in ranges:
                        computer.update(b"\x01")
                        text = body[start:end]
                        texts.append(text)
                        computer.update(text.encode("utf-8"))
                    passages.append(" ".join(texts))
                assert computer.digest().hex() == data["md5"]
                yield doc._replace(passages=passages)
class CastQueries(BaseQueries):
    """Reads a CAsT topics JSON file and yields year-specific query tuples
    (the year is selected by the `query_type` class passed at construction)."""
    def __init__(self, dlc, query_type):
        super().__init__()
        self._dlc = dlc
        self._query_type = query_type

    def queries_iter(self):
        with self._dlc.stream() as stream:
            topics = json.load(stream)
            for topic in topics:
                topic_number = topic["number"]
                for turn in topic["turn"]:
                    turn_number = turn["number"]
                    # Query IDs are "<topic>_<turn>" for every year.
                    if self._query_type is Cast2019Query:
                        yield Cast2019Query(
                            f"{topic_number}_{turn_number}",
                            turn["raw_utterance"],
                            topic_number,
                            turn_number,
                            topic["title"],
                            topic.get("description", ""),
                        )
                    elif self._query_type is Cast2020Query:
                        yield Cast2020Query(
                            f"{topic_number}_{turn_number}",
                            turn["raw_utterance"],
                            turn["automatic_rewritten_utterance"],
                            turn["manual_rewritten_utterance"],
                            turn["manual_canonical_result_id"],
                            topic_number,
                            turn_number,
                        )
                    elif self._query_type is Cast2021Query:
                        yield Cast2021Query(
                            f"{topic_number}_{turn_number}",
                            turn["raw_utterance"],
                            turn["automatic_rewritten_utterance"],
                            turn["manual_rewritten_utterance"],
                            turn["canonical_result_id"],
                            topic_number,
                            turn_number,
                        )
                    elif self._query_type is Cast2022Query:
                        # 2022 turns form a tree; qualify the parent turn ID
                        # with the topic number when present.
                        if parent_id := turn.get("parent"):
                            parent_id = f"{topic_number}_{parent_id}"
                        yield Cast2022Query(
                            f"{topic_number}_{turn_number}",
                            parent_id,
                            turn["participant"],
                            turn.get("utterance"),
                            turn.get("manual_rewritten_utterance"),
                            turn.get("response"),
                            turn.get("provenance", []),
                            topic_number,
                            turn_number,
                        )

    def queries_cls(self):
        return self._query_type

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return "en"


class WapoV4Docs(IRDSDocuments):
    """Washington Post v4 documents converted to CastDoc (single passage,
    HTML tags stripped, paragraph subtype only)."""
    def __init__(self, dsid: str):
        super().__init__(dsid)

    def docs_cls(self):
        return CastDoc

    def docs_iter(self):
        # Naive tag-stripping regex applied to the concatenated body.
        CLEANR = re.compile("<.*?>")
        dup_dids = set()
        for data in self.docs.docs_handler().docs_wapo_raw_iter():
            # Keep only the first occurrence of each document ID.
            if data["id"] in dup_dids:
                continue
            dup_dids.add(data["id"])
            doc_id = str(data["id"])
            title = data.get("title", "No Title")
            if data["article_url"]:
                # Relative URLs are resolved against washingtonpost.com.
                if "www.washingtonpost.com" not in data["article_url"]:
                    url = "https://www.washingtonpost.com" + data["article_url"]
                else:
                    url = data["article_url"]
            else:
                url = ""
            body = ""
            if data.get("contents") and len(data["contents"]) > 0:
                for item in data["contents"]:
                    # if item is not None and item.get('subtype') == 'paragraph':
                    if item is not None and item.get("subtype") == "paragraph":
                        body += " " + item["content"]
            body = re.sub(CLEANR, "", body)
            body = body.replace("\n", " ").strip()
            # Empty documents are skipped entirely.
            if body:
                yield CastDoc(doc_id, title, url, [body])


class KiltCastDocs(TransformedDocs):
    """KILT (Wikipedia) documents converted to CastDoc (single passage)."""
    def __init__(self, dsid: str):
        super().__init__(LazyDocs(dsid), CastDoc)

    def docs_iter(self):
        for doc in map(
            self.transform, self._docs.docs.docs_handler().docs_kilt_raw_iter()
        ):
            if doc is not None:
                yield doc

    def transform(self, doc):
        title = doc["wikipedia_title"]
        body = " ".join(doc["text"]).replace("\n", " ").strip()
        url = doc["history"]["url"]
        return CastDoc(doc["wikipedia_id"], title, url, [body])


class WapoDupes(Dupes):
    """Duplicate list in "<base_id> <wapo_id> ..." format: a wapo_id counts as
    a duplicate whenever it differs from its base_id."""
    @cached_property
    def doc_ids(self):
        doc_ids = set()
        with self._base.stream() as fp:
            for line in fp:
                base_id, wapo_id, *__ = line.strip().split(b" ", 3)
                if base_id != wapo_id:
                    if doc_id := self._remove_prefix(wapo_id.decode("utf-8")):
                        doc_ids.add(doc_id)
        return doc_ids
def transform_msmarco_v1(doc):
    # MS MARCO v1 document -> single-passage CastDoc (newlines flattened).
    return CastDoc(
        doc.doc_id, doc.title, doc.url, [doc.body.replace("\n", " ").strip()]
    )


def transform_msmarco_v2(doc):
    # MS MARCO v2 document -> CastDoc; the "msmarco_doc_" ID prefix is dropped.
    doc_id = doc.doc_id[len("msmarco_doc_") :]
    return CastDoc(doc_id, doc.title, doc.url, [doc.body.replace("\n", " ").strip()])


def _init():
    """Build and register the trec-cast datasets (v0/v1 active; v2/v3 corpora
    currently disabled — see the triple-quoted sections below)."""
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")

    def wapo_converter(dsid, dupes: Dupes):
        BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
        # NOTE: These rules are very specific in order to replicate the behaviour present in the official script
        # here: <https://github.com/grill-lab/trec-cast-tools/blob/8fa243a7e058ce4b1b378c99768c53546460c0fe/src/main/python/wapo_trecweb.py>
        # Specifically, things like skipping empty documents, filtering by "paragraph" subtype, and starting the
        # paragraph index at 1 are all needed to perfectly match the above script.
        # Note that the script does NOT strip HTML markup, which is meant to be removed out in a later stage (e.g., indexing).
        # We do that here for user simplicity, as it will allow the text to be consumed directly by various models
        # without the need for further pre-processing. (Though a bit of information is lost.)
        for wapo_doc in ir_datasets.load(dsid).docs_handler().docs_wapo_raw_iter():
            doc_id = wapo_doc["id"]
            # Ignore this one
            if dupes.has(doc_id):
                continue
            pid = itertools.count(1)  # paragrah index starts at 1
            for paragraph in wapo_doc["contents"]:
                if (
                    paragraph is not None
                    and paragraph.get("subtype") == "paragraph"
                    and paragraph["content"] != ""
                ):
                    text = paragraph["content"]
                    if paragraph.get("mime") == "text/html":
                        text = BeautifulSoup(
                            f"<OUTER>{text}</OUTER>", "lxml-xml"
                        ).get_text()
                    yield GenericDoc(f"WAPO_{doc_id}-{next(pid)}", text)

    # --- Version 0 and 1 (2019 and 2020)
    # https://github.com/daltonj/treccastweb#year-2-trec-2020
    # documents = MARCO Ranking passages (v1) and Wikipedia (TREC CAR)
    # Version 0 contains WAPO (but this is not used)
    docs_v0 = PrefixedDocs(
        f"{NAME}/docs_v0",
        PrefixedDocsSpec(
            "WAPO_",
            IterDocs(
                f"{NAME}/v1/wapo-v2",
                partial(wapo_converter, "wapo/v2", ColonCommaDupes(dlc["wapo_dupes"], prefix="WAPO_")),
            ),
        ),
        PrefixedDocsSpec(
            "MARCO_",
            DocsSubset(
                f"{NAME}/v1/msmarco-passages",
                LazyDocs("msmarco-passage"),
                ColonCommaDupes(dlc["marco_dupes"], prefix="MARCO_"),
            ),
        ),
        PrefixedDocsSpec("CAR_", LazyDocs("car/v2.0")),
    )
    docs_v1 = PrefixedDocs(
        f"{NAME}/docs_v1",
        PrefixedDocsSpec(
            "MARCO_",
            DocsSubset(
                f"{NAME}/v1/msmarco-passages",
                LazyDocs("msmarco-passage"),
                ColonCommaDupes(dlc["marco_dupes"], prefix="MARCO_"),
            ),
        ),
        PrefixedDocsSpec("CAR_", LazyDocs("car/v2.0")),
    )

    base = Dataset(documentation("_"))
    subsets["v0"] = Dataset(docs_v0)
    subsets["v0/train"] = Dataset(
        docs_v0,
        CastQueries(dlc["2019/train/queries"], Cast2019Query),
        TrecQrels(dlc["2019/train/qrels"], QRELS_DEFS_TRAIN),
        TrecScoredDocs(dlc["2019/train/scoreddocs"]),
    )
    # "judged" variants filter queries/scoreddocs to those with qrels.
    qids_train_v0 = Lazy(lambda: {q.query_id for q in subsets["v0/train"].qrels_iter()})
    subsets["v0/train/judged"] = Dataset(
        docs_v0,
        FilteredQueries(subsets["v0/train"].queries_handler(), qids_train_v0),
        subsets["v0/train"].qrels_handler(),
        FilteredScoredDocs(subsets["v0/train"].scoreddocs_handler(), qids_train_v0),
    )
    subsets["v1"] = Dataset(docs_v1)
    subsets["v1/2019"] = Dataset(
        docs_v1,
        CastQueries(dlc["2019/eval/queries"], Cast2019Query),
        TrecQrels(dlc["2019/eval/qrels"], QRELS_DEFS),
        TrecScoredDocs(dlc["2019/eval/scoreddocs"]),
    )
    qids_2019 = Lazy(lambda: {q.query_id for q in subsets["v1/2019"].qrels_iter()})
    subsets["v1/2019/judged"] = Dataset(
        docs_v1,
        FilteredQueries(subsets["v1/2019"].queries_handler(), qids_2019),
        subsets["v1/2019"].qrels_handler(),
        FilteredScoredDocs(subsets["v1/2019"].scoreddocs_handler(), qids_2019),
    )
    subsets["v1/2020"] = Dataset(
        docs_v1,
        CastQueries(dlc["2020/queries"], Cast2020Query),
        TrecQrels(dlc["2020/qrels"], QRELS_DEFS),
    )
    qids_2020 = Lazy(lambda: {q.query_id for q in subsets["v1/2020"].qrels_iter()})
    subsets["v1/2020/judged"] = Dataset(
        docs_v1,
        FilteredQueries(subsets["v1/2020"].queries_handler(), qids_2020),
        subsets["v1/2020"].qrels_handler(),
    )

    # --- Version 2 (2021)
    # https://github.com/daltonj/treccastweb#year-3-trec-2021
    # Documents = WAPO 2020, KILT and MS Marco v1 (documents)
    # We provide passage offsets for the three document collections
    # Duplicates are in two files:
    # wapo-near-duplicates for WAPO
    # marco_duplicates.txt for MS-MARCO
    def register_docs(namespace: str, use_docs: bool, *tuples):
        """Register all documents (sub)collections

        Tuples: (name prefix, document ID prefix, raw documents, passage count)
        """
        all_docs_spec = []
        all_passages_spec = []
        passages = []
        for dsid, prefix, raw, count in tuples:
            prefixed = PrefixedDocs(None, PrefixedDocsSpec(prefix, raw))
            subsets[f"{namespace}/{dsid}"] = Dataset(prefixed)
            segmented = SegmentedDocs(
                prefixed,
                dlc[f"{namespace}/offsets/{dsid}"],
                f"{NAME}/docs_{namespace}_{dsid}",
            )
            subsets[f"{namespace}/{dsid}/segmented"] = Dataset(segmented)
            passage = CastPassageDocs(segmented, count)
            passages.append(passage)
            subsets[f"{namespace}/{dsid}/passages"] = Dataset(passage)
            # Add this
            all_docs_spec.append(
                PrefixedDocsSpec(prefix, (raw if use_docs else passage), not use_docs)
            )
            all_passages_spec.append(
                PrefixedDocsSpec(prefix, passage, True)
            )
        # All documents together
        all_docs = PrefixedDocs(f"{NAME}/docs_{namespace}", *all_docs_spec)
        subsets[f"{namespace}"] = Dataset(all_docs)
        if use_docs:
            # Add a passage dataset
            # NOTE(review): this stores a PrefixedDocs (not wrapped in
            # Dataset) into subsets — verify this is intentional.
            subsets[f"{namespace}/passages"] = PrefixedDocs(f"{NAME}/passages_{namespace}", *all_passages_spec)
        return all_docs
    # The v2 corpus construction is currently disabled (kept verbatim below).
    """
    docs_v2 = register_docs(
        "v2",
        True,
        (
            "msmarco",
            "MARCO_",
            TransformedDocs(
                DocsSubset(
                    f"{NAME}/v2/msmarco-documents",
                    LazyDocs("msmarco-document"),
                    ColonCommaDupes(dlc["v2/dupes/marco_v1"]),
                ),
                CastDoc,
                transform_msmarco_v1,
            ),
            19_092_817,
        ),
        (
            "wapo",
            "WAPO_",
            DocsSubset(
                f"{NAME}/v2/wapo-v4",
                WapoV4Docs("wapo/v4"),
                WapoDupes(dlc["v2/dupes/wapo"]),
            ),
            3_728_553,
        ),
        ("kilt", "KILT_", KiltCastDocs("kilt"), 17_124_025),
    )
    """
    #subsets["v2/2021"] = Dataset(
    #    docs_v2,
    #    CastQueries(dlc["2021/queries"], Cast2021Query),
    #    TrecQrels(dlc["2021/qrels"], QRELS_DEFS),
    #)

    # --- Version 3 (2022)
    # https://github.com/daltonj/treccastweb#year-4-trec-2022
    # Official documents = processed (split) WAPO 2020, KILT, MS Marco V2
    v3_dupes = dlc["v3/dupes"]
    # The v3 corpus construction is currently disabled (kept verbatim below).
    """
    docs_v3 = register_docs(
        "v3",
        False,
        (
            "msmarco",
            "MARCO_",
            DocsSubset(
                f"{NAME}/v3/msmarco-documents-v2",
                TransformedDocs(
                    LazyDocs("msmarco-document-v2"), CastDoc, transform_msmarco_v2
                ),
                Dupes(v3_dupes, prefix="MARCO_"),
            ),
            86_326_322,
        ),
        (
            "wapo",
            "WAPO_",
            DocsSubset(
                f"{NAME}/v3/wapo-v4",
                WapoV4Docs("wapo/v4"),
                Dupes(v3_dupes, prefix="WAPO_"),
            ),
            2_963_130,
        ),
        (
            "kilt",
            "KILT_",
            DocsSubset(
                f"{NAME}/v3/kilt-v4",
                KiltCastDocs("kilt"),
                Dupes(v3_dupes, prefix="KILT_"),
            ),
            17_111_488,
        ),
    )
    """
    #subsets["v3/2022"] = Dataset(
    #    docs_v3,
    #    CastQueries(dlc["2022/queries"], Cast2022Query),
    #    TrecQrels(dlc["2022/qrels"], QRELS_DEFS),
    #)

    # --- Register all datasets
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(
            f"{NAME}/{s}", Dataset(subsets[s], documentation(s))
        )
    return base, subsets


base, subsets = _init()
================================================ FILE: ir_datasets/datasets/trec_fair.py ================================================ import json import codecs from typing import NamedTuple, Dict, List, Optional import ir_datasets from ir_datasets.util import GzipExtract, Cache, Lazy from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, Deprecated from ir_datasets.formats import BaseQueries, BaseDocs, BaseQrels, TrecQrel from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS from itertools import chain _logger = ir_datasets.log.easy() NAME = 'trec-fair' QREL_DEFS = { 1: "relevant" } class FairTrecDoc(NamedTuple): doc_id: str title: str text: str marked_up_text: str url: str quality_score: Optional[float] geographic_locations: Optional[List[str]] quality_score_disk: Optional[str] def default_text(self): """ title and text """ return f"{self.title} {self.text}" class FairTrec2022Doc(NamedTuple): doc_id: str title: str text: str url: str pred_qual: Optional[float] qual_cat: Optional[str] page_countries: Optional[List[str]] page_subcont_regions: Optional[List[str]] source_countries: Optional[Dict[str, int]] source_subcont_regions: Optional[Dict[str, int]] gender: Optional[List[str]] occupations: Optional[List[str]] years: Optional[List[int]] num_sitelinks: Optional[int] relative_pageviews: Optional[float] first_letter: Optional[str] creation_date: Optional[str] first_letter_category: Optional[str] gender_category: Optional[str] creation_date_category: Optional[str] years_category: Optional[str] relative_pageviews_category: Optional[str] num_sitelinks_category: Optional[str] def default_text(self): """ title and text """ return f'{self.title} {self.text}' class FairTrecQuery(NamedTuple): query_id: str text: str keywords: List[str] scope: str homepage: str def default_text(self): """ text """ return self.text class FairTrec2022TrainQuery(NamedTuple): query_id: str text: str url: str def default_text(self): """ text 
""" return self.text class FairTrecEvalQuery(NamedTuple): query_id: str text: str keywords: List[str] scope: str def default_text(self): """ text """ return self.text class FairTrecDocs(BaseDocs): def __init__(self, dlc, mlc): super().__init__() self._dlc = dlc self._mlc = mlc def docs_iter(self): return iter(self.docs_store()) def _docs_iter(self): def _metadata_iter(): with self._mlc.stream() as stream2: for metadata_line in stream2: yield json.loads(metadata_line) textifier = ir_datasets.lazy_libs.pyautocorpus().Textifier() metadata_iter = _metadata_iter() next_metadata = None with self._dlc.stream() as stream1: for line in stream1: data1 = json.loads(line) if next_metadata is None: next_metadata = next(metadata_iter, None) if next_metadata is not None: if data1['id'] == next_metadata['page_id']: match = next_metadata next_metadata = None try: plaintext = textifier.textify(data1['text']) except ValueError as err: message, position = err.args if message == "Expected markup type 'comment'": # unmatched <!-- comment tag # The way Wikipedia renders this is it cuts the article off at this point. # We'll follow that here, given it's only 22 articles of the 6M. # (Note: the position is a byte offset, so that's why it encodes/decodes.) 
plaintext = textifier.textify(data1['text'].encode()[:position].decode()) else: raise if match: # has metadata yield FairTrecDoc(str(data1['id']), data1['title'], plaintext, data1['text'], data1['url'], match['quality_score'], match['geographic_locations'], str(match['quality_score_disc'])) else: # no metadata yield FairTrecDoc(str(data1['id']), data1['title'], plaintext, data1['text'], data1['url'], None, None, None) def docs_cls(self): return FairTrecDoc def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f'{ir_datasets.util.home_path()/NAME}/2021/docs.pklz4', init_iter_fn=self._docs_iter, data_cls=self.docs_cls(), lookup_field=field, size_hint=30735927055, index_fields=['doc_id'], count_hint=ir_datasets.util.count_hint(NAME), options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return NAME def docs_lang(self): return 'en' class FairTrecQueries(BaseQueries): def __init__(self, dlc, qtype): super().__init__() self._dlc = dlc self._qtype = qtype def queries_iter(self): with self._dlc.stream() as stream: for line in stream: data = json.loads(line) if self._qtype is FairTrecEvalQuery: yield FairTrecEvalQuery(str(data['id']), data['title'], data["keywords"], data["scope"]) elif self._qtype is FairTrecQuery: yield FairTrecQuery(str(data['id']), data['title'], data["keywords"], data["scope"], data["homepage"]) elif self._qtype is FairTrec2022TrainQuery: yield FairTrec2022TrainQuery(str(data['id']), data['title'], data["url"]) def queries_cls(self): return self._qtype def queries_lang(self): return 'en' class FairTrecQrels(BaseQrels): def __init__(self, qrels_dlc): self._qrels_dlc = qrels_dlc def qrels_path(self): return self._qrels_dlc.path() def qrels_iter(self): with self._qrels_dlc.stream() as stream: for line in stream: data = json.loads(line) for rlDoc in data["rel_docs"]: yield TrecQrel(str(data["id"]), str(rlDoc), 1, "0") def 
qrels_cls(self): return TrecQrel def qrels_defs(self): return QREL_DEFS class JsonlDocs(BaseDocs): def __init__(self, dlc, metadata_dlc, doc_type, field_map, count_hint): super().__init__() self._metadata_dlc = metadata_dlc self._dlc = dlc self._doc_type = doc_type self._field_map = field_map self._count_hint = count_hint def docs_iter(self): return iter(self.docs_store()) def _docs_iter_first(self): metadata = {} with self._metadata_dlc.stream() as stream: for line in _logger.pbar(stream, desc='pre-loading metadata', total=6460238): doc = json.loads(line) metadata[doc['page_id']] = doc with self._dlc.stream() as stream: for line in stream: doc = json.loads(line) if doc['id'] in metadata: doc.update(metadata[doc['id']]) yield self._doc_type(**{dest: self._doc_type.__annotations__[dest](doc.get(src)) if 'typing' not in str(self._doc_type.__annotations__[dest]) else doc.get(src) for dest, src in self._field_map.items()}) def docs_cls(self): return self._doc_type def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f'{self._dlc.path(force=False)}.pklz4', init_iter_fn=self._docs_iter_first, data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], count_hint=self._count_hint, options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return NAME def docs_lang(self): return 'en' def _init(): base_path = ir_datasets.util.home_path()/NAME dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') collection2021 = FairTrecDocs(GzipExtract(dlc["2021/docs"]), GzipExtract(dlc["2021/metadata"])) mapping2022 = {'doc_id': 'id', 'title': 'title', 'url': 'url', 'text': 'plain', 'pred_qual': 'pred_qual','qual_cat': 'qual_cat','page_countries': 'page_countries','page_subcont_regions': 'page_subcont_regions','source_countries': 'source_countries','source_subcont_regions': 
'source_subcont_regions','gender': 'gender','occupations': 'occupations','years': 'years','num_sitelinks': 'num_sitelinks','relative_pageviews': 'relative_pageviews','first_letter': 'first_letter','creation_date': 'creation_date','first_letter_category': 'first_letter_category','gender_category': 'gender_category','creation_date_category': 'creation_date_category','years_category': 'years_category','relative_pageviews_category': 'relative_pageviews_category','num_sitelinks_category': 'num_sitelinks_category'} collection2022 = JsonlDocs(GzipExtract(dlc["2022/docs"]), GzipExtract(dlc["2022/metadata"]), FairTrec2022Doc, mapping2022, ir_datasets.util.count_hint(f'{NAME}/2022')) base = Dataset(documentation('_')) subsets = {} subsets['2021'] = Dataset( collection2021, documentation('_')) train2021_topics = GzipExtract(dlc["2021/train/topics"]) subsets['2021/train'] = Dataset( collection2021, FairTrecQueries(train2021_topics, FairTrecQuery), FairTrecQrels(train2021_topics), documentation('2021/train')) subsets['2021/eval'] = Dataset( collection2021, FairTrecQueries(GzipExtract(dlc["2021/eval/topics"]), FairTrecEvalQuery), FairTrecQrels(GzipExtract(dlc["2021/eval/qrels"])), documentation('2021/eval')) subsets['2022'] = Dataset( collection2022, documentation('2022')) train2022_topics = dlc["2022/train/topics"] subsets['2022/train'] = Dataset( collection2022, FairTrecQueries(train2022_topics, FairTrec2022TrainQuery), FairTrecQrels(train2022_topics), documentation('2022/train')) ir_datasets.registry.register(NAME, base) for s in subsets: ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) # old versions that include the year in the top level ID ir_datasets.registry.register('trec-fair-2021', Dataset(subsets['2021'], Deprecated('acessing TREC Fair Ranking 2021 through trec-fair-2021 is deprecated; use trec-fair/2021 instead.'))) ir_datasets.registry.register('trec-fair-2021/train', Dataset(subsets['2021/train'], Deprecated('acessing TREC Fair Ranking 2021 through 
trec-fair-2021/train is deprecated; use trec-fair/2021/train instead.'))) ir_datasets.registry.register('trec-fair-2021/eval', Dataset(subsets['2021/eval'], Deprecated('acessing TREC Fair Ranking 2021 through trec-fair-2021/train is deprecated; use trec-fair/2021/train instead.'))) # move old version if it's found base_2021 = ir_datasets.util.home_path()/'trec-fair-2021' if base_2021.exists(): if not base_path.exists(): base_path.mkdir(parents=True, exist_ok=True) target = base_path/'2021' if not target.exists(): base_2021.rename(target) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/trec_mandarin.py ================================================ from typing import NamedTuple import ir_datasets from ir_datasets.util import GzipExtract, DownloadConfig from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries from ir_datasets.datasets.base import Dataset, YamlDocumentation NAME = 'trec-mandarin' class TrecMandarinQuery(NamedTuple): query_id: str title_en: str title_zh: str description_en: str description_zh: str narrative_en: str narrative_zh: str def default_text(self): """ title_zh """ return self.title_zh QREL_DEFS = { 1: 'relevant', 0: 'not relevant', } QTYPE_MAP = { '<num> *(Number:)? 
*CH': 'query_id', # Remove CH prefix from QIDs '<E-title> *(Topic:)?': 'title_en', '<C-title> *(Topic:)?': 'title_zh', '<E-desc> *(Description:)?': 'description_en', '<C-desc> *(Description:)?': 'description_zh', '<E-narr> *(Narrative:)?': 'narrative_en', '<C-narr> *(Narrative:)?': 'narrative_zh', } def _init(): subsets = {} documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) collection = TrecDocs(dlc['docs'], encoding='GB18030', path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'], namespace=NAME, lang='zh', count_hint=ir_datasets.util.count_hint(NAME)) base = Dataset(collection, documentation('_')) subsets['trec5'] = Dataset( TrecQueries(GzipExtract(dlc['trec5/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None), # queries have multiple languages TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS), collection, documentation('trec5')) subsets['trec6'] = Dataset( TrecQueries(GzipExtract(dlc['trec6/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None), # queries have multiple languages TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS), collection, documentation('trec6')) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/trec_robust04.py ================================================ import ir_datasets from ir_datasets.util import GzipExtract, Lazy, DownloadConfig from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation, Deprecated NAME = 'trec-robust04' QREL_DEFS = { 2: 'highly relevant', 1: 'relevant', 0: 'not relevant', } DUA = ("Please confirm you agree to the TREC Robust 
2004 data usage agreement found at " "<https://trec.nist.gov/data/cd45/index.html>") # folds from Huston & Croft 2014 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.646.7749> FOLDS = { 'fold1': {'302', '303', '309', '316', '317', '319', '323', '331', '336', '341', '356', '357', '370', '373', '378', '381', '383', '392', '394', '406', '410', '411', '414', '426', '428', '433', '447', '448', '601', '607', '608', '612', '617', '619', '635', '641', '642', '646', '647', '654', '656', '662', '665', '669', '670', '679', '684', '690', '692', '700'}, 'fold2': {'301', '308', '312', '322', '327', '328', '338', '343', '348', '349', '352', '360', '364', '365', '369', '371', '374', '386', '390', '397', '403', '419', '422', '423', '424', '432', '434', '440', '446', '602', '604', '611', '623', '624', '627', '632', '638', '643', '651', '652', '663', '674', '675', '678', '680', '683', '688', '689', '695', '698'}, 'fold3': {'306', '307', '313', '321', '324', '326', '334', '347', '351', '354', '358', '361', '362', '363', '376', '380', '382', '396', '404', '413', '415', '417', '427', '436', '437', '439', '444', '445', '449', '450', '603', '605', '606', '614', '620', '622', '626', '628', '631', '637', '644', '648', '661', '664', '666', '671', '677', '685', '687', '693'}, 'fold4': {'320', '325', '330', '332', '335', '337', '342', '344', '350', '355', '368', '377', '379', '387', '393', '398', '402', '405', '407', '408', '412', '420', '421', '425', '430', '431', '435', '438', '616', '618', '625', '630', '633', '636', '639', '649', '650', '653', '655', '657', '659', '667', '668', '672', '673', '676', '682', '686', '691', '697'}, 'fold5': {'304', '305', '310', '311', '314', '315', '318', '329', '333', '339', '340', '345', '346', '353', '359', '366', '367', '372', '375', '384', '385', '388', '389', '391', '395', '399', '400', '401', '409', '416', '418', '429', '441', '442', '443', '609', '610', '613', '615', '621', '629', '634', '640', '645', '658', '660', '681', '694', '696', '699'} 
} DEPRECATED_MESSAGE = '{} is deprecated. Consider using {} instead, which provides better parsing of the corpus.' def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path, dua=DUA) subsets = {} collection = TrecDocs(dlc['docs'], path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'], namespace=NAME, lang='en', expected_file_count=2295, count_hint=ir_datasets.util.count_hint(NAME)) queries = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en') qrels = TrecQrels(dlc['qrels'], QREL_DEFS) base = Dataset( collection, queries, qrels, documentation('_'), Deprecated(DEPRECATED_MESSAGE.format(NAME, f'disks45/nocr/trec-robust-2004'))) for fold in FOLDS: qid_filter = make_filter(fold) subsets[fold] = Dataset( FilteredQueries(queries, qid_filter), FilteredQrels(qrels, qid_filter), collection, documentation(fold), Deprecated(DEPRECATED_MESSAGE.format(f'{NAME}/{fold}', f'disks45/nocr/trec-robust-2004/{fold}'))) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets def make_filter(fold): return Lazy(lambda: FOLDS[fold]) base, subsets = _init() ================================================ FILE: ir_datasets/datasets/trec_spanish.py ================================================ from typing import NamedTuple import ir_datasets from ir_datasets.util import GzipExtract, DownloadConfig from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries from ir_datasets.datasets.base import Dataset, YamlDocumentation NAME = 'trec-spanish' class TrecDescOnlyQuery(NamedTuple): query_id: str description: str def default_text(self): """ description """ return self.description class TrecSpanish3Query(NamedTuple): query_id: str title_es: str title_en: str description_es: str description_en: str narrative_es: str narrative_en: str def default_text(self): """ 
title_es """ return self.title_es class TrecSpanish4Query(NamedTuple): query_id: str description_es1: str description_en1: str description_es2: str description_en2: str def default_text(self): """ description_es1 """ return self.description_es1 QREL_DEFS = { 1: 'relevant', 0: 'not relevant', } QTYPE_MAP_3 = { '<num> *(Number:)? *SP': 'query_id', # Remove SP prefix from QIDs '<title> *(Topic:)?': 'title', '<desc> *(Description:)?': 'description', '<narr> *(Narrative:)?': 'narrative', } QTYPE_MAP_4 = { '<num> *(Number:)? *SP': 'query_id', # Remove SP prefix from QIDs '<desc> *(Description:)?': 'description', } # TREC Spanish has this strange convention where lines that start with ** are # translations of the query. Rather than trying to bake this into TrecQueries, # I'm using an adapter to apply this just for TREC Spanish. class TrecSpanishTranslateQueries: def __init__(self, parent, query_cls): self._parent = parent self._query_cls = query_cls def __getattr__(self, attr): return getattr(self._parent, attr) def queries_iter(self): qcls = self._query_cls for query in self._parent.queries_iter(): qid = query.query_id tup = [qid,] for value in query[1:]: tup.append('') for line in value.split('\n'): if line.strip() == '': tup[-1] = tup[-1].strip() tup.append('') # Translations begin with ** if line.lstrip().startswith('**'): line = line.lstrip()[2:] tup[-1] += line.strip() + ' ' # Sometimes not all translations are available. 
Fill in remaining with blanks tup += [''] * (len(qcls._fields) - len(tup)) yield qcls(*tup) def queries_cls(self): return self._query_cls def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') collection = TrecDocs(dlc['docs'], encoding='ISO-8859-1', path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'], namespace=NAME, lang='es', count_hint=ir_datasets.util.count_hint(NAME)) base = Dataset(collection, documentation('_')) subsets['trec3'] = Dataset( TrecSpanishTranslateQueries(TrecQueries(GzipExtract(dlc['trec3/queries']), qtype_map=QTYPE_MAP_3, encoding='ISO-8859-1', namespace=NAME, lang=None), TrecSpanish3Query), TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS), collection, documentation('trec3')) subsets['trec4'] = Dataset( TrecSpanishTranslateQueries(TrecQueries(GzipExtract(dlc['trec4/queries']), qtype=TrecDescOnlyQuery, qtype_map=QTYPE_MAP_4, encoding='ISO-8859-1', namespace=NAME, lang=None), TrecSpanish4Query), TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS), collection, documentation('trec4')) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/trec_tot.py ================================================ import ir_datasets from ir_datasets.util import ZipExtract, Cache, Lazy, DownloadConfig from ir_datasets.formats import TrecQrels, JsonlQueries, JsonlDocs, TrecQrels from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation, Deprecated from typing import NamedTuple, List, Dict NAME = 'trec-tot' class TipOfTheTongueDoc(NamedTuple): doc_id: str page_title: str wikidata_id: str wikidata_classes: List[str] text: str sections: Dict[str, str] infoboxes: List[Dict[str, str]] def default_text(self): """ We 
use the title and text of the TipOfTheTongueQuery as default_text because that is everything available for users who want to respond to such an information need. """ return self.page_title + ' ' + self.text class TipOfTheTongueDoc2024(NamedTuple): doc_id: str title: str wikidata_id: str text: str sections: Dict[str, str] def default_text(self): """ We use the title and text of the TipOfTheTongueQuery as default_text because that is everything available for users who want to respond to such an information need. """ return self.title + ' ' + self.text class TipOfTheTongueQuery2024(NamedTuple): query_id: str query: str def default_text(self): return self.query class TipOfTheTongueQuery(NamedTuple): query_id: str url: str domain: str title: str text: str sentence_annotations: List[Dict[str, str]] def default_text(self): return self.title + ' ' + self.text QUERY_MAP = {'query_id': 'id', 'url': 'url', 'domain': 'domain', 'title': 'title', 'text': 'text', 'sentence_annotations': 'sentence_annotations'} def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) subsets = {} main_dlc = dlc['2023'] base = Dataset( documentation('_'), ) ir_datasets.registry.register(NAME, base) docs_2023_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'TREC-TOT/corpus.jsonl'), base_path/'2023/corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en') subsets['2023'] = Dataset( docs_2023_handler, documentation('2023'), ) ir_datasets.registry.register(f'{NAME}/2023', subsets['2023']) for s in ['train', 'dev']: subsets[f'2023/{s}'] = Dataset( docs_2023_handler, JsonlQueries(Cache(ZipExtract(main_dlc, f'TREC-TOT/{s}/queries.jsonl'), base_path/f'2023/{s}/queries.jsonl'), query_cls=TipOfTheTongueQuery, mapping=QUERY_MAP, lang='en'), TrecQrels(Cache(ZipExtract(main_dlc, f'TREC-TOT/{s}/qrel.txt'), base_path/f'2023/{s}/qrel.txt'), {0: 'Not Relevant', 1: 'Relevant'}), documentation(f'2023/{s}'), ) 
ir_datasets.registry.register(f'{NAME}/2023/{s}', subsets[f'2023/{s}']) main_dlc = dlc['2024'] docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc2024, lang='en') subsets['2024'] = Dataset( docs_2024_handler, documentation('2024'), ) ir_datasets.registry.register(f'{NAME}/2024', subsets['2024']) for s in ['test']: subsets[f'2024/{s}'] = Dataset( docs_2024_handler, JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery2024, lang='en'), documentation(f'2024/{s}'), ) ir_datasets.registry.register(f'{NAME}/2024/{s}', subsets[f'2024/{s}']) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/trec_tot_2025.py ================================================ from ir_datasets import registry from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.util.download import RequestsDownload from ir_datasets.formats.base import BaseDocs from ir_datasets.indices import Docstore from ir_datasets.util import ZipExtractCache, home_path, Cache, DownloadConfig from ir_datasets.formats import BaseDocs, TrecQrels, JsonlQueries from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS import os import gzip import json from tqdm import tqdm from typing import NamedTuple NAME = "trec-tot" class JsonlDocumentOffset(NamedTuple): doc_id: str offset_start: int offset_end: int class TrecToT2025Doc(NamedTuple): doc_id: str title: str url: str text: str @staticmethod def _from_json(json_doc): return TrecToT2025Doc(json_doc["id"], json_doc["title"], json_doc["url"], json_doc["text"]) def default_text(self): return self.title + " " + self.text class JsonlWithOffsetsDocsStore(Docstore): def __init__(self, docs, offsets, options=DEFAULT_DOCSTORE_OPTIONS): self.__docs = docs self.__offsets = offsets self._docs_dict = None 
self._id_field = "doc_id" self._options = options def offsets_iter(self): with gzip.open(self.__offsets.path(), "rt") as f: for i in f: i = json.loads(i) yield JsonlDocumentOffset(doc_id=i["id"], offset_start=i["offset_start"], offset_end=i["offset_end"]) def docs_dict(self): return PickleLz4FullStore( path=str(self.__offsets.path()) + '.pklz4', init_iter_fn=self.offsets_iter, data_cls=JsonlDocumentOffset, lookup_field="doc_id", index_fields=("doc_id",), options=self._options ) def get_many_iter(self, doc_ids): offsets = self.docs_dict() with open(self.__docs.path(), "rb") as f: for doc in doc_ids: doc = offsets.get(doc) f.seek(doc.offset_start) raw_content_bytes = f.read(doc.offset_end - doc.offset_start) yield gzip.decompress(raw_content_bytes) class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore): def get_many_iter(self, doc_ids): for i in super().get_many_iter(doc_ids): yield TrecToT2025Doc._from_json(json.loads(i)) class JsonlDocumentsWithOffsets(BaseDocs): def __init__(self, docs, offsets): self.__docs = docs self.__offsets = offsets def docs_iter(self): with gzip.open(self.__docs.path()) as f: for l in f: yield TrecToT2025Doc._from_json(json.loads(l)) def docs_cls(self): return TrecToT2025Doc def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return TrecToT2025DocsStore(self.__docs, self.__offsets, options=options) def docs_namespace(self): raise ValueError("ToDo: Implement this") def docs_count(self): return 6407814 def docs_lang(self): return "en" class TrecToT2025Dataset(Dataset): def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None, documentation=None): docs = JsonlDocumentsWithOffsets(docs_jsonl_file, offset_jsonl_file) if queries: queries = JsonlQueries(queries, lang='en', mapping={"text": "query", "query_id": "query_id"}) if qrels: qrels = TrecQrels(qrels, {0: 'Not Relevant', 1: 'Relevant'}) super().__init__(docs, queries, qrels, documentation) def register_dataset(): if f"{NAME}/2025" in registry: 
return dlc = DownloadConfig.context("trec-tot-2025", home_path() / NAME / "2025") documentation = YamlDocumentation(f'docs/{NAME}.yaml') doc_offsets = dlc['trec-tot-2025-offsets.jsonl.gz'] doc_corpus = dlc['trec-tot-2025-corpus.jsonl.gz'] registry.register(f"{NAME}/2025", TrecToT2025Dataset(doc_corpus, doc_offsets, documentation=documentation("2025"))) for i in ["train", "dev1", "dev2", "dev3"]: qrels = dlc[i + "-2025-qrel.txt"] queries = dlc[i + "-2025-queries.jsonl"] registry.register(f"{NAME}/2025/{i}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels, documentation(f"2025/{i}"))) # datasets that currently do not have qrels for i in ["test"]: queries = dlc[i + "-2025-queries.jsonl"] registry.register(f"{NAME}/2025/{i}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, None, documentation(f"2025/{i}"))) register_dataset() ================================================ FILE: ir_datasets/datasets/tripclick.py ================================================ from pathlib import Path import json import re import os import io import hashlib from datetime import datetime from typing import NamedTuple, Tuple import contextlib import ir_datasets from ir_datasets.util import TarExtract, TarExtractAll, RelativePath, DownloadConfig, Cache, IterStream from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries, GenericQuery, TrecScoredDocs, BaseQueries, TsvDocPairs, BaseQrels, BaseScoredDocs, TsvDocs, BaseQlogs from ir_datasets.datasets.base import Dataset, YamlDocumentation _logger = ir_datasets.log.easy() NAME = 'tripclick' QREL_DEFS = { 1: 'clicked', 0: 'not clicked and appeared higher in search results', } QREL_DCTR_DEFS = { 3: 'highly relevant; clicked more than 30% of the times it was shown', 2: 'relevant; clicked more than 4% but less than 30% of times it was shown', 1: 'partially relevant; clicked less than 4% of times it was shown (but at least once)', 0: 'not relevant; never clicked', } QTYPE_MAP = { '<num> *(Number:)? 
*': 'query_id', '<title> *': 'text', } Q_HASH_LEN = 11 class ConcatQueries(BaseQueries): def __init__(self, queries): self._queries = queries def queries_iter(self): for q in self._queries: yield from q.queries_iter() def queries_path(self): return None def queries_cls(self): return self._queries[0].queries_cls() def queries_namespace(self): return self._queries[0].queries_namespace() def queries_lang(self): return self._queries[0].queries_lang() class ConcatQrels(BaseQrels): def __init__(self, qrels): self._qrels = qrels def qrels_iter(self): for q in self._qrels: yield from q.qrels_iter() def qrels_path(self): return None def qrels_cls(self): return self._qrels[0].qrels_cls() def qrels_defs(self): return self._qrels[0].qrels_defs() class ConcatScoreddocs(BaseScoredDocs): def __init__(self, scoreddocs): self._scoreddocs = scoreddocs def scoreddocs_iter(self): for q in self._scoreddocs: yield from q.scoreddocs_iter() def scoreddocs_path(self, force=True): return None def scoreddocs_cls(self): return self._scoreddocs[0].scoreddocs_cls() class LogItem(NamedTuple): doc_id: str clicked: bool class TripClickQlog(NamedTuple): session_id: str query_id: str query: str query_orig: str time: datetime items: Tuple[LogItem, ...] 
class TripClickPartialDoc(NamedTuple):
    # Partial document record sourced from allarticles.txt: only ID, title, and
    # URL are available (no body text).
    doc_id: str
    title: str
    url: str
    def default_text(self):
        """
        title
        """
        return self.title


class TripClickQlogs(BaseQlogs):
    """Query log records parsed from the TripClick ``logs/*.json`` files (one JSON record per line)."""
    def __init__(self, dlc):
        self.dlc = dlc

    def qlogs_iter(self):
        # Iterate the log files in sorted order so the sequence is deterministic.
        for file in sorted(Path(self.dlc.path()).glob('logs/*.json')):
            with file.open('rt') as fin:
                for line in fin:
                    record = json.loads(line)
                    # DateCreated is in the .NET "/Date(<millis>)/" serialization; pull out the epoch millis
                    time = re.match(r'^/Date\(([0-9]+)\)/$', record['DateCreated']).group(1)
                    # Normalize the query: drop boolean operators and the "title:" field prefix,
                    # then collapse whitespace with the project tokenizer.
                    query_norm = re.sub(r'\b(AND|OR)\b', ' ', record['Keywords']).replace('title:', ' ')
                    query_norm = ' '.join(ir_datasets.util.ws_tok(query_norm))
                    items = [LogItem(str(did), did == record['DocumentId']) for did in record['Documents']]
                    # If a document was clicked but wasn't in the listed results, append it as a clicked item
                    if record['DocumentId'] and not any(i.clicked for i in items):
                        items += [LogItem(str(record['DocumentId']), True)]
                    yield TripClickQlog(
                        record['SessionId'],
                        # query IDs are a truncated md5 of the normalized query text
                        hashlib.md5(query_norm.encode()).hexdigest()[:Q_HASH_LEN],
                        query_norm,
                        record['Keywords'],
                        datetime.fromtimestamp(int(time)/1000),
                        tuple(items))

    def qlogs_cls(self):
        return TripClickQlog

    def qlogs_count(self):
        # known total number of log records in the distribution
        return 5_317_350


class DocPairGenerator:
    """Builds a qid/did1/did2 docpair file by matching the raw-text training triples back to IDs.

    The upstream triples file contains raw query/document text; this links each
    record back to IDs by hashing normalized text of both the collection docs
    and the queries, then writes the matched ID triples to a cache file.
    """
    def __init__(self, docpair_dlc, collection, queries, cache_path):
        self._docpair_dlc = docpair_dlc
        self._collection = collection
        self._queries = queries
        self._cache_path = cache_path

    def path(self, force=True):
        # Build the cache file on first request (when force-ing); afterwards just return its path.
        if force and not os.path.exists(self._cache_path):
            _logger.info('tripclick includes docpairs in an expanded format (with raw text). Linking these records back to the query and doc IDs.')
            SPACES = re.compile(r'\s+')
            # Map a short (6-byte md5) hash of each document's normalized text to its doc_id.
            doc_map = {}
            for doc in _logger.pbar(self._collection.docs_iter(), desc='build doc lookup', unit='doc'):
                doctext = SPACES.sub(' ', f'{doc.title} <eot> {doc.text}').strip()
                dochash = hashlib.md5(doctext.encode()).digest()[:6]
                doc_map[dochash] = doc.doc_id
            # Same idea for queries: short hash of normalized text -> query_id.
            query_map = {}
            for query in _logger.pbar(self._queries.queries_iter(), desc='build query lookup', unit='query'):
                queryhash = hashlib.md5(SPACES.sub(' ', query.text).strip().encode()).digest()[:6]
                query_map[queryhash] = query.query_id
            with ir_datasets.util.finialized_file(self._cache_path, 'wt') as fout, \
                 self._docpair_dlc.stream() as stream, \
                 _logger.pbar_raw(desc='building docpairs', total=23_222_038, unit='docpair') as pbar:
                skipped = 0
                for line in stream:
                    pbar.update()
                    query, doc1, doc2 = line.strip().split(b'\t')
                    queryhash = hashlib.md5(SPACES.sub(' ', query.decode()).strip().encode()).digest()[:6]
                    doc1hash = hashlib.md5(SPACES.sub(' ', doc1.decode()).strip().encode()).digest()[:6]
                    doc2hash = hashlib.md5(SPACES.sub(' ', doc2.decode()).strip().encode()).digest()[:6]
                    qid, did1, did2 = query_map.get(queryhash), doc_map.get(doc1hash), doc_map.get(doc2hash)
                    # Skip triples whose text couldn't be matched back to IDs
                    if qid is None or did1 is None or did2 is None:
                        skipped += 1
                        pbar.set_postfix({'sk': skipped})
                        continue
                    fout.write(f'{qid}\t{did1}\t{did2}\n')
                _logger.info(f'{skipped} lines skipped because queries/documents could not be matched')
        return self._cache_path

    @contextlib.contextmanager
    def stream(self):
        # Stream the (built-on-demand) cache file as binary.
        with open(self.path(), 'rb') as f:
            yield f


# The allarticles.txt file (tsv) has a couple of problems, stemming from the fact that titles
# can include \t and \n characters. This class corrects these problems.
# It also removes the
# first (header) line and the final line ("(5196956 rows affected)"), and corrects a few strange
# things with some URLs.
class FixAllarticles:
    """Streams a corrected version of allarticles.txt (see comment above)."""
    def __init__(self, streamer):
        self._streamer = streamer

    def stream(self):
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)

    def __iter__(self):
        with self._streamer.stream() as stream, \
             _logger.pbar_raw(desc='fixing allarticles.txt', unit='B', unit_scale=True) as pbar:
            # NOTE: codecs.getreader is subtly broken here; it sometimes splits lines between special characters (and it's unclear why)
            next(stream) # remove header
            did, title, url = None, [], None
            for line in stream:
                pbar.update(len(line))
                line = line.decode().strip()
                if line == '' or line == '(5196956 rows affected)':
                    continue
                cols = line.split('\t')
                if did is None:
                    did = cols[0]
                    assert did.isnumeric(), line
                    cols = cols[1:]
                    if did in ('9283014', '11088688', '11114797'):
                        # a few special cases where the URL is actually missing. If we don't fix this here, we'll end up messing up subsequent records
                        yield ('\t'.join([did, ' '.join(cols), '']) + '\n').encode()
                        did, title, url = None, [], None
                if len(cols) > 0:
                    if cols[-1].startswith('_http'):
                        # some URLs have this strange prefix, remove
                        cols[-1] = cols[-1][1:]
                    if cols[-1].startswith('ttp://'):
                        cols[-1] = 'h' + cols[-1]
                    if cols[-1].startswith('http') or cols[-1].startswith('file:///C:'):
                        # reached the URL column: record is complete, emit and reset
                        title += cols[:-1]
                        url = cols[-1]
                        yield ('\t'.join([did, ' '.join(title).strip(), url]) + '\n').encode()
                        did, title, url = None, [], None
                    else:
                        # title continued onto this physical line (embedded newline in the title)
                        title += cols


def _init():
    """Builds and registers the tripclick dataset and all of its subsets."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(dlc['benchmark'], parser='tut', path_globs=['**/docs_grp_*.txt'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    topics_and_qrels = TarExtractAll(dlc['benchmark'], base_path/"topics_and_qrels", path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(dlc['dlfiles'], base_path/"val_runs", path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(dlc['dlfiles_runs_test'], base_path/"test_runs", path_globs=['**/run.trip.BM25.*.test.txt'])
    base = Dataset(
        collection,
        documentation('_'))

    subsets['logs'] = Dataset(
        TsvDocs(Cache(FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')), base_path/'allarticles-fixed.tsv'), doc_cls=TripClickPartialDoc, lang='en', count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(TarExtractAll(dlc['logs'], base_path/'logs', path_globs=['**/*.json'])),
        documentation('logs'))

    ### Train

    subsets['train/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.train.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.train.txt'), QREL_DEFS),
        documentation('train/head'))

    subsets['train/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.train.txt'), QREL_DCTR_DEFS),
        subsets['train/head'],
        documentation('train/head/dctr'))

    subsets['train/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.train.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.train.txt'), QREL_DEFS),
        documentation('train/torso'))

    subsets['train/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.train.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.train.txt'), QREL_DEFS),
        documentation('train/tail'))

    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'), collection, train_queries, base_path/'train.docpairs')
    subsets['train'] = Dataset(
        collection,
        train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]),
        TsvDocPairs(train_docpairs),
        documentation('train'))

    subsets['train/hofstaetter-triples'] = Dataset(
        collection,
        train_queries,
        subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val

    subsets['val/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.val.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.head.val.txt')),
        documentation('val/head'))

    subsets['val/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.val.txt'), QREL_DCTR_DEFS),
        subsets['val/head'],
        documentation('val/head/dctr'))

    subsets['val/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.val.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.torso.val.txt')),
        documentation('val/torso'))

    subsets['val/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.val.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.tail.val.txt')),
        documentation('val/tail'))

    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]),
        documentation('val'))

    ### Test

    subsets['test/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.test.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.head.test.txt')),
        documentation('test/head'))  # BUG FIX: previously documentation('val/head'), a copy-paste error

    subsets['test/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.test.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.torso.test.txt')),
        documentation('test/torso'))

    subsets['test/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.test.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.tail.test.txt')),
        documentation('test/tail'))

    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]),
        documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
================================================ FILE: ir_datasets/datasets/tweets2013_ia.py ================================================ import os import itertools import contextlib import shutil import tarfile from collections import Counter from pathlib import Path import bz2 import json from datetime import datetime from typing import NamedTuple import ir_datasets from ir_datasets.util import DownloadConfig from ir_datasets.formats import TrecQrels, TrecQueries, BaseDocs from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.indices import Docstore, CacheDocstore, DEFAULT_DOCSTORE_OPTIONS NAME = 'tweets2013-ia' _logger = ir_datasets.log.easy() QTYPE_MAP_13 = { '<num> *Number: *MB': 'query_id', # Remove MB prefix from QIDs '<query> *': 'query', '<querytime> *': 'time', '<querytweettime> *': 'tweet_time', } QTYPE_MAP_14 = { '<num> *Number: *MB': 'query_id', # Remove MB prefix from QIDs '<query> *': 'query', '<querytime> *': 'time', '<querytweettime> *': 'tweet_time', '<querydescription> *': 'description', } RM_TAGS = [' </num>', ' </query>', ' </querytime>', ' </querytweettime>'] QREL_DEFS = { 2: 'highly relevant', 1: 'relevant', 0: 'not relevant', } class TweetDoc(NamedTuple): doc_id: str text: str user_id: str created_at: str lang: str reply_doc_id: str retweet_doc_id: str source: bytes source_content_type: str def default_text(self): """ text """ return self.text class TrecMb13Query(NamedTuple): query_id: str query: str time: str tweet_time: str def default_text(self): """ query """ return self.query class TrecMb14Query(NamedTuple): query_id: str query: str time: str tweet_time: str description: str def default_text(self): """ query """ return self.query """ About the tweets2013-ia collection: This collection uses tweets distributed by the Internet Archive. These archives are not /exactly/ the same that were used for the TREC Microblog 2013--14 shared tasks, but Sequiera and Lin [1] show that it's close enough. 
The distribution format splits up the tweets by the date/time posted, like: MM/DD/hh/mm.bz2.
Since the "snowflake" format of these tweet IDs includes a timestamp, you'd think that this would
make performing lookups from this structure easy, but it turns out that there's plenty of cases
where tweets are in the wrong files. I suspect it may have to do with when the tweets were
streamed, not when they were actually created.

To make lookups efficient, we re-create the structure with a few changes. First, we allocate
tweets to files based on the timestamp in their tweet IDs, allowing the ID itself to point to
the source file. In the process, we change the compression technique from using bz2 compression
to lz4, allowing much faster decompression. lz4 also allows us to append to compressed files,
so we can add to files as tweets are encountered from other source files.

At the same time, we add some optimizations to allow for faster lookups in these files. When
tweets belonging to the same file are encountered sequentially, they are batched up before
writing. These batches are sorted and then split into groups of at most 100 tweets. Each group
is preceded by a short JSON record containing the start and end tweet IDs in the group. Since
these records are much shorter than regular tweet records, they can be identified without
parsing the JSON. If the target tweet does not appear in this range, we can skip JSON parsing
of all records until the next short one. My benchmarks showed this speeds up lookups by at
most ~5x.

The uncompressed files end up looking like this:

{"start": 102, "end": 145}
{"id": 102, "tweet": "[text]", ...}
{"id": 124, "tweet": "[text]", ...}
{"id": 125, "tweet": "[text]", ...}
{"id": 145, "tweet": "[text]", ...}
{"start": 163, "end": 341}
{"id": 163, "tweet": "[text]", ...}
...

Because of tweets encountered in other files, it's not uncommon to see ranges with only 1 or 2
tweets at the start or end of these files.
Note that the entire file is not sorted; this would take a second pass which isn't really needed. Most files have at most a few thousand tweets. We considered a different granularity for these files (by hour? by 5min?) but the by-minute structure seemed reasonable enough. Downsides with this approach are: a) We're not loading directly from source anymore b) The source looks similar to this structure, which may lead to confusion. Alternative approachs would have been to: 1) Build an pklz4 docstore for this collection. Downsides with that are: a) It would have a huge docid lookup table (several GB). Would need to keep this all in memory when indexing. b) No compression between records, so larger file size 2) Store pklz4 in this structure, rather than the source JSON. This would be faster, but downsides there are: a) If we ever want to change the fields in TweetDoc, we'd need to rebuild the whole structure b) Not a human-readable format (i.e., you couldn't use lz4cat to see the contents of these files) [1] https://cs.uwaterloo.ca/~jimmylin/publications/Sequiera_Lin_SIGIR2017.pdf """ class TweetWriter: def __init__(self, base_path, max_tweets_per_block=100): self.base_path = Path(base_path) self.current_file = None self.buffered_tweets = [] self.max_tweets_per_block = max_tweets_per_block def add(self, file_name, tweet_id, tweet_data): if file_name != self.current_file: self.flush() self.current_file = file_name self.buffered_tweets.append((tweet_id, tweet_data)) def flush(self): lz4 = ir_datasets.lazy_libs.lz4_frame() if self.current_file is not None and self.buffered_tweets: (self.base_path / Path(self.current_file)).parent.mkdir(parents=True, exist_ok=True) with lz4.frame.LZ4FrameFile(self.base_path / self.current_file, mode='a', block_linked=True, compression_level=lz4.frame.COMPRESSIONLEVEL_MAX, auto_flush=True) as fout: sorted_tweets = sorted(self.buffered_tweets) while sorted_tweets: block = sorted_tweets[:self.max_tweets_per_block] sorted_tweets = 
sorted_tweets[self.max_tweets_per_block:]
                    # Each block is preceded by a short header record giving its ID range,
                    # enabling the fast-skip optimization in TweetsDocstore.get_many_iter.
                    header = json.dumps({'start': block[0][0], 'end': block[-1][0]}).encode() + b'\n'
                    fout.write(b''.join([header] + [t[1] for t in block]))
        self.current_file = None
        self.buffered_tweets.clear()


class Tweets2013IaDocIter:
    """Iterator over the corpus in file order, supporting fancy slicing.

    Tracks the absolute document index across the per-minute source files
    (using _docs_file_counts) so that it can skip ahead file-by-file without
    decompressing files that fall entirely before the slice start.
    """
    def __init__(self, tweets_docs, slice):
        self.tweets_docs = tweets_docs
        self.slice = slice
        self.next_index = 0
        self.file_iter = tweets_docs._docs_iter_source_files()
        self.current_file = None
        self.current_file_start_idx = 0
        self.current_file_end_idx = 0

    def __next__(self):
        if self.slice.start >= self.slice.stop:
            raise StopIteration
        # Advance until next_index lines up with the slice start and the current file contains it.
        while self.next_index != self.slice.start or self.current_file is None or self.current_file_end_idx <= self.slice.start:
            if self.current_file is None or self.current_file_end_idx <= self.slice.start:
                # First iteration or no docs remaining in this file
                if self.current_file is not None:
                    self.current_file.close()
                    self.current_file = None
                # jump ahead to the file that contains the desired index
                first = True
                while first or self.current_file_end_idx < self.slice.start:
                    source_file = next(self.file_iter)
                    self.next_index = self.current_file_end_idx
                    self.current_file_start_idx = self.current_file_end_idx
                    self.current_file_end_idx = self.current_file_start_idx + self.tweets_docs._docs_file_counts()[source_file]
                    first = False
                self.current_file = self.tweets_docs._docs_ctxt_iter_tweets(source_file)
            else:
                for _ in zip(range(self.slice.start - self.next_index), self.current_file):
                    # The zip here will stop after either as many docs we must advance, or however
                    # many docs remain in the file. In the latter case, we'll just drop out into the
                    # next iteration of the while loop and pick up the next file.
                    self.next_index += 1
        result = next(self.current_file)
        self.next_index += 1
        # advance the slice window by one step for the next __next__ call
        self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
        return result

    def close(self):
        self.file_iter = None
        self.current_file = None

    def __iter__(self):
        return self

    def __del__(self):
        self.close()

    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step] -- compose the requested slice with the current one
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return Tweets2013IaDocIter(self.tweets_docs, new_slice)
        elif isinstance(key, int):
            # it[index] -- a one-element slice; StopIteration here means the index was out of range
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = Tweets2013IaDocIter(self.tweets_docs, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError((self.slice, slice(key, key+1), new_slice))
        raise TypeError('key must be int or slice')


class TweetsDocstore(Docstore):
    """Random-access lookups over the rebuilt per-minute lz4 tweet files."""
    def __init__(self, tweets_docs, options=DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(tweets_docs.docs_cls(), 'doc_id', options=options)
        self.tweets_docs = tweets_docs

    def get_many_iter(self, doc_ids):
        lz4 = ir_datasets.lazy_libs.lz4_frame()
        files_to_search = {}
        # find the file that each tweet should be found in
        for doc_id in doc_ids:
            try:
                doc_id = int(doc_id)
            except ValueError:
                continue # tweet IDs must be ints, so skip this one
            source_file = self.tweets_docs._id2file(doc_id)
            if source_file not in files_to_search:
                files_to_search[source_file] = set()
            files_to_search[source_file].add(doc_id)
        # loop through each required source file to find the tweets
        for source_file, doc_ids in files_to_search.items():
            if not (Path(self.tweets_docs.docs_path()) / source_file).exists():
                continue # source file missing
            with lz4.frame.LZ4FrameFile(Path(self.tweets_docs.docs_path()) / source_file) as fin:
                block_docids = set()
                line_iter = iter(fin)
                while fin:
                    line = next(fin, StopIteration)
                    if line is StopIteration:
                        break # bummer, can't find a doc_id...
                    if len(line) < 64: # checkpoint lines can be at most ~60 characters.
# Tweet lines will always be longer than this.
                        # It's a checkpoint line! Are we looking for anything in this range?
                        rng = json.loads(line)
                        start, end = rng['start'], rng['end']
                        block_docids = set(d for d in doc_ids if start <= d <= end)
                    elif block_docids:
                        # Is this record a tweet we're looking for?
                        data = json.loads(line)
                        if data['id'] in block_docids:
                            yield self.tweets_docs._docs_source_to_doc(line, data)
                            block_docids.discard(data['id'])
                            doc_ids.discard(data['id'])
                            if not doc_ids:
                                break # all done with this file
                    else:
                        # None of the docs we're looking for are in this block, so we don't need to bother parsing the json.
                        # Depending on where the tweet ends up being in the file, this optimization can speed up lookups by
                        # up to ~5x.
                        pass


class Tweets2013IaDocs(BaseDocs):
    """The tweets2013-ia corpus: rebuilds the Internet Archive source into per-minute lz4 files (see module docstring)."""
    def __init__(self, docs_dlcs, base_path):
        self._docs_dlcs = docs_dlcs
        self._docs_base_path = base_path
        self._docs_file_counts_cache = None

    def _id2file(self, snowflake_id):
        # Converts a tweet ID to a timestamp-based file path
        ts = ((snowflake_id >> 22) + 1288834974657) # "magic" numbers from https://github.com/client9/snowflake2time/blob/master/python/snowflake.py#L24
        dt = datetime.fromtimestamp(ts // 1000)
        return f'{dt.month:02d}/{dt.day:02d}/{dt.hour:02d}/{dt.minute:02d}.jsonl.lz4'

    def _docs_build(self):
        # One-time conversion of the source bz2 tars into the per-minute lz4 structure.
        success_file = Path(self._docs_base_path) / '_success'
        inprogress_file = Path(self._docs_base_path) / '_inprogress'
        if success_file.exists():
            return # already built
        # Make sure there's not already another process building this structure. Having multiple processes work on
        # this concurrently would cause problems because we gotta append to files.
        inprogress_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            inprogress_file.touch(exist_ok=False)
        except FileExistsError:
            raise RuntimeError('Another process is currently building tweets2013-ia corpus; please wait for this process to finish. '
                f'(If a prior process failed, you may need to manually clear {self._docs_base_path})')
        file_counts = Counter() # keeps track of the number of tweets in each file (for fancy slicing)
        try:
            # TODO: There's the potential for a race condition here...
            shutil.rmtree(inprogress_file.parent) # clear out directory because this process needs to *append* to files
            inprogress_file.parent.mkdir(parents=True, exist_ok=True)
            inprogress_file.touch(exist_ok=False)
            writer = TweetWriter(self._docs_base_path)
            with _logger.pbar_raw(desc='tweets', unit='tweet') as pbar, contextlib.ExitStack() as stack:
                # Since the source files download slowly anyway, download them in parallel (this doesn't seem to reduce the download speed of either file)
                dlc_streams = [stack.enter_context(dlc.stream()) for dlc in self._docs_dlcs]
                tar_files = [stack.enter_context(tarfile.open(fileobj=stream, mode='r|')) for stream in dlc_streams]
                # Loop through the tar sources file-by-file (alternating between them)
                for records in itertools.zip_longest(*tar_files):
                    for record, tarf in zip(records, tar_files):
                        if record is None or not record.name.endswith('.json.bz2'):
                            continue # not a data file
                        # Loop through the tweets in each file
                        with bz2.open(tarf.extractfile(record)) as f:
                            for line in f:
                                data = json.loads(line)
                                if 'id' not in data:
                                    continue # e.g., "delete" records
                                out_file = self._id2file(data['id'])
                                writer.add(out_file, data['id'], line)
                                file_counts[out_file] += 1
                                pbar.update(1)
                writer.flush() # any remaining tweets
            # Write out a file that gives the counts for each source file. This is used for fancy slicing
            # and also avoids globbing to get a list of all source files.
            with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt') as f:
                for file, count in sorted(file_counts.items()):
                    f.write(f'{file}\t{count}\n')
            # Mark as done!
            success_file.touch()
        finally:
            # No longer working on it
            inprogress_file.unlink()

    def docs_iter(self):
        return Tweets2013IaDocIter(self, slice(0, self.docs_count(force=True)))

    def docs_cls(self):
        return TweetDoc

    def docs_store(self, options=DEFAULT_DOCSTORE_OPTIONS):
        return CacheDocstore(TweetsDocstore(self), f'{self.docs_path(force=False)}.cache', options=options)

    def docs_path(self, force=False):
        return self._docs_base_path

    def docs_count(self, force=False):
        # NOTE: implicitly returns None when the corpus hasn't been built and force is False
        success_file = Path(self._docs_base_path) / '_success'
        if force or success_file.exists():
            return sum(self._docs_file_counts().values())

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return None # multiple languages

    def _docs_file_counts(self):
        # Lazily loads (building the corpus first if needed) the per-file tweet counts.
        if self._docs_file_counts_cache is None:
            self._docs_build()
            result = {}
            with (Path(self.docs_path()) / 'file_counts.tsv').open('rt') as f:
                for line in f:
                    file, count = line.strip().split('\t')
                    result[file] = int(count)
            self._docs_file_counts_cache = result
        return self._docs_file_counts_cache

    def _docs_iter_source_files(self):
        yield from self._docs_file_counts().keys()

    def _docs_ctxt_iter_tweets(self, source_file):
        # Iterates the tweet records of one lz4 file, skipping checkpoint headers ('id' not present).
        lz4 = ir_datasets.lazy_libs.lz4_frame()
        with lz4.frame.LZ4FrameFile(Path(self._docs_base_path) / source_file) as fin:
            for line in fin:
                data = json.loads(line)
                if 'id' in data:
                    yield self._docs_source_to_doc(line, data)

    def _docs_source_to_doc(self, source, data):
        retweet_id = data['retweeted_status']['id_str'] if 'retweeted_status' in data else None
        return TweetDoc(data['id_str'], data['text'], data['user']['id_str'], data['created_at'], data.get('lang'), data['in_reply_to_status_id_str'], retweet_id, source, 'application/json')


def _init():
    """Builds and registers the tweets2013-ia dataset and its subsets."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = Tweets2013IaDocs([dlc['docs/feb'], dlc['docs/mar']], os.path.join(base_path, 'corpus'))

    base = Dataset(collection, documentation('_'))
subsets['trec-mb-2013'] = Dataset( collection, TrecQueries(dlc['trec-mb-2013/queries'], qtype=TrecMb13Query, qtype_map=QTYPE_MAP_13, remove_tags=RM_TAGS, namespace=NAME, lang='en'), TrecQrels(dlc['trec-mb-2013/qrels'], QREL_DEFS), documentation('trec-mb-2013') ) subsets['trec-mb-2014'] = Dataset( collection, TrecQueries(dlc['trec-mb-2014/queries'], qtype=TrecMb14Query, qtype_map=QTYPE_MAP_14, remove_tags=RM_TAGS, namespace=NAME, lang='en'), TrecQrels(dlc['trec-mb-2014/qrels'], QREL_DEFS), documentation('trec-mb-2014') ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/vaswani.py ================================================ import io import itertools import ir_datasets from ir_datasets.util import DownloadConfig, TarExtract, Cache from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS from ir_datasets.datasets.base import Dataset, YamlDocumentation NAME = 'vaswani' QREL_DEFS = { 1: 'Relevant', } def sentinel_splitter(it, sentinel): for is_sentinel, group in itertools.groupby(it, lambda l: l == sentinel): if not is_sentinel: yield list(group) class VaswaniDocs(BaseDocs): def __init__(self, docs_dlc): super().__init__() self.docs_dlc = docs_dlc def docs_path(self, force=True): return self.docs_dlc.path(force) @ir_datasets.util.use_docstore def docs_iter(self): with self.docs_dlc.stream() as stream: stream = io.TextIOWrapper(stream) for lines in sentinel_splitter(stream, sentinel=' /\n'): doc_id = lines[0].rstrip('\n') doc_text = ''.join(lines[1:]) yield GenericDoc(doc_id, doc_text) def docs_cls(self): return GenericDoc def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( 
path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4', init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], count_hint=ir_datasets.util.count_hint(NAME), options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(): return NAME def docs_lang(self): return 'en' class VaswaniQueries(BaseQueries): def __init__(self, queries_dlc): super().__init__() self.queries_dlc = queries_dlc def queries_path(self): return self.queries_dlc.path() def queries_iter(self): with self.queries_dlc.stream() as stream: stream = io.TextIOWrapper(stream) for lines in sentinel_splitter(stream, sentinel='/\n'): query_id = lines[0].rstrip('\n') query_text = ''.join(lines[1:]) yield GenericQuery(query_id, query_text) def queries_cls(self): return GenericQuery def queries_namespace(self): return NAME def queries_lang(self): return 'en' class VaswaniQrels(BaseQrels): def __init__(self, qrels_dlc): self.qrels_dlc = qrels_dlc def qrels_path(self): return self.qlres_dlc.path() def qrels_iter(self): with self.qrels_dlc.stream() as stream: stream = io.TextIOWrapper(stream) for lines in sentinel_splitter(stream, sentinel=' /\n'): query_id = lines[0].rstrip('\n') for line in lines[1:]: for doc_id in line.split(): yield TrecQrel(query_id, doc_id, 1, '0') def qrels_cls(self): return TrecQrel def qrels_defs(self): return QREL_DEFS def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) subsets = {} main_dlc = dlc['main'] base = Dataset( VaswaniDocs(Cache(TarExtract(main_dlc, 'doc-text'), base_path/'docs.txt')), VaswaniQueries(Cache(TarExtract(main_dlc, 'query-text'), base_path/'queries.txt')), VaswaniQrels(Cache(TarExtract(main_dlc, 'rlv-ass'), base_path/'qrels.txt')), documentation('_'), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): 
ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/wapo.py ================================================ import io import json import tarfile from typing import NamedTuple, Tuple, Optional import ir_datasets from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS from ir_datasets.util import Lazy, DownloadConfig, Migrator from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels, GenericQuery, GenericQrel, TrecQueries, TrecQrels NAME = 'wapo' CORE_QREL_DEFS = { 0: "not relevant", 1: "relevant", 2: "highly relevant", } BL_QREL_DEFS = { 0: 'The document provides little or no useful background information.', 2: 'The document provides some useful background or contextual information that would help the user understand the broader story context of the target article.', 4: 'The document provides significantly useful background ...', 8: 'The document provides essential useful background ...', 16: 'The document _must_ appear in the sidebar otherwise critical context is missing.', } RM_TAGS = [' </num>', 'Narrative\n', '</docid>', '</url>'] BL_MAP = { ' *<num> Number: ': 'query_id', ' *<docid>': 'doc_id', ' *<url>': 'url', } class WapoDocMedia(NamedTuple): type: str url: str text: str class WapoDoc(NamedTuple): doc_id: str url: str title: str author: str published_date: Optional[int] kicker: str body: str body_paras_html: Tuple[str, ...] body_media: Tuple[WapoDocMedia, ...] 
def default_text(self): """ title and body """ return f'{self.title} {self.body}' class TrecBackgroundLinkingQuery(NamedTuple): query_id: str doc_id: str url: str class WapoDocs(BaseDocs): def __init__(self, dlc, file_name): self._dlc = dlc self._file_name = file_name def docs_path(self, force=True): return self._dlc.path(force) def docs_cls(self): return WapoDoc def docs_iter(self): return iter(self.docs_store()) def _docs_iter(self): BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup for doc_json in self.docs_wapo_raw_iter(): body = '' kicker = '' body_paras_html = [] body_media = [] for content in doc_json['contents']: if content is None: continue if content.get('type') == 'kicker': assert content['mime'] == 'text/plain' if content['content'] is not None: kicker += content['content'] + '\n' elif content.get('type') == 'sanitized_html': if content.get('content') is not None: body_paras_html.append(content['content']) if content.get('mime') == 'text/html': body += BeautifulSoup(content['content'], 'lxml-xml').get_text() + '\n' else: body += content['content'] + '\n' elif content.get('type') in ['image', 'tweet', 'video', 'gallery']: url = { 'image': lambda: content['imageURL'], 'video': lambda: content['contenturl'], 'gallery': lambda: content['contenturl'], 'tweet': lambda: f"https://twitter.com/{content['content']['user']['screen_name']}/status/{content['content']['id_str']}", }[content['type']]() text = { 'image': lambda: content.get('fullcaption'), 'video': lambda: content.get('blurb'), 'gallery': lambda: content.get('blurb'), 'tweet': lambda: content['content']['text'], }[content['type']]() body_media.append(WapoDocMedia(content['type'], url, text)) if text is not None: body += text + '\n' yield WapoDoc( doc_json['id'], doc_json['article_url'], doc_json['title'], doc_json['author'], doc_json.get('published_date'), kicker.rstrip('\n'), body.rstrip('\n'), tuple(body_paras_html), tuple(body_media)) def docs_wapo_raw_iter(self): with self._dlc.stream() as 
stream: with tarfile.open(fileobj=stream, mode='r|gz') as tarf: for member in tarf: if member.name != self._file_name: continue file = tarf.extractfile(member) for line in file: doc_json = json.loads(line) yield doc_json def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f'{self.docs_path()}.pklz4', init_iter_fn=self._docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], options=options ) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): return NAME def docs_lang(self): return 'en' def _init(): subsets = {} base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') collection_v2 = WapoDocs(dlc['v2'], 'WashingtonPost.v2/data/TREC_Washington_Post_collection.v2.jl') collection_v4 = WapoDocs(dlc['v4'], 'WashingtonPost.v4/data/TREC_Washington_Post_collection.v4.jl') base = Dataset(documentation('_')) subsets['v2'] = Dataset( collection_v2, documentation('v2')) subsets['v2/trec-core-2018'] = Dataset( collection_v2, TrecQueries(dlc['trec-core-2018/queries'], namespace='trec-core-2018', lang='en', remove_tags=RM_TAGS), TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS), documentation('v2/trec-core-2018')) subsets['v2/trec-news-2018'] = Dataset( collection_v2, TrecQueries(dlc['trec-news-2018/queries'], namespace='trec-news-2018', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS), TrecQrels(dlc['trec-news-2018/qrels'], BL_QREL_DEFS), documentation('v2/trec-news-2018')) subsets['v2/trec-news-2019'] = Dataset( collection_v2, TrecQueries(dlc['trec-news-2019/queries'], namespace='trec-news-2019', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS), TrecQrels(dlc['trec-news-2019/qrels'], BL_QREL_DEFS), documentation('v2/trec-news-2019')) subsets['v3/trec-news-2020'] = Dataset( 
TrecQueries(dlc['trec-news-2020/queries'], namespace='trec-news-2020', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS), TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS), documentation('v3/trec-news-2020')) subsets['v4'] = Dataset( collection_v4, documentation('v4')) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets base, subsets = _init() ================================================ FILE: ir_datasets/datasets/wikiclir.py ================================================ import contextlib from pathlib import Path from typing import NamedTuple import ir_datasets from ir_datasets.util import TarExtractAll, DownloadConfig, RelativePath, Lazy from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries from ir_datasets.formats import TsvDocs, TsvQueries, TrecQrels NAME = 'wikiclir' _logger = ir_datasets.log.easy() QRELS_DEFS = { 2: "Document assigned to the (English) cross-lingual mate", 1: "All other articles that link to the mate, and are linked by the mate", } class WikiClirQuery(NamedTuple): query_id: str title: str first_sent: str def default_text(self): """ title """ return f"{self.title}" class WikiClirDoc(NamedTuple): doc_id: str title: str text: str def default_text(self): """ title and text """ return f"{self.title} {self.text}" def _init(): base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = YamlDocumentation(f'docs/{NAME}.yaml') langs = [ ('arabic', 'ar', 'ar'), ('catalan', 'ca', 'ca'), ('chinese', 'zh', 'zh'), ('czech', 'cs', 'cs'), ('dutch', 'nl', 'nl'), ('finnish', 'fi', 'fi'), ('french', 'fr', 'fr'), ('german', 'de', 'de'), ('italian', 'it', 'it'), ('japanese', 'ja', 'ja'), ('korean', 'ko', 'ko'), ('norwegian_(bokmal)', 'no', 'no'), ('norwegian_(nynorsk)', 'nn', 'nn'), ('polish', 'pl', 'pl'), ('portuguese', 'pt', 'pt'), ('romanian', 'ro', 
'ro'), ('russian', 'ru', 'ru'), ('simple_english', 'en', 'en-simple'), ('spanish', 'es', 'es'), ('swahili', 'sw', 'sw'), ('swedish', 'sv', 'sv'), ('tagalog', 'tl', 'tl'), ('turkish', 'tr', 'tr'), ('ukrainian', 'uk', 'uk'), ('vietnamese', 'vi', 'vi'), ] dlc = TarExtractAll(dlc['source'], base_path/'source') queries = TsvQueries(RelativePath(dlc, 'wiki-clir/english/wiki_en.queries'), namespace=NAME, query_cls=WikiClirQuery, lang='en') base = Dataset(documentation('_')) subsets = {} for source_path, lang, dsid in langs: file_suffix = lang if dsid != 'en-simple' else 'simple' qrels = TrecQrels(RelativePath(dlc, f'wiki-clir/{source_path}/en2{file_suffix}.rel'), QRELS_DEFS, format_3col=True) qids = _qid_filter(qrels) subsets[dsid] = Dataset( TsvDocs(RelativePath(dlc, f'wiki-clir/{source_path}/wiki_{file_suffix}.documents'), doc_cls=WikiClirDoc, namespace=NAME, lang=lang), FilteredQueries(queries, qids, mode='include'), qrels, documentation(dsid), ) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets def _qid_filter(qrels): return Lazy(lambda: {q.query_id for q in qrels.qrels_iter()}) collection, subsets = _init() ================================================ FILE: ir_datasets/datasets/wikir.py ================================================ import contextlib from pathlib import Path from typing import NamedTuple import ir_datasets from ir_datasets.util import ZipExtractCache, DownloadConfig, RelativePath from ir_datasets.datasets.base import Dataset, YamlDocumentation from ir_datasets.formats import CsvQueries, CsvDocs, TrecQrels, TrecScoredDocs NAME = 'wikir' _logger = ir_datasets.log.easy() QRELS_DEFS = { 2: "Query is the article title", 1: "There is a link to the article with the query as its title in the first sentence", 0: "Otherwise", } def _init(): base_path = ir_datasets.util.home_path()/NAME dlc = DownloadConfig.context(NAME, base_path) documentation = 
YamlDocumentation(f'docs/{NAME}.yaml') subsets = {} sources = [ ('en1k', 'wikIR1k'), ('en59k', 'wikIR59k'), ('en78k', 'enwikIR'), ('ens78k', 'enwikIRS'), ('fr14k', 'FRwikIR14k'), ('es13k', 'ESwikIR13k'), ('it16k', 'ITwikIR16k'), ] for source, zip_dir_name in sources: source_dlc = ZipExtractCache(dlc[source], base_path/source) docs = CsvDocs(RelativePath(source_dlc, f"{zip_dir_name}/documents.csv"), namespace=source, lang=source[:2], count_hint=ir_datasets.util.count_hint(f'{NAME}/{source}'), docstore_path=ir_datasets.util.home_path()/NAME/f'{source}.pklz4') subsets[source] = Dataset(docs, documentation(source)) for split in ['training', 'validation', 'test']: subsets[f'{source}/{split}'] = Dataset( docs, CsvQueries(RelativePath(source_dlc, f"{zip_dir_name}/{split}/queries.csv"), lang=source[:2]), TrecQrels(RelativePath(source_dlc, f"{zip_dir_name}/{split}/qrels"), qrels_defs=QRELS_DEFS), TrecScoredDocs(RelativePath(source_dlc, f"{zip_dir_name}/{split}/BM25.res")), documentation(f'{source}/{split}') ) base = Dataset(documentation('_')) ir_datasets.registry.register(NAME, base) for s in sorted(subsets): ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) return base, subsets collection, subsets = _init() ================================================ FILE: ir_datasets/docs/antique.yaml ================================================ _: pretty_name: 'ANTIQUE' desc: ' <p> "ANTIQUE is a non-factoid quesiton answering dataset based on the questions and answers of Yahoo! Webscope L6." </p> <ul> <li>Documents: Short answer passages (from Yahoo Answers)</li> <li>Queries: Natural language questions (from Yahoo Answers)</li> <li><a href="https://arxiv.org/abs/1905.08957">Dataset Paper</a></li> </ul> ' bibtex_ids: ['Hashemi2020Antique'] train: desc: ' <p> Official train set of the ANTIQUE dataset. 
</p>' bibtex_ids: ['Hashemi2020Antique'] train/split200-train: desc: ' <p> <a class="ds-ref">antique/train</a> without the 200 queries used by <a class="ds-ref">antique/train/split200-valid</a>. </p>' bibtex_ids: ['Hashemi2020Antique'] train/split200-valid: desc: ' <p> A held-out subset of 200 queries from <a class="ds-ref">antique/train</a>. Use in conjunction with <a class="ds-ref">antique/train/split200-train</a>. </p>' bibtex_ids: ['Hashemi2020Antique'] test: desc: ' <p> Official test set of the ANTIQUE dataset. </p>' bibtex_ids: ['Hashemi2020Antique'] test/non-offensive: desc: ' <p> <a class="ds-ref">antique/test</a> without a set of queries deemed by the authors of ANTIQUE to be "offensive (and noisy)." </p>' bibtex_ids: ['Hashemi2020Antique'] ================================================ FILE: ir_datasets/docs/aol-ia.yaml ================================================ _: pretty_name: 'AOL-IA (Internet Archive)' desc: ' <p> This is a version of the AOL Query Log. Documents use versions that appeared around the time of the query log (early 2006) via the Internet Archive. </p> <p> The query log does not include document or query IDs. These are instead created by ir_datasets. Document IDs are assigned using a hash of the URL that appears in the query log. Query IDs are assigned using the a hash of the noramlised query. All unique normalized queries are available from <kbd>queries</kbd>, and all clicked documents are available from <kbd>qrels</kbd> (iteration value set to the user ID). Full information (including original query) are available from <kbd>qlogs</kbd>. </p> ' bibtex_ids: ['Pass2006Picture', 'MacAvaney2022Reproducing'] docs_instructions: &inst "docs available using aolia-tools package" data_access: ' <p> To use the documents of this dataset, you will need to run the download script in <a href="https://github.com/terrierteam/aolia-tools">aolia-tools</a>. 
To run the script, use the following commands: </p> <code> git clone https://github.com/terrierteam/aolia-tools<br/> cd aolia-tools<br/> pip install -r requirements.txt<br/> python downloader.py<br/> </code> <p> It takes around 2 days to download all documents. </p> ' ================================================ FILE: ir_datasets/docs/aquaint.yaml ================================================ _: pretty_name: 'AQUAINT' desc: ' <p> A document collection of about 1M English newswire text. Sources are the Xinhua News Service (People''s Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. </p> <ul> <li><a href="https://catalog.ldc.upenn.edu/LDC2002T31">Dataset details</a></li> </ul> ' docs_instructions: &inst "docs available from LDC" bibtex_ids: ['Graff2002Aquaint'] data_access: ' <p> To use this dataset, you need a copy of the source corpus, provided by the the Linguistic Data Consortium. The specific resource needed is <a href="https://catalog.ldc.upenn.edu/LDC2002T31">LDC2002T31</a>. </p> <p> Many organizations already have a subscription to the LDC, so access to the collection can be as easy as confirming the data usage agreement and downloading the corpus. Check with your library for access details. </p> <p> The source file is: <kbd>aquaint_comp_LDC2002T31.tgz</kbd>. </p> <p> ir_datasets expects this file to be copied/linked in <kbd>~/.ir_datasets/aquaint/</kbd>. </p> ' trec-robust-2005: desc: ' <p> The TREC Robust 2005 dataset. Contains a subset of 50 "hard" queries from <a class="ds-ref">trec-robust04</a>. 
</p> <ul> <li>Documents: News articles</li> <li>Queries: keyword queries, descriptions, narratives</li> <li>Relevance: Deep judgments</li> <li><a href="https://trec.nist.gov/data/robust/05/05.guidelines.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec14/papers/ROBUST.OVERVIEW.pdf">Task overview paper</a></li> <li>See also: <a class="ds-ref">trec-robust04</a></li> </ul> ' bibtex_ids: ['Voorhees2005Robust', 'Graff2002Aquaint'] docs_instructions: *inst ================================================ FILE: ir_datasets/docs/argsme.yaml ================================================ _: pretty_name: "args.me" desc: | <p> The args.me corpus is one of the largest argument resources available and contains arguments crawled from debate platforms and parliament discussions. </p> <ul> <li><a href="https://args.me/">args.me Search Engine</a></li> <li><a href="https://webis.de/downloads/publications/papers/wachsmuth_2017f.pdf">args.me Search Engine Paper</a></li> <li><a href="https://webis.de/downloads/publications/papers/ajjour_2019a.pdf">args.me Corpus Paper</a></li> <li><a href="https://git.webis.de/code-research/arguana/args/args-framework">GitLab Repository</a></li> </ul> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 1.0: pretty_name: "args.me version 1.0" desc: | <p> Corpus version 1.0 with 387 606 arguments crawled from Debatewise, IDebate.org, Debatepedia, Debate.org. It was released on July 9, 2019 on <a href="https://zenodo.org/record/3274636">Zenodo</a>. The cleaned version <a class="ds-ref">argsme/1.0-cleaned</a> should be preferred. </p> <p> This collection is licensed with the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. Individual rights to the content still apply. 
</p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 1.0-cleaned: pretty_name: "args.me version 1.0 cleaned" desc: | <p> Corpus version 1.0-cleaned with 382 545 arguments crawled from Debatewise, IDebate.org, Debatepedia, Debate.org. This version contains the same arguments as <a class="ds-ref">argsme/1.0</a>, but was cleaned as described in the corresponding publication. It was released on October 27, 2020 on <a href="https://zenodo.org/record/4139439">Zenodo</a>. </p> <p> This collection is licensed with the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. Individual rights to the content still apply. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01: pretty_name: "args.me" desc: | <p> Corpus version 2020-04-01 with 387 740 arguments crawled from Debatewise, IDebate.org, Debatepedia, Debate.org, and from Canadian Parliament discussions. It was released on April 1, 2020 on <a href="https://zenodo.org/record/3734893">Zenodo</a>. </p> <p> This collection is licensed with the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. Individual rights to the content still apply. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01/debateorg: desc: | <p> Subset of the 338 620 arguments from <a class="ds-ref">argsme/2020-04-01</a> that were crawled from the debate portal Debate.org. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01/debatepedia: desc: | <p> Subset of the 21 197 arguments from <a class="ds-ref">argsme/2020-04-01</a> that were crawled from the debate portal Debatepedia. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01/debatewise: desc: | <p> Subset of the 14 353 arguments from <a class="ds-ref">argsme/2020-04-01</a> that were crawled from the debate portal Debatewise. 
</p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01/idebate: desc: | <p> Subset of the 13 522 arguments from <a class="ds-ref">argsme/2020-04-01</a> that were crawled from the debate portal IDebate.org. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01/parliamentary: desc: | <p> Subset of the 48 arguments from <a class="ds-ref">argsme/2020-04-01</a> that were crawled from Canadian Parliament discussions. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition 2020-04-01/processed: desc: | <p> Pre-processed version of <a class="ds-ref">argsme/2020-04-01</a> where each argument is split into sentences. </p> bibtex_ids: - Wachsmuth2017Argument - Ajjour2019Acquisition ================================================ FILE: ir_datasets/docs/beir.yaml ================================================ _: pretty_name: 'Beir (benchmark suite)' desc: ' <p> Beir is a suite of benchmarks to test zero-shot transfer. </p> <ul> <li><a href="https://arxiv.org/abs/2104.08663">Paper</a></li> <li><a href="https://github.com/UKPLab/beir/blob/main/README.md">GitHub</a></li> </ul> ' bibtex_ids: ['Thakur2021Beir'] arguana: desc: ' <p> A version of the ArguAna Counterargs dataset, for argument retrieval. </p> <ul> <li><a href="https://www.aclweb.org/anthology/P18-1023.pdf">Dataset paper</a></li> <li><a href="http://argumentation.bplaced.net/arguana/data">Dataset website</a></li> </ul> ' bibtex_ids: ['Wachsmuth2018Arguana', 'Thakur2021Beir'] climate-fever: desc: ' <p> A version of the CLIMATE-FEVER dataset, for fact verification on claims about climate. </p> <ul> <li><a href="https://arxiv.org/pdf/2012.00614.pdf">Dataset paper</a></li> <li><a href="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html">Dataset website</a></li> </ul> ' bibtex_ids: ['Diggelmann2020CLIMATEFEVERAD', 'Thakur2021Beir'] dbpedia-entity: desc: ' <p> A version of the DBPedia-Entity-v2 dataset for entity retrieval. 
</p> <ul> <li><a href="http://hasibi.com/files/sigir2017-dbpedia_entity.pdf">Dataset paper</a></li> <li><a href="https://github.com/iai-group/DBpedia-Entity">Dataset website</a></li> </ul> ' bibtex_ids: ['Hasibi2017DBpediaEntityVA', 'Thakur2021Beir'] dbpedia-entity/dev: desc: ' <p> A random sample of 67 queries from the official test set, used as a dev set. </p> ' bibtex_ids: ['Hasibi2017DBpediaEntityVA', 'Thakur2021Beir'] dbpedia-entity/test: desc: ' <p> A the official test set, without 67 queries used as a dev set. </p> ' bibtex_ids: ['Hasibi2017DBpediaEntityVA', 'Thakur2021Beir'] fever: desc: ' <p> A version of the FEVER dataset for fact verification. Includes queries from the /train /dev and /test subsets. </p> <ul> <li><a href="https://www.aclweb.org/anthology/N18-1074.pdf">Dataset paper</a></li> <li><a href="https://fever.ai/resources.html">Dataset website</a></li> </ul> ' bibtex_ids: ['Thorne2018Fever', 'Thakur2021Beir'] fever/dev: desc: ' <p> The official dev set. </p> ' bibtex_ids: ['Thorne2018Fever', 'Thakur2021Beir'] fever/test: desc: ' <p> The official test set. </p> ' bibtex_ids: ['Thorne2018Fever', 'Thakur2021Beir'] fever/train: desc: ' <p> The official train set. </p> ' bibtex_ids: ['Thorne2018Fever', 'Thakur2021Beir'] fiqa: desc: ' <p> A version of the FIQA-2018 dataset (financial opinion question answering). Queries include those in the /train /dev and /test subsets. </p> <ul> <li><a href="https://dl.acm.org/doi/10.1145/3184558.3192301">Dataset paper</a></li> <li><a href="https://sites.google.com/view/fiqa/home">Dataset site</a></li> </ul> ' bibtex_ids: ['Maia2018Fiqa', 'Thakur2021Beir'] fiqa/dev: desc: ' <p> Random sample of 500 queries from the official dataset. </p> ' bibtex_ids: ['Maia2018Fiqa', 'Thakur2021Beir'] fiqa/test: desc: ' <p> Random sample of 648 queries from the official dataset. </p> ' bibtex_ids: ['Maia2018Fiqa', 'Thakur2021Beir'] fiqa/train: desc: ' <p> Official dataset without the 1148 queries sampled for /dev and /test. 
</p> ' bibtex_ids: ['Maia2018Fiqa', 'Thakur2021Beir'] hotpotqa: desc: ' <p> A version of the Hotpot QA dataset for multi-hop question answering. Queries include all those in /train /dev and /test. </p> <ul> <li><a href="https://www.aclweb.org/anthology/D18-1259">Dataset paper</a></li> <li><a href="https://github.com/hotpotqa/hotpot">Dataset website</a></li> </ul> ' bibtex_ids: ['Yang2018Hotpotqa', 'Thakur2021Beir'] hotpotqa/dev: desc: ' <p> Random selection of the 5447 queries from /train. </p> ' bibtex_ids: ['Yang2018Hotpotqa', 'Thakur2021Beir'] hotpotqa/test: desc: ' <p> Official <em>dev</em> set from HotpotQA, here used as a test set. </p> ' bibtex_ids: ['Yang2018Hotpotqa', 'Thakur2021Beir'] hotpotqa/train: desc: ' <p> Official train set, without the random selection of the 5447 queries used for /dev. </p> ' bibtex_ids: ['Yang2018Hotpotqa', 'Thakur2021Beir'] msmarco: desc: ' <p> A version of the MS MARCO passage ranking dataset. Includes queries from the /train, /dev, and /test sub-datasets. </p> <p> Note that this version differs from <a class="ds-ref">msmarco-passage</a>, in that it does not correct the encoding problems in the source documents. </p> <ul> <li><a href="https://microsoft.github.io/msmarco/#ranking">Leaderboard</a></li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> <li>See also: <a class="ds-ref">msmarco-passage</a></li> </ul> ' bibtex_ids: ['Bajaj2016Msmarco', 'Thakur2021Beir'] msmarco/dev: desc: ' <p> A version of the MS MARCO passage ranking dev set. </p> <ul> <li>See also: <a class="ds-ref">msmarco-passage/dev</a></li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> </ul> ' bibtex_ids: ['Bajaj2016Msmarco', 'Thakur2021Beir'] msmarco/test: desc: ' <p> A version of the TREC Deep Learning 2019 set. 
</p> <ul> <li>See also: <a class="ds-ref">msmarco-passage/trec-dl-2019</a></li> <li><a href="https://arxiv.org/pdf/2003.07820.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco', 'Thakur2021Beir'] msmarco/train: desc: ' <p> A version of the MS MARCO passage ranking train set. </p> <ul> <li>See also: <a class="ds-ref">msmarco-passage/train</a></li> </ul> ' bibtex_ids: ['Bajaj2016Msmarco', 'Thakur2021Beir'] nfcorpus: desc: ' <p> A version of the NF Corpus (Nutrition Facts). Queries use the "title" variant of the query, which here are often natural language questions. Queries include all those from /train /dev and /test. </p> <p> Data pre-processing may be different than what is done in <a class="ds-ref">nfcorpus</a>. </p> <ul> <li><a href="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/">Dataset website</p></li> <li><a href="https://link.springer.com/chapter/10.1007/978-3-319-30671-1_58">Dataset paper</p></li> <li>See also: <a class="ds-ref">nfcorpus</a></li> </ul> ' bibtex_ids: ['Boteva2016Nfcorpus', 'Thakur2021Beir'] nfcorpus/dev: desc: ' <p> Combined dev set of NFCorpus. </p> <ul> <li>See also: <a class="ds-ref">nfcorpus/dev</a></li> </ul> ' bibtex_ids: ['Boteva2016Nfcorpus', 'Thakur2021Beir'] nfcorpus/test: desc: ' <p> Combined test set of NFCorpus. </p> <ul> <li>See also: <a class="ds-ref">nfcorpus/test</a></li> </ul> ' bibtex_ids: ['Boteva2016Nfcorpus', 'Thakur2021Beir'] nfcorpus/train: desc: ' <p> Combined train set of NFCorpus. </p> <ul> <li>See also: <a class="ds-ref">nfcorpus/train</a></li> </ul> ' bibtex_ids: ['Boteva2016Nfcorpus', 'Thakur2021Beir'] nq: desc: ' <p> A version of the Natural Questions dev dataset. </p> <p> Data pre-processing differs both from what is done in <a class="ds-ref">natural-questions</a> and <a class="ds-ref">dpr-w100/natural-questions</a>, especially with respect to the document collection and filtering conducted on the queries. See the Beir paper for details. 
</p> <ul> <li><a href="https://ai.google.com/research/NaturalQuestions">Dataset website</a></li> <li><a href="https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf">Dataset paper</a></li> <li>See also: <a class="ds-ref">natural-questions</a>, <a class="ds-ref">dpr-w100/natural-questions</a></li> </ul> ' bibtex_ids: ['Kwiatkowski2019Nq', 'Thakur2021Beir'] quora: desc: ' <p> A version of the Quora duplicate question detection dataset (QQP). Includes queries from /dev and /test sets. </p> <ul> <li><a href="https://www.kaggle.com/c/quora-question-pairs">Dataset website</a></li> </ul> ' bibtex_ids: ['Thakur2021Beir'] quora/dev: desc: ' <p> A 5,000 question subset of the original dataset, without overlaps in the other subsets. </p> ' bibtex_ids: ['Thakur2021Beir'] quora/test: desc: ' <p> A 10,000 question subset of the original dataset, without overlaps in the other subsets. </p> ' bibtex_ids: ['Thakur2021Beir'] scidocs: desc: ' <p> A version of the SciDocs dataset, used for citation retrieval. </p> <ul> <li><a href="https://www.aclweb.org/anthology/2020.acl-main.207.pdf">Dataset paper</a></li> <li><a href="https://allenai.org/data/scidocs">Dataset website</a></li> </ul> ' bibtex_ids: ['Cohan2020Scidocs', 'Thakur2021Beir'] scifact: desc: ' <p> A version of the SciFact dataset, for fact verification. Queries include those form the /train and /test sets. </p> <ul> <li><a href="https://www.aclweb.org/anthology/2020.emnlp-main.609.pdf">Dataset paper</a></li> <li><a href="https://www.aclweb.org/anthology/2020.emnlp-main.609.pdf">Dataset website</a></li> </ul> ' bibtex_ids: ['Wadden2020Scifact', 'Thakur2021Beir'] scifact/test: desc: ' <p> The official <em>dev</em> set. </p> ' bibtex_ids: ['Wadden2020Scifact', 'Thakur2021Beir'] scifact/train: desc: ' <p> The official train set. 
</p> ' bibtex_ids: ['Wadden2020Scifact', 'Thakur2021Beir'] trec-covid: desc: ' <p> A version of the TREC COVID (complete) dataset, with titles and abstracts as documents. Queries are the question variant. </p> <p> Data pre-processing may be different than what is done in <a class="ds-ref">cord19/trec-covid</a>. </p> <ul> <li><a href="https://www.semanticscholar.org/cord19">Document collection site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> <li>See also: <a class="ds-ref">cord19/trec-covid</a></li> </ul> ' bibtex_ids: ['Wang2020Cord19', 'Voorhees2020TrecCovid', 'Thakur2021Beir'] webis-touche2020: desc: ' <p> Original version of the Touchè-2020 dataset, for argument retrieval. </p> <div class="warn"> Consider using <a class="ds-ref">beir/webis-touche2020/v2</a> instead; it uses an updated, more complete version of the qrels. </div> <ul> <li><a href="https://link.springer.com/chapter/10.1007%2F978-3-030-58219-7_26">Dataset paper</a></li> <li><a href="https://webis.de/events/touche-20/">Dataset webiste</a></li> </ul> ' bibtex_ids: ['Bondarenko2020Tuche', 'Thakur2021Beir'] webis-touche2020/v2: desc: ' <p> Version 2 of the Touchè-2020 dataset, for argument retrieval. This version uses the "corrected" version of the qrels, mapped to version 1 of the corpus. </p> <ul> <li><a href="https://link.springer.com/chapter/10.1007%2F978-3-030-58219-7_26">Dataset paper</a></li> <li><a href="https://webis.de/events/touche-20/">Dataset webiste</a></li> </ul> ' bibtex_ids: ['Bondarenko2020Tuche', 'Thakur2021Beir'] cqadupstack/android: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>android</kbd> StackExchange subforum. 
</p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/english: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>english</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/gaming: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>gaming</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/gis: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>gis</kbd> StackExchange subforum. 
</p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/mathematica: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>mathematica</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/physics: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>physics</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/programmers: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>programmers</kbd> StackExchange subforum. 
</p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/stats: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>stats</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/tex: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>tex</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/unix: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>unix</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/webmasters: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. 
This subset is from the <kbd>webmasters</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] cqadupstack/wordpress: desc: ' <p> A version of the CQADupStack dataset, for duplicate question retrieval. This subset is from the <kbd>wordpress</kbd> StackExchange subforum. </p> <ul> <li><a href="https://people.eng.unimelb.edu.au/tbaldwin/pubs/adcs2015.pdf">Dataset paper</a></li> <li><a href="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/">Dataset website</a></li> <li><a href="https://github.com/D1Doris/CQADupStack">Dataset repository</a></li> </ul> ' bibtex_ids: ['Hoogeveen2015CqaDupStack', 'Thakur2021Beir'] ================================================ FILE: ir_datasets/docs/bibliography.bib ================================================ @inproceedings{Hashemi2020Antique, title={ANTIQUE: A Non-Factoid Question Answering Benchmark}, author={Helia Hashemi and Mohammad Aliannejadi and Hamed Zamani and Bruce Croft}, booktitle={ECIR}, year={2020} } @misc{Graff2002Aquaint, title={The AQUAINT Corpus of English News Text}, author={David Graff}, year={2002}, url={https://catalog.ldc.upenn.edu/LDC2002T31}, publisher={Linguistic Data Consortium} } @inproceedings{Voorhees2005Robust, title={Overview of the TREC 2005 Robust Retrieval Track}, author={Ellen M. 
Voorhees}, booktitle={TREC}, year={2005} } @inproceedings{Dalton2019Cast, title={CAsT 2019: The Conversational Assistance Track Overview}, author={Jeffrey Dalton and Chenyan Xiong and Jamie Callan}, booktitle={TREC}, year={2019} } @inproceedings{Dalton2020Cast, title={CAsT 2020: The Conversational Assistance Track Overview}, author={Jeffrey Dalton and Chenyan Xiong and Jamie Callan}, booktitle={TREC}, year={2020} } @article{Thakur2021Beir, title = "BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models", author = "Thakur, Nandan and Reimers, Nils and Rücklé, Andreas and Srivastava, Abhishek and Gurevych, Iryna", journal= "arXiv preprint arXiv:2104.08663", month = "4", year = "2021", url = "https://arxiv.org/abs/2104.08663", } @inproceedings{Wachsmuth2018Arguana, author = "Wachsmuth, Henning and Syed, Shahbaz and Stein, Benno", title = "Retrieval of the Best Counterargument without Prior Topic Knowledge", booktitle = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", year = "2018", publisher = "Association for Computational Linguistics", location = "Melbourne, Australia", pages = "241--251", url = "http://aclweb.org/anthology/P18-1023" } @article{Diggelmann2020CLIMATEFEVERAD, title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, author={T. Diggelmann and Jordan L. Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, journal={ArXiv}, year={2020}, volume={abs/2012.00614} } @article{Hasibi2017DBpediaEntityVA, title={DBpedia-Entity v2: A Test Collection for Entity Search}, author={Faegheh Hasibi and Fedor Nikolaev and Chenyan Xiong and K. Balog and S. E. Bratsberg and Alexander Kotov and J. 
Callan}, journal={Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, year={2017} } @inproceedings{Thorne2018Fever, title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification", author = "Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit", booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", month = jun, year = "2018", address = "New Orleans, Louisiana", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/N18-1074", doi = "10.18653/v1/N18-1074", pages = "809--819" } @article{Maia2018Fiqa, title={WWW'18 Open Challenge: Financial Opinion Mining and Question Answering}, author={Macedo Maia and S. Handschuh and A. Freitas and Brian Davis and R. McDermott and M. Zarrouk and A. Balahur}, journal={Companion Proceedings of the The Web Conference 2018}, year={2018} } @inproceedings{Yang2018Hotpotqa, title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", author = "Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.", booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", month = oct # "-" # nov, year = "2018", address = "Brussels, Belgium", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/D18-1259", doi = "10.18653/v1/D18-1259", pages = "2369--2380" } @inproceedings{Bajaj2016Msmarco, title={MS MARCO: A Human Generated MAchine Reading COmprehension Dataset}, author={Payal Bajaj, Daniel Campos, Nick Craswell, Li Deng, Jianfeng Gao, Xiaodong Liu, Rangan Majumder, Andrew McNamara, Bhaskar Mitra, Tri Nguyen, Mir Rosenberg, Xia Song, Alina Stoica, Saurabh Tiwary, Tong Wang}, 
booktitle={InCoCo@NIPS}, year={2016} } @inproceedings{Craswell2019TrecDl, title={Overview of the TREC 2019 deep learning track}, author={Nick Craswell and Bhaskar Mitra and Emine Yilmaz and Daniel Campos and Ellen Voorhees}, booktitle={TREC 2019}, year={2019} } @inproceedings{Boteva2016Nfcorpus, title="A Full-Text Learning to Rank Dataset for Medical Information Retrieval", author = "Vera Boteva and Demian Gholipour and Artem Sokolov and Stefan Riezler", booktitle = "Proceedings of the European Conference on Information Retrieval ({ECIR})", location = "Padova, Italy", publisher = "Springer", year = 2016 } @article{Kwiatkowski2019Nq, title = {Natural Questions: a Benchmark for Question Answering Research}, author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov}, year = {2019}, journal = {TACL} } @inproceedings{Cohan2020Scidocs, title = "{SPECTER}: Document-level Representation Learning using Citation-informed Transformers", author = "Cohan, Arman and Feldman, Sergey and Beltagy, Iz and Downey, Doug and Weld, Daniel", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.207", doi = "10.18653/v1/2020.acl-main.207", pages = "2270--2282" } @inproceedings{Wadden2020Scifact, title = "Fact or Fiction: Verifying Scientific Claims", author = "Wadden, David and Lin, Shanchuan and Lo, Kyle and Wang, Lucy Lu and van Zuylen, Madeleine and Cohan, Arman and Hajishirzi, Hannaneh", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month 
= nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.emnlp-main.609", doi = "10.18653/v1/2020.emnlp-main.609", pages = "7534--7550" } @article{Wang2020Cord19, title={CORD-19: The Covid-19 Open Research Dataset}, author={Lucy Lu Wang and Kyle Lo and Yoganand Chandrasekhar and Russell Reas and Jiangjiang Yang and Darrin Eide and K. Funk and Rodney Michael Kinney and Ziyang Liu and W. Merrill and P. Mooney and D. Murdick and Devvret Rishi and Jerry Sheehan and Zhihong Shen and B. Stilson and A. Wade and K. Wang and Christopher Wilhelm and Boya Xie and D. Raymond and Daniel S. Weld and Oren Etzioni and Sebastian Kohlmeier}, journal={ArXiv}, year={2020} } @article{Voorhees2020TrecCovid, title={TREC-COVID: Constructing a Pandemic Information Retrieval Test Collection}, author={E. Voorhees and Tasmeer Alam and Steven Bedrick and Dina Demner-Fushman and W. Hersh and Kyle Lo and Kirk Roberts and I. Soboroff and Lucy Lu Wang}, journal={ArXiv}, year={2020}, volume={abs/2005.04474} } @inproceedings{Bondarenko2020Tuche, title={Overview of Touch{\'e} 2020: Argument Retrieval}, author={Alexander Bondarenko and Maik Fr{\"o}be and Meriem Beloucif and Lukas Gienapp and Yamen Ajjour and Alexander Panchenko and Christian Biemann and Benno Stein and Henning Wachsmuth and Martin Potthast and Matthias Hagen}, booktitle={CLEF}, year={2020} } @article{Hoogeveen2015CqaDupStack, title={{CQADupStack}: A Benchmark Data Set for Community Question-Answering Research}, author={D. Hoogeveen and Karin M. 
Verspoor and Timothy Baldwin}, journal={Proceedings of the 20th Australasian Document Computing Symposium}, year={2015} } @article{Dietz2017Car, title={{TREC CAR}: A Data Set for Complex Answer Retrieval}, author={Laura Dietz and Ben Gamari}, year={2017}, note={Version 1.5}, url={http://trec-car.cs.unh.edu} } @inproceedings{Dietz2017TrecCar, title={TREC Complex Answer Retrieval Overview.}, author={Dietz, Laura and Verma, Manisha and Radlinski, Filip and Craswell, Nick}, booktitle={TREC}, year={2017} } @inproceedings{Nanni2017BenchmarkCar, title={Benchmark for complex answer retrieval}, author={Nanni, Federico and Mitra, Bhaskar and Magnusson, Matt and Dietz, Laura}, booktitle={ICTIR}, year={2017} } @inproceedings{Roberts2017TrecPm, title={Overview of the TREC 2017 Precision Medicine Track}, author={Kirk Roberts and Dina Demner-Fushman and Ellen Voorhees and William R. Hersh and Steven Bedrick and Alexander J. Lazar and Shubham Pant}, booktitle={TREC}, year={2017} } @inproceedings{Roberts2018TrecPm, title={Overview of the TREC 2018 Precision Medicine Track}, author={Kirk Roberts and Dina Demner-Fushman and Ellen Voorhees and William R. Hersh and Steven Bedrick and Alexander J. Lazar}, booktitle={TREC}, year={2018} } @inproceedings{Roberts2019TrecPm, title={Overview of the TREC 2019 Precision Medicine Track}, author={Kirk Roberts and Dina Demner-Fushman and Ellen Voorhees and William R. Hersh and Steven Bedrick and Alexander J. 
Lazar and Shubham Pant and Funda Meric-Bernstam}, booktitle={TREC}, year={2019} } @inproceedings{Sun2020Clirmatrix, title = "{CLIRM}atrix: A massively large collection of bilingual and multilingual datasets for Cross-Lingual Information Retrieval", author = "Sun, Shuo and Duh, Kevin", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.emnlp-main.340", doi = "10.18653/v1/2020.emnlp-main.340", pages = "4160--4170" } @inproceedings{Clarke2009TrecWeb, title={Overview of the TREC 2009 Web Track}, author={Charles L. A. Clarke and Nick Craswell and Ian Soboroff}, booktitle={TREC}, year={2009} } @inproceedings{Clarke2010TrecWeb, title={Overview of the TREC 2010 Web Track}, author={Charles L. A. Clarke and Nick Craswell and Ian Soboroff and Gordon V. Cormack}, booktitle={TREC}, year={2010} } @inproceedings{Clarke2011TrecWeb, title={Overview of the TREC 2011 Web Track}, author={Charles L. A. Clarke and Nick Craswell and Ian Soboroff and Ellen M. Voorhees}, booktitle={TREC}, year={2011} } @inproceedings{Clarke2012TrecWeb, title={Overview of the TREC 2012 Web Track}, author={Charles L. A. Clarke and Nick Craswell and Ellen M. 
Voorhees}, booktitle={TREC}, year={2012} } @inproceedings{Carterette2009MQ, title={Million Query Track 2009 Overview}, author={Ben Carterette and Virgil Pavlu and Hui Fang and Evangelos Kanoulas}, booktitle={TREC}, year={2009} } @inproceedings{Zuccon2016ClefEhealth, title={The IR Task at the CLEF eHealth Evaluation Lab 2016: User-centred Health Information Retrieval}, author={Guido Zuccon and Joao Palotti and Lorraine Goeuriot and Liadh Kelly and Mihai Lupu and Pavel Pecina and Henning M{\"u}ller and Julie Budaher and Anthony Deacon}, booktitle={CLEF}, year={2016} } @inproceedings{Palotti2017ClefEhealth, title={CLEF 2017 Task Overview: The IR Task at the eHealth Evaluation Lab - Evaluating Retrieval Methods for Consumer Health Search}, author={Joao Palotti and Guido Zuccon and Jimmy and Pavel Pecina and Mihai Lupu and Lorraine Goeuriot and Liadh Kelly and Allan Hanbury}, booktitle={CLEF}, year={2017} } @inproceedings{Abualsaud2019TrecDecision, title={Overview of the TREC 2019 Decision Track}, author={Mustafa Abualsaud and Christina Lioma and Maria Maistro and Mark D. Smucker and Guido Zuccon}, booktitle={TREC}, year={2019} } @inproceedings{Mao2018OWww2, title={Overview of the NTCIR-14 We Want Web Task}, author={Jiaxin Mao and Tetsuya Sakai and Cheng Luo and Peng Xiao and Yiqun Liu and Zhicheng Dou}, booktitle={NTCIR}, year={2018} } @inproceedings{Luo2017Www1, title={Overview of the NTCIR-13 We Want Web Task}, author={Cheng Luo and Tetsuya Sakai and Yiqun Liu and Zhicheng Dou and Chenyan Xiong and Jingfang Xu}, booktitle={NTCIR}, year={2017} } @inproceedings{CollinsThompson2014TrecWeb, title={TREC 2014 Web Track Overview}, author={Kevyn Collins-Thompson and Craig Macdonald and Paul Bennett and Fernando Diaz and Ellen M. Voorhees}, booktitle={TREC}, year={2014} } @inproceedings{CollinsThompson2013TrecWeb, title={TREC 2013 Web Track Overview}, author={Kevyn Collins-Thompson and Paul Bennett and Fernando Diaz and Charles L. A. Clarke and Ellen M. 
Voorhees}, booktitle={TREC}, year={2013} } @article{Husain2019CodeSearchNet, title={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search}, author={Hamel Husain and Ho-Hsiang Wu and Tiferet Gazit and Miltiadis Allamanis and Marc Brockschmidt}, journal={ArXiv}, year={2019} } @misc{Karpukhin2020Dpr, title={Dense Passage Retrieval for Open-Domain Question Answering}, author={Vladimir Karpukhin and Barlas Oğuz and Sewon Min and Patrick Lewis and Ledell Wu and Sergey Edunov and Danqi Chen and Wen-tau Yih}, year={2020}, eprint={2004.04906}, archivePrefix={arXiv}, primaryClass={cs.CL} } @inproceedings{Joshi2017TriviaQA, title={TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, author={Mandar Joshi and Eunsol Choi and Daniel S. Weld and Luke Zettlemoyer}, booktitle={ACL}, year={2017} } @inproceedings{Craswell2002TrecWeb, title={Overview of the TREC-2002 Web Track}, author={Nick Craswell and David Hawking}, booktitle={TREC}, year={2002} } @inproceedings{Craswell2003TrecWeb, title={Overview of the TREC 2003 Web Track}, author={Nick Craswell and David Hawking and Ross Wilkinson and Mingfang Wu}, booktitle={TREC}, year={2003} } @inproceedings{Craswell2004TrecWeb, title={Overview of the TREC-2004 Web Track}, author={Nick Craswell and David Hawking}, booktitle={TREC}, year={2004} } @inproceedings{Clarke2004TrecTerabyte, title={Overview of the TREC 2004 Terabyte Track}, author={Charles Clarke and Nick Craswell and Ian Soboroff}, booktitle={TREC}, year={2004} } @inproceedings{Clarke2005TrecTerabyte, title={The TREC 2005 Terabyte Track}, author={Charles L. A. Clark and Falk Scholer and Ian Soboroff}, booktitle={TREC}, year={2005} } @inproceedings{Buttcher2006TrecTerabyte, title={The TREC 2006 Terabyte Track}, author={Stefan B\"uttcher and Charles L. A. 
Clarke and Ian Soboroff}, booktitle={TREC}, year={2006} } @inproceedings{Allen2007MQ, title={Million Query Track 2007 Overview}, author={James Allan and Ben Carterette and Javed A. Aslam and Virgil Pavlu and Blagovest Dachev and Evangelos Kanoulas}, booktitle={TREC}, year={2007} } @inproceedings{Allen2008MQ, title={Million Query Track 2008 Overview}, author={James Allan and Javed A. Aslam and Ben Carterette and Virgil Pavlu and Evangelos Kanoulas}, booktitle={TREC}, year={2008} } @inproceedings{Hersh2006TrecGenomics, title={TREC 2006 Genomics Track Overview}, author={William Hersh and Aaron M. Cohen and Phoebe Roberts and Hari Krishna Rekapalli}, booktitle={TREC}, year={2006} } @inproceedings{Hersh2007TrecGenomics, title={TREC 2007 Genomics Track Overview}, author={William Hersh and Aaron Cohen and Lynn Ruslen and Phoebe Roberts}, booktitle={TREC}, year={2007} } @inproceedings{Hersh2004TrecGenomics, title={TREC 2004 Genomics Track Overview}, author={William R. Hersh and Ravi Teja Bhuptiraju and Laura Ross and Phoebe Johnson and Aaron M. Cohen and Dale F. 
Kraemer}, booktitle={TREC}, year={2004} } @inproceedings{Hersh2005TrecGenomics, title={TREC 2005 Genomics Track Overview}, author={William Hersh and Aaron Cohen and Jianji Yang and Ravi Teja Bhupatiraju and Phoebe Roberts and Marti Hearst}, booktitle={TREC}, year={2007} } @article{Craswell2020Orcas, title={ORCAS: 18 Million Clicked Query-Document Pairs for Analyzing Search}, author={Craswell, Nick and Campos, Daniel and Mitra, Bhaskar and Yilmaz, Emine and Billerbeck, Bodo}, journal={arXiv preprint arXiv:2006.05324}, year={2020} } @inproceedings{Craswell2020TrecDl, title={Overview of the TREC 2020 deep learning track}, author={Nick Craswell and Bhaskar Mitra and Emine Yilmaz and Daniel Campos}, booktitle={TREC}, year={2020} } @article{Mackie2021DlHard, title={How Deep is your Learning: the DL-HARD Annotated Deep Learning Dataset}, author={Iain Mackie and Jeffrey Dalton and Andrew Yates}, journal={ArXiv}, year={2021}, volume={abs/2105.07975} } @inproceedings{MacAvaney2020MedMarco, author = {MacAvaney, Sean and Cohan, Arman and Goharian, Nazli}, title = {SLEDGE-Zero: A Zero-Shot Baseline for COVID-19 Literature Search}, booktitle = {EMNLP}, year = {2020} } @article{Sandhaus2008Nyt, title={The new york times annotated corpus}, author={Sandhaus, Evan}, journal={Linguistic Data Consortium, Philadelphia}, volume={6}, number={12}, pages={e26752}, year={2008} } @inproceedings{Allan2017TrecCore, author = {James Allan and Donna Harman and Evangelos Kanoulas and Dan Li and Christophe Van Gysel and Ellen Vorhees}, title = {TREC 2017 Common Core Track Overview}, booktitle = {TREC}, year = {2017} } @inproceedings{MacAvaney2019Wksup, author = {MacAvaney, Sean and Yates, Andrew and Hui, Kai and Frieder, Ophir}, title = {Content-Based Weak Supervision for Ad-Hoc Re-Ranking}, booktitle = {SIGIR}, year = {2019} } @inproceedings{Simpson2014TrecCds, title={Overview of the TREC 2014 Clinical Decision Support Track}, author={Matthew S. Simpson and Ellen M. 
Voorhees and William Hersh}, booktitle={TREC}, year={2014} } @inproceedings{Roberts2015TrecCds, title={Overview of the TREC 2015 Clinical Decision Support Track}, author={Kirk Roberts and Matthew S. Simpson and Ellen Voorhees and William R. Hersh}, booktitle={TREC}, year={2015} } @inproceedings{Roberts2016TrecCds, title={Overview of the TREC 2016 Clinical Decision Support Track}, author={Kirk Roberts and Dina Demner-Fushman and Ellen M. Voorhees and William R. Hersh}, booktitle={TREC}, year={2016} } @misc{Graff2001Arabic, title={Arabic Newswire Part 1 LDC2001T55}, author={Graff, David, and Walker, Kevin}, year={2001}, url={https://catalog.ldc.upenn.edu/LDC2001T55}, publisher={Linguistic Data Consortium} } @inproceedings{Gey2001Arabic, title={The TREC-2001 Cross-Language Information Retrieval Track: Searching Arabic using English, French or Arabic Queries}, author={Fredric Gey and Douglas Oard}, booktitle={TREC}, year={2001} } @inproceedings{Gey2002Arabic, title={The TREC-2002 Arabic/English CLIR Track}, author={Fredric Gey and Douglas Oard}, booktitle={TREC}, year={2002} } @misc{Rogers2000Mandarin, title={TREC Mandarin LDC2000T52}, author={Rogers, Willie}, year={2000}, url={https://catalog.ldc.upenn.edu/LDC2000T52}, publisher={Linguistic Data Consortium} } @inproceedings{Harman1997Chinese, title={Spanish and Chinese Document Retrieval in TREC-5}, author={Alan Smeaton and Ross Wilkinson}, booktitle={TREC}, year={1996} } @inproceedings{Wilkinson1998Chinese, title={Chinese Document Retrieval at TREC-6}, author={Ross Wilkinson}, booktitle={TREC}, year={1997} } @inproceedings{Voorhees2004Robust, title={Overview of the TREC 2004 Robust Retrieval Track}, author={Ellen Voorhees}, booktitle={TREC}, year={2004} } @inproceedings{Huston2014ACO, title={A Comparison of Retrieval Models using Term Dependencies}, author={Samuel Huston and W. 
Bruce Croft}, booktitle={CIKM}, year={2014} } @misc{Rogers2000Spanish, title={TREC Spanish LDC2000T51}, author={Rogers, Willie}, year={2000}, url={https://catalog.ldc.upenn.edu/LDC2000T51}, publisher={Linguistic Data Consortium} } @inproceedings{Harman1994Trec3, title={Overview of the Third Text REtrieval Conference (TREC-3)}, author={Donna Harman}, booktitle={TREC}, year={1994} } @inproceedings{Harman1995Trec4, title={Overview of the Fourth Text REtrieval Conference (TREC-4)}, author={Donna Harman}, booktitle={TREC}, year={1995} } @inproceedings{Rekabsaz2021TripClick, title={TripClick: The Log Files of a Large Health Web Search Engine}, author={Navid Rekabsaz and Oleg Lesota and Markus Schedl and Jon Brassey and Carsten Eickhoff}, year={2021}, booktitle={SIGIR} } @inproceedings{Hofstaetter2022TripClick, title={Establishing Strong Baselines for TripClick Health Retrieval}, author={Sebastian Hofst\"atter and Sophia Althammer and Mete Sertkan and Allan Hanbury}, year={2022}, booktitle={ECIR} } @inproceedings{Sequiera2017TweetsIA, title={Finally, a Downloadable Test Collection of Tweets}, author={Royal Sequiera and Jimmy Lin}, booktitle={SIGIR}, year={2017} } @inproceedings{Lin2013Microblog, title={Overview of the TREC-2013 Microblog Track}, author={Jimmy Lin and Miles Efron}, booktitle={TREC}, year={2013} } @inproceedings{Lin2014Microblog, title={Overview of the TREC-2014 Microblog Track}, author={Jimmy Lin and Miles Efron and Yulu Wang and Garrick Sherman}, booktitle={TREC}, year={2014} } @inproceedings{Soboroff2018News, title={TREC 2018 News Track Overview}, author={Ian Soboroff and Shudong Huang and Donna Harman}, booktitle={TREC}, year={2018} } @inproceedings{Soboroff2019News, title={TREC 2019 News Track Overview}, author={Ian Soboroff and Shudong Huang and Donna Harman}, booktitle={TREC}, year={2019} } @inproceedings{Frej2020Wikir, title={WIKIR: A Python toolkit for building a large-scale Wikipedia-based English Information Retrieval Dataset}, author={Jibril 
Frej and Didier Schwab and Jean-Pierre Chevallet}, booktitle={LREC}, year={2020} } @inproceedings{Frej2020MlWikir, title={MLWIKIR: A Python Toolkit for Building Large-scale Wikipedia-based Information Retrieval Datasets in Chinese, English, French, Italian, Japanese, Spanish and More}, author={Jibril Frej and Didier Schwab and Jean-Pierre Chevallet}, booktitle={CIRCLE}, year={2020} } @article{Zhang2021MrTyDi, title={{Mr. TyDi}: A Multi-lingual Benchmark for Dense Retrieval}, author={Xinyu Zhang and Xueguang Ma and Peng Shi and Jimmy Lin}, year={2021}, journal={arXiv:2108.08787}, } @article{Clark2020TyDiQa, title={{TyDi QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages}, author={Jonathan H. Clark and Eunsol Choi and Michael Collins and Dan Garrette and Tom Kwiatkowski and Vitaly Nikolaev and Jennimaria Palomaki}, year={2020}, journal={Transactions of the Association for Computational Linguistics} } @article{Bonifacio2021MMarco, title={{mMARCO}: A Multilingual Version of {MS MARCO} Passage Ranking Dataset}, author={Luiz Henrique Bonifacio and Israel Campiotti and Roberto Lotufo and Rodrigo Nogueira}, year={2021}, journal={arXiv:2108.13897} } @inproceedings{Pass2006Picture, title={A picture of search}, author={Pass, Greg and Chowdhury, Abdur and Torgeson, Cayley}, booktitle={InfoScale}, year={2006} } @inproceedings{MacAvaney2022Reproducing, author={MacAvaney, Sean and Macdonald, Craig and Ounis, Iadh}, title={Reproducing Personalised Session Search over the AOL Query Log}, booktitle={ECIR}, year={2022} } @inproceedings{Wachsmuth2017Argument, author = {Henning Wachsmuth and Martin Potthast and Khalid Al-Khatib and Yamen Ajjour and Jana Puschmann and Jiani Qu and Jonas Dorsch and Viorel Morari and Janek Bevendorff and Benno Stein}, booktitle = {4th Workshop on Argument Mining (ArgMining 2017) at EMNLP}, editor = {Kevin Ashley and Claire Cardie and Nancy Green and Iryna Gurevych and Ivan Habernal and Diane Litman and Georgios 
Petasis and Chris Reed and Noam Slonim and Vern Walker}, month = sep, pages = {49-59}, publisher = {Association for Computational Linguistics}, site = {Copenhagen, Denmark}, title = {{Building an Argument Search Engine for the Web}}, url = {https://www.aclweb.org/anthology/W17-5106}, year = 2017 } @inproceedings{Ajjour2019Acquisition, address = {Berlin Heidelberg New York}, author = {Yamen Ajjour and Henning Wachsmuth and Johannes Kiesel and Martin Potthast and Matthias Hagen and Benno Stein}, booktitle = {42nd German Conference on Artificial Intelligence (KI 2019)}, doi = {10.1007/978-3-030-30179-8\_4}, editor = {Christoph Benzm{\"u}ller and Heiner Stuckenschmidt}, month = sep, pages = {48-59}, publisher = {Springer}, site = {Kassel, Germany}, title = {{Data Acquisition for Argument Search: The args.me corpus}}, year = 2019 } @inproceedings{Bondarenko2020Touche, address = {Berlin Heidelberg New York}, author = {Alexander Bondarenko and Maik Fr{\"o}be and Meriem Beloucif and Lukas Gienapp and Yamen Ajjour and Alexander Panchenko and Chris Biemann and Benno Stein and Henning Wachsmuth and Martin Potthast and Matthias Hagen}, booktitle = {Experimental IR Meets Multilinguality, Multimodality, and Interaction. 
11th International Conference of the CLEF Association (CLEF 2020)}, doi = {10.1007/978-3-030-58219-7\_26}, editor = {Avi Arampatzis and Evangelos Kanoulas and Theodora Tsikrika and Stefanos Vrochidis and Hideo Joho and Christina Lioma and Carsten Eickhoff and Aur{\'e}lie N{\'e}v{\'e}ol and Linda Cappellato and Nicola Ferro}, month = sep, pages = {384-395}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, site = {Thessaloniki, Greece}, title = {{Overview of Touch{\'e} 2020: Argument Retrieval}}, url = {https://link.springer.com/chapter/10.1007/978-3-030-58219-7_26}, volume = 12260, year = 2020, } @inproceedings{Wachsmuth2017Quality, author = {Henning Wachsmuth and Nona Naderi and Yufang Hou and Yonatan Bilu and Vinodkumar Prabhakaran and Tim Alberdingk Thijm and Graeme Hirst and Benno Stein}, booktitle = {15th Conference of the European Chapter of the Association for Computational Linguistics (EACL 2017)}, editor = {Phil Blunsom and Alexander Koller and Mirella Lapata}, month = apr, pages = {176-187}, site = {Valencia, Spain}, title = {{Computational Argumentation Quality Assessment in Natural Language}}, url = {http://aclweb.org/anthology/E17-1017}, year = 2017 } @inproceedings{Braunstain2016Support, author = {Liora Braunstain and Oren Kurland and David Carmel and Idan Szpektor and Anna Shtok}, editor = {Nicola Ferro and Fabio Crestani and Marie{-}Francine Moens and Josiane Mothe and Fabrizio Silvestri and Giorgio Maria Di Nunzio and Claudia Hauff and Gianmaria Silvello}, title = {Supporting Human Answers for Advice-Seeking Questions in {CQA} Sites}, booktitle = {Advances in Information Retrieval - 38th European Conference on {IR} Research, {ECIR} 2016, Padua, Italy, March 20-23, 2016. 
Proceedings}, series = {Lecture Notes in Computer Science}, volume = {9626}, pages = {129--141}, publisher = {Springer}, year = {2016}, doi = {10.1007/978-3-319-30671-1\_10}, } @inproceedings{Rafalak2014Credibility, author = {Maria Rafalak and Katarzyna Abramczuk and Adam Wierzbicki}, editor = {Chin{-}Wan Chung and Andrei Z. Broder and Kyuseok Shim and Torsten Suel}, title = {Incredible: is (almost) all web content trustworthy? analysis of psychological factors related to website credibility evaluation}, booktitle = {23rd International World Wide Web Conference, {WWW} '14, Seoul, Republic of Korea, April 7-11, 2014, Companion Volume}, pages = {1117--1122}, publisher = {{ACM}}, year = {2014}, doi = {10.1145/2567948.2578997}, } @inproceedings{Bondarenko2021Touche, address = {Berlin Heidelberg New York}, author = {Alexander Bondarenko and Lukas Gienapp and Maik Fr{\"o}be and Meriem Beloucif and Yamen Ajjour and Alexander Panchenko and Chris Biemann and Benno Stein and Henning Wachsmuth and Martin Potthast and Matthias Hagen}, booktitle = {Experimental IR Meets Multilinguality, Multimodality, and Interaction. 12th International Conference of the CLEF Association (CLEF 2021)}, doi = {10.1007/978-3-030-85251-1\_28}, editor = {{K. Sel{\c{c}}uk} Candan and Bogdan Ionescu and Lorraine Goeuriot and Henning M{\"u}ller and Alexis Joly and Maria Maistro and Florina Piroi and Guglielmo Faggioli and Nicola Ferro}, month = sep, pages = {450-467}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, site = {Bucharest, Romania}, title = {{Overview of Touch{\'e} 2021: Argument Retrieval}}, url = {https://link.springer.com/chapter/10.1007/978-3-030-85251-1_28}, volume = 12880, year = 2021, } @inproceedings{Froebe2022Anchors, address = {Berlin Heidelberg New York}, author = {Maik Fr{\"o}be and Sebastian G{\"u}nther and Maximilian Probst and Martin Potthast and Matthias Hagen}, booktitle = {Advances in Information Retrieval. 
44th European Conference on IR Research (ECIR 2022)}, month = apr, publisher = {Springer}, series = {Lecture Notes in Computer Science}, site = {Stavanger, Norway}, title = {{The Power of Anchor Text in the Neural Retrieval Era}}, year = 2022 } @article{Santhanam2021ColBERTv2, title = "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction", author = "Keshav Santhanam and Omar Khattab and Jon Saad-Falcon and Christopher Potts and Matei Zaharia", journal= "arXiv preprint arXiv:2112.01488", year = "2021", url = "https://arxiv.org/abs/2112.01488" } @inproceedings{petroni-etal-2021-kilt, title = "{KILT}: a Benchmark for Knowledge Intensive Language Tasks", author = {Petroni, Fabio and Piktus, Aleksandra and Fan, Angela and Lewis, Patrick and Yazdani, Majid and De Cao, Nicola and Thorne, James and Jernite, Yacine and Karpukhin, Vladimir and Maillard, Jean and Plachouras, Vassilis and Rockt{\"a}schel, Tim and Riedel, Sebastian}, booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", month = "jun", year = "2021", address = "Online", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2021.naacl-main.200", doi = "10.18653/v1/2021.naacl-main.200", pages = "2523--2544", } @misc{Voorhees1996Disks45, title = {NIST TREC Disks 4 and 5: Retrieval Test Collections Document Set}, author = {Ellen M. Voorhees}, doi = {10.18434/t47g6m}, year = {1996}, publisher = {National Institute of Standards and Technology} } @inproceedings{Voorhees1998Trec7, title = {Overview of the Seventh Text Retrieval Conference (TREC-7)}, author = {Ellen M. Voorhees and Donna Harman}, year = {1998}, booktitle = {TREC} } @inproceedings{Voorhees1999Trec8, title = {Overview of the Eight Text Retrieval Conference (TREC-8)}, author = {Ellen M. 
Voorhees and Donna Harman}, year = {1999}, booktitle = {TREC} } @inproceedings{sasaki-etal-2018-cross, title = "Cross-Lingual Learning-to-Rank with Shared Representations", author = "Sasaki, Shota and Sun, Shuo and Schamoni, Shigehiko and Duh, Kevin and Inui, Kentaro", booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)", month = jun, year = "2018", address = "New Orleans, Louisiana", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/N18-2073", doi = "10.18653/v1/N18-2073", pages = "458--463" } @inproceedings{mackie2022codec, title={CODEC: Complex Document and Entity Collection}, author={Mackie, Iain and Owoicho, Paul and Gemmell, Carlos and Fischer, Sophie and MacAvaney, Sean and Dalton, Jeffery}, booktitle={Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval}, year={2022} } @inproceedings{Dato2022Istella, title={The Istella22 Dataset: Bridging Traditional and Neural Learning to Rank Evaluation}, author={Domenico Dato, Sean MacAvaney, Franco Maria Nardini, Raffaele Perego, Nicola Tonellotto}, booktitle={Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval}, year={2022} } @inproceedings{Bondarenko2022Touche, address = {Berlin Heidelberg New York}, author = {Alexander Bondarenko and Maik Fr{\"o}be and Johannes Kiesel and Shahbaz Syed and Timon Gurcke and Meriem Beloucif and Alexander Panchenko and Chris Biemann and Benno Stein and Henning Wachsmuth and Martin Potthast and Matthias Hagen}, booktitle = {Experimental IR Meets Multilinguality, Multimodality, and Interaction. 
13th International Conference of the CLEF Association (CLEF 2022)}, editor = {Alberto Barr{\'o}n-Cede{\~n}o and Giovanni Da San Martino and Mirko Degli Esposti and Fabrizio Sebastiani and Craig Macdonald and Gabriella Pasi and Allan Hanbury and Martin Potthast and Guglielmo Faggioli and Nicola Ferro}, month = sep, numpages = 29, publisher = {Springer}, series = {Lecture Notes in Computer Science}, site = {Bologna, Italy}, title = {{Overview of Touch{\'e} 2022: Argument Retrieval}}, year = 2022 } @inproceedings{Kiesel2021Image, author = {Johannes Kiesel and Nico Reichenbach and Benno Stein and Martin Potthast}, booktitle = {8th Workshop on Argument Mining (ArgMining 2021) at EMNLP}, doi = {10.18653/v1/2021.argmining-1.4}, editor = {Khalid Al-Khatib and Yufang Hou and Manfred Stede}, month = nov, pages = {36-45}, publisher = {Association for Computational Linguistics}, site = {Punta Cana, Dominican Republic}, title = {{Image Retrieval for Arguments Using Stance-Aware Query Expansion}}, url = {https://aclanthology.org/2021.argmining-1.4/}, year = 2021 } @inproceedings{Dimitrov2021SemEval, author = {Dimitar Dimitrov and Bishr Bin Ali and Shaden Shaar and Firoj Alam and Fabrizio Silvestri and Hamed Firooz and Preslav Nakov and Giovanni Da San Martino}, editor = {Alexis Palmer and Nathan Schneider and Natalie Schluter and Guy Emerson and Aur{\'{e}}lie Herbelot and Xiaodan Zhu}, title = {SemEval-2021 Task 6: Detection of Persuasion Techniques in Texts and Images}, booktitle = {Proceedings of the 15th International Workshop on Semantic Evaluation, SemEval@ACL/IJCNLP 2021, Virtual Event / Bangkok, Thailand, August 5-6, 2021}, pages = {70--98}, publisher = {Association for Computational Linguistics}, year = {2021}, doi = {10.18653/v1/2021.semeval-1.7}, } @inproceedings{Yanai2007Image, author = {Keiji Yanai}, editor = {Carey L. Williamson and Mary Ellen Zurko and Peter F. Patel{-}Schneider and Prashant J. 
Shenoy}, title = {Image collector {III:} a web image-gathering system with bag-of-keypoints}, booktitle = {Proceedings of the 16th International Conference on World Wide Web, {WWW} 2007, Banff, Alberta, Canada, May 8-12, 2007}, pages = {1295--1296}, publisher = {{ACM}}, year = {2007}, doi = {10.1145/1242572.1242816}, } @article{Zhang2022Miracl, title={Making a MIRACL: Multilingual information retrieval across a continuum of languages}, author={Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, journal={arXiv preprint arXiv:2210.09984}, year={2022} } @inproceedings{Lawrie2022HC4, author = {Dawn Lawrie and James Mayfield and Douglas W. Oard and Eugene Yang}, title = {HC4: A New Suite of Test Collections for Ad Hoc CLIR}, booktitle = {{Advances in Information Retrieval. 44th European Conference on IR Research (ECIR 2022)}}, year = {2022}, month = apr, publisher = {Springer}, series = {Lecture Notes in Computer Science}, site = {Stavanger, Norway}, url = {https://arxiv.org/abs/2201.09992} } ================================================ FILE: ir_datasets/docs/c4.yaml ================================================ _: pretty_name: 'C4' desc: ' <p> A version of <a href="https://www.tensorflow.org/datasets/catalog/c4">Google''s C4 dataset</a>, which consists of articles crawled from the web. </p> ' en-noclean-tr: desc: ' <p> The "en-noclean" train subset of the corpus, consisting of ~1B documents written in English. Document IDs are assigned as proposed by the <a href="https://trec-health-misinfo.github.io/"> TREC Health Misinformation 2021 track</a>. </p> ' en-noclean-tr/trec-misinfo-2021: desc: ' <p> The TREC Health Misinformation 2021 track. 
</p> <ul> <li><a href="https://trec-health-misinfo.github.io/">Shared Task Website</a> </ul> ' ================================================ FILE: ir_datasets/docs/car.yaml ================================================ _: pretty_name: 'TREC CAR' desc: ' <p> An ad-hoc passage retrieval collection, constructed from Wikipedia and used as the basis of the TREC Complex Answer Retrieval (CAR) task. </p> ' v1.5: desc: ' <p> Version 1.5 of the TREC dataset. This version is used for year 1 (2017) of the TREC CAR shared task. </p> ' bibtex_ids: ['Dietz2017Car'] v1.5/trec-y1: desc: ' <p> Official test set of TREC CAR 2017 (year 1). </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/trec-y1/manual: desc: ' <p> Official test set of TREC CAR 2017 (year 1), using manual graded relevance judgments. </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/trec-y1/auto: desc: ' <p> Official test set of TREC CAR 2017 (year 1), using automatic relevance judgments (assumed from hierarchical structure of pages, i.e., paragraphs under a header are assumed relevant.) </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/test200: desc: ' <p> Un-official test set consisting of manually-selected articles. Sometimes used as a validation set. </p> ' bibtex_ids: ['Nanni2017BenchmarkCar', 'Dietz2017Car'] v1.5/train/fold0: desc: ' <p> Fold 0 of the official large training set for TREC CAR 2017. Relevance assumed from hierarchical structure of pages (i.e., paragraphs under a header are assumed relevant.) </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/train/fold1: desc: ' <p> Fold 1 of the official large training set for TREC CAR 2017. Relevance assumed from hierarchical structure of pages (i.e., paragraphs under a header are assumed relevant.) </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/train/fold2: desc: ' <p> Fold 2 of the official large training set for TREC CAR 2017. 
Relevance assumed from hierarchical structure of pages (i.e., paragraphs under a header are assumed relevant.) </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/train/fold3: desc: ' <p> Fold 3 of the official large training set for TREC CAR 2017. Relevance assumed from hierarchical structure of pages (i.e., paragraphs under a header are assumed relevant.) </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v1.5/train/fold4: desc: ' <p> Fold 4 of the official large training set for TREC CAR 2017. Relevance assumed from hierarchical structure of pages (i.e., paragraphs under a header are assumed relevant.) </p> ' bibtex_ids: ['Dietz2017TrecCar', 'Dietz2017Car'] v2.0: desc: ' <p> Version 2.0 of the TREC CAR dataset. </p> ' bibtex_ids: ['Dietz2017Car'] ================================================ FILE: ir_datasets/docs/clinicaltrials.yaml ================================================ _: pretty_name: 'Clinical Trials' desc: ' <p> Clinical trial information from <a href="https://clinicaltrials.gov/">ClinicalTrials.gov</a>. Used for the Clinical Trials subtasks in TREC Precision Medicine. </p> <ul> <li><a href="http://www.trec-cds.org/">TREC Precision Medicine website</a></li> </ul> ' 2017: desc: ' <p> A snapshot of <a href="https://clinicaltrials.gov/">ClinicalTrials.gov</a> from April 2017 for use with the <a class="ds-ref">clinicaltrials/2017/trec-pm-2017</a> and <a class="ds-ref">clinicaltrials/2017/trec-pm-2018</a> Clinical Trials subtasks. </p> <ul> <li><a href="http://www.trec-cds.org/2017.html#documents">Dataset information</a></li> </ul> ' trec-pm-2017: desc: ' <p> The TREC 2017 Precision Medicine clinical trials subtask. 
</p> <ul> <li><a href="http://www.trec-cds.org/2017.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec26/papers/Overview-PM.pdf">Shared task paper</a></li> <li>See also: <a class="ds-ref">medline/2017/trec-pm-2017</a></li> </ul> ' bibtex_ids: ['Roberts2017TrecPm'] trec-pm-2018: desc: ' <p> The TREC 2018 Precision Medicine clinical trials subtask. </p> <ul> <li><a href="http://www.trec-cds.org/2018.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec27/papers/Overview-PM.pdf">Shared task paper</a></li> <li>See also: <a class="ds-ref">medline/2017/trec-pm-2018</a></li> </ul> ' bibtex_ids: ['Roberts2018TrecPm'] 2019: desc: ' <p> A snapshot of <a href="https://clinicaltrials.gov/">ClinicalTrials.gov</a> from May 2019 for use with the <a class="ds-ref">clinicaltrials/2019/trec-pm-2019</a> Clinical Trials subtask. </p> <ul> <li><a href="http://www.trec-cds.org/2019.html#documents">Dataset information</a></li> </ul> ' trec-pm-2019: desc: ' <p> The TREC 2019 Precision Medicine clinical trials subtask. </p> <ul> <li><a href="http://www.trec-cds.org/2019.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.PM.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Roberts2019TrecPm'] 2021: desc: ' <p> A snapshot of <a href="https://clinicaltrials.gov/">ClinicalTrials.gov</a> from April 2021 for use with the <a href="http://www.trec-cds.org/2021.html">TREC Clinical Trials 2021 Track</a>. </p> <ul> <li><a href="http://www.trec-cds.org/2021.html#documents">Dataset information</a></li> </ul> ' trec-ct-2021: desc: ' <p> The TREC Clinical Trials 2021 track. </p> <ul> <li><a href="http://www.trec-cds.org/2021.html">Shared Task Website</a></li> </ul> ' trec-ct-2022: desc: ' <p> The TREC Clinical Trials 2022 track. 
</p> <ul> <li><a href="https://www.trec-cds.org/2022.html">Shared Task Website</a></li> </ul> ' ================================================ FILE: ir_datasets/docs/clirmatrix.yaml ================================================ _: pretty_name: 'CLIRMatrix' desc: ' <p> CLIRMatrix is a massively large collection of bilingual and multilingual datasets for Cross-Lingual Information Retrieval. </p> <p> With 139 languages, there are 19,182 total language pairs. This is too many to list individually in the catalog, so patterns are instead used to match the dataset. </p> <p> <kbd class="str">"clirmatrix/{lang}"</kbd> (e.g., <kbd class="str">"clirmatrix/en"</kbd>): </p> <p> The document corpus for the given language. Documents are provided as <kbd class="kwd">GenericDoc</kbd>s. </p> <p> <kbd class="str">"clirmatrix/{doc_lang}/{bi139-base|bi139-full}/{query_lang}/{train|dev|test1|test2}"</kbd> (e.g., <kbd class="str">"clirmatrix/en/bi139-full/de/train"</kbd>): </p> <p> Documents are provided as <kbd class="kwd">GenericDoc</kbd>s, queries are provided as <kbd class="kwd">GenericQuery</kbd>s, and qrels are provided as <kbd class="kwd">TrecQrel</kbd>s. 
</p> <p> Supported languages are: af, als, am, an, ar, arz, ast, az, azb, ba, bar, be, bg, bn, bpy, br, bs, bug, ca, cdo, ce, ceb, ckb, cs, cv, cy, da, de, diq, el, eml, en, eo, es, et, eu, fa, fi, fo, fr, fy, ga, gd, gl, gu, he, hi, hr, hsb, ht, hu, hy, ia, id, ilo, io, is, it, ja, jv, ka, kk, kn, ko, ku, ky, la, lb, li, lmo, lt, lv, mai, mg, mhr, min, mk, ml, mn, mr, mrj, ms, my, mzn, nap, nds, ne, new, nl, nn, no, oc, or, os, pa, pl, pms, pnb, ps, pt, qu, ro, ru, sa, sah, scn, sco, sd, sh, si, simple, sk, sl, sq, sr, su, sv, sw, szl, ta, te, tg, th, tl, tr, tt, uk, ur, uz, vec, vi, vo, wa, war, wuu, xmf, yi, yo, zh </p> <p> <kbd class="str">"clirmatrix/{doc_lang}/multi8/{query_lang}/{train|dev|test1|test2}"</kbd> (e.g., <kbd class="str">"clirmatrix/en/multi8/de/train"</kbd>): </p> <p> Documents are provided as <kbd class="kwd">GenericDoc</kbd>s, queries are provided as <kbd class="kwd">GenericQuery</kbd>s, and qrels are provided as <kbd class="kwd">TrecQrel</kbd>s. Supported languages are: ar, de, en, es, fr, ja, ru, zh </p> <ul> <li><a href="https://www.aclweb.org/anthology/2020.emnlp-main.340">Paper</a></li> <li><a href="http://www.cs.jhu.edu/~shuosun/clirmatrix/">Data Website</a></li> </ul> ' bibtex_ids: ['Sun2020Clirmatrix'] ================================================ FILE: ir_datasets/docs/clueweb09.yaml ================================================ _: pretty_name: 'ClueWeb09' desc: ' <p> ClueWeb 2009 web document collection. Contains over 1B web pages, in 10 languages. </p> <p> The dataset is obtained for a fee from CMU, and is shipped as hard drives. More information is provided <a href="https://lemurproject.org/clueweb09/">here</a>. </p> <ul> <li><a href="https://lemurproject.org/clueweb09/">Document collection site</a></li> </ul> ' docs_instructions: &inst "docs available from CMU" data_access: ' <p> To use this dataset, you need a copy of <a href="https://lemurproject.org/clueweb09/">ClueWeb 2009</a>, provided by CMU. 
</p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Argeement". Otherwise, your organization will need to file the "Organizational agreement" and pay a fee to CMU to get a copy. The data are provided as hard drives that are shipped to you. </p> <p> Once you have the data, ir_datasets will need the directories that look like the following: </p> <ul> <li><kbd>ClueWeb09_English_1</kbd></li> <li><kbd>ClueWeb09_English_2</kbd></li> <li><kbd>...</kbd></li> <li><kbd>ClueWeb09_Arabic_1</kbd></li> <li><kbd>...</kbd></li> </ul> <p> ir_datasets expects the above directories to be copied/linked under <kbd>~/.ir_datasets/clueweb09/corpus</kbd>. </p> ' ar: desc: ' <p> Subset of ClueWeb09 with only Arabic-language documents. </p> ' docs_instructions: *inst zh: desc: ' <p> Subset of ClueWeb09 with only Chinese-language documents. </p> ' docs_instructions: *inst en: desc: ' <p> Subset of ClueWeb09 with only English-language documents. </p> ' docs_instructions: *inst fr: desc: ' <p> Subset of ClueWeb09 with only French-language documents. </p> ' docs_instructions: *inst de: desc: ' <p> Subset of ClueWeb09 with only German-language documents. </p> ' docs_instructions: *inst it: desc: ' <p> Subset of ClueWeb09 with only Italian-language documents. </p> ' docs_instructions: *inst ja: desc: ' <p> Subset of ClueWeb09 with only Japanese-language documents. </p> ' docs_instructions: *inst ko: desc: ' <p> Subset of ClueWeb09 with only Korean-language documents. </p> ' docs_instructions: *inst pt: desc: ' <p> Subset of ClueWeb09 with only Portuguese-language documents. </p> ' docs_instructions: *inst es: desc: ' <p> Subset of ClueWeb09 with only Spanish-language documents. </p> ' docs_instructions: *inst catb: desc: ' <p> Subset of ClueWeb09 with the first ~50 million English-language documents. Used as a smaller collection for TREC Web Track tasks. 
</p> ' docs_instructions: *inst trec-web-2009: desc: ' <p> The TREC Web Track 2009 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/web09.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec18/papers/WEB09.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2009TrecWeb'] trec-web-2010: desc: ' <p> The TREC Web Track 2010 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/web10.html">Shared task site</a></li> <li><a href="http://www-personal.umich.edu/~kevynct/pubs/trec-web-2014-overview.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2010TrecWeb'] trec-web-2011: desc: ' <p> The TREC Web Track 2011 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec20/papers/WEB.OVERVIEW.pdf">Shared task site</a></li> <li><a href="http://www-personal.umich.edu/~kevynct/pubs/trec-web-2014-overview.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2011TrecWeb'] trec-web-2012: desc: ' <p> The TREC Web Track 2012 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/web2012.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec21/papers/WEB12.overview.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2012TrecWeb'] trec-mq-2009: desc: ' <p> TREC 2009 Million Query track. 
</p> <ul> <li><a href="https://trec.nist.gov/data/million.query09.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec18/papers/MQ09OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Carterette2009MQ'] ================================================ FILE: ir_datasets/docs/clueweb12.yaml ================================================ _: pretty_name: 'ClueWeb12' desc: ' <p> ClueWeb 2012 web document collection. Contains 733M web pages. </p> <p> The dataset is obtained for a fee from CMU, and is shipped as hard drives. More information is provided <a href="https://lemurproject.org/clueweb12/">here</a>. </p> <ul> <li><a href="https://lemurproject.org/clueweb12/">Document collection site</a></li> <li><a href="http://boston.lti.cs.cmu.edu/clueweb12/">Dataset construction details</a></li> </ul> ' docs_instructions: &inst "docs available from CMU" data_access: ' <p> To use this dataset, you need a copy of <a href="https://lemurproject.org/clueweb12/">ClueWeb 2012</a>, provided by CMU. </p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Argeement". Otherwise, your organization will need to file the "Organizational agreement" and pay a fee to CMU to get a copy. The data are provided as hard drives that are shipped to you. </p> <p> Once you have the data, ir_datasets will need the directories that look like the following: </p> <ul> <li><kbd>ClueWeb12_00</kbd></li> <li><kbd>ClueWeb12_01</kbd></li> <li><kbd>...</kbd></li> </ul> <p> ir_datasets expects the above directories to be copied/linked under <kbd>~/.ir_datasets/clueweb12/corpus</kbd>. </p> ' b13: desc: ' <p> Official subset of the ClueWeb12 datasets with 52M web pages. </p> ' docs_instructions: *inst trec-web-2013: desc: ' <p> The TREC Web Track 2013 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. 
</p> <ul> <li><a href="https://trec.nist.gov/data/web2013.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec22/papers/WEB.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['CollinsThompson2013TrecWeb'] trec-web-2013/diversity: desc: ' <p> The TREC Web Track 2013 diverse ranking benchmark. Contains 50 queries with deep subtopic relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/web2013.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec22/papers/WEB.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['CollinsThompson2013TrecWeb'] trec-web-2014: desc: ' <p> The TREC Web Track 2014 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/web2014.html">Shared task site</a></li> <li><a href="http://www-personal.umich.edu/~kevynct/pubs/trec-web-2014-overview.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['CollinsThompson2014TrecWeb'] trec-web-2014/diversity: desc: ' <p> The TREC Web Track 2014 diverse ranking benchmark. Contains 50 queries with deep subtopic relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/web2014.html">Shared task site</a></li> <li><a href="http://www-personal.umich.edu/~kevynct/pubs/trec-web-2014-overview.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['CollinsThompson2014TrecWeb'] ntcir-www-1: desc: ' <p> The NTCIR-13 We Want Web (WWW) 1 ad-hoc ranking benchmark. Contains 100 queries with deep relevance judgments (avg 255 per query). Judgments aggregated from two assessors. Note that the qrels contain additional judgments from the NTCIR-14 CENTRE track. 
</p> <ul> <li><a href="http://www.thuir.cn/ntcirwww/">Shared task site</a></li> <li><a href="http://www.thuir.cn/ntcirwww/files/ntcir13wwwov.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Luo2017Www1'] ntcir-www-2: desc: ' <p> The NTCIR-14 We Want Web (WWW) 2 ad-hoc ranking benchmark. Contains 80 queries with deep relevance judgments (avg 345 per query). Judgments aggregated from two assessors. </p> <ul> <li><a href="http://www.thuir.cn/ntcirwww2/">Shared task site</a></li> <li><a href="http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings14/pdf/ntcir/01-NTCIR14-OV-WWW-MaoJ.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Mao2018OWww2'] ntcir-www-3: desc: ' <p> The NTCIR-15 We Want Web (WWW) 3 ad-hoc ranking benchmark. Contains 160 queries with deep relevance judgments (to be released). 80 of the queries are from <a class="ds-ref">clueweb12/b13/ntcir-www-2</a>. </p> <ul> <li><a href="http://sakailab.com/www3/">Shared task site</a></li> </ul> ' docs_instructions: *inst trec-misinfo-2019: desc: ' <p> The TREC Medical Misinformation 2019 dataset. </p> <ul> <li><a href="https://trec.nist.gov/data/misinfo2019.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.D.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Abualsaud2019TrecDecision'] clef-ehealth: desc: ' <p> The CLEF eHealth 2016-17 IR dataset. Contains consumer health queries and judgments containing trustworthiness and understandability scores, in addition to the normal relevance assessments. </p> <p> This dataset contains the combined 2016 and 2017 relevance judgments, since the same queries were used in the two year. The assessment year can be distinguished using iteration (2016 is iteration 0, 2017 is iteration 1). 
</p> <ul> <li><a href="https://sites.google.com/site/clefehealth2016/task-3">2016 shared task site</a></li> <li><a href="https://sites.google.com/site/clefehealth2017/task-3">2017 shared task site</a></li> <li><a href="http://ceur-ws.org/Vol-1609/16090015.pdf">2016 shared task paper</a></li> <li><a href="http://ceur-ws.org/Vol-1866/invited_paper_16.pdf">2017 shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] clef-ehealth/cs: desc: ' <p> The CLEF eHealth 2016-17 IR dataset, with queries professionally translated to Czech. See <a class="ds-ref">clueweb12/b13/clef-ehealth</a> for more details. </p> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] clef-ehealth/de: desc: ' <p> The CLEF eHealth 2016-17 IR dataset, with queries professionally translated to German. See <a class="ds-ref">clueweb12/b13/clef-ehealth</a> for more details. </p> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] clef-ehealth/fr: desc: ' <p> The CLEF eHealth 2016-17 IR dataset, with queries professionally translated to French. See <a class="ds-ref">clueweb12/b13/clef-ehealth</a> for more details. </p> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] clef-ehealth/hu: desc: ' <p> The CLEF eHealth 2016-17 IR dataset, with queries professionally translated to Hungarian. See <a class="ds-ref">clueweb12/b13/clef-ehealth</a> for more details. </p> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] clef-ehealth/pl: desc: ' <p> The CLEF eHealth 2016-17 IR dataset, with queries professionally translated to Polish. See <a class="ds-ref">clueweb12/b13/clef-ehealth</a> for more details. 
</p> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] clef-ehealth/sv: desc: ' <p> The CLEF eHealth 2016-17 IR dataset, with queries professionally translataed to Swedish. See <a class="ds-ref">clueweb12/b13/clef-ehealth</a> for more details. </p> ' docs_instructions: *inst bibtex_ids: ['Zuccon2016ClefEhealth', 'Palotti2017ClefEhealth'] ================================================ FILE: ir_datasets/docs/codec.yaml ================================================ _: pretty_name: 'CODEC' desc: ' <p> CODEC Document Ranking sub-task. </p> <ul> <li>Documents: curated web articles</li> <li>Queries: challenging, entity-focused queries</li> <li><a href="https://github.com/grill-lab/CODEC">Task Repository</a></li> <li>See also: <a class="ds-ref">kilt/codec</a>, the entity ranking subtask</li> </ul> ' docs_instructions: &inst "docs available from dataset authors" data_access: ' <p> To use this dataset, you need a copy the document corpus from <a href="https://github.com/grill-lab/CODEC">here</a>. </p> <p> The process involves emailing a dataset author, who will provide instructions for downloading the dataset. </p> <p> ir_datasets expects the source file to be copied/linked under <kbd>~/.ir_datasets/codec/v1/comets_documents.jsonl</kbd>. </p> ' bibtex_ids: ['mackie2022codec'] economics: desc: ' <p> Subset of <a class="ds-ref">codec</a> that only contains topics about economics. </p> ' docs_instructions: *inst bibtex_ids: ['mackie2022codec'] history: desc: ' <p> Subset of <a class="ds-ref">codec</a> that only contains topics about history. </p> ' docs_instructions: *inst bibtex_ids: ['mackie2022codec'] politics: desc: ' <p> Subset of <a class="ds-ref">codec</a> that only contains topics about politics. 
</p> ' docs_instructions: *inst bibtex_ids: ['mackie2022codec'] ================================================ FILE: ir_datasets/docs/codesearchnet.yaml ================================================ _: pretty_name: 'CodeSearchNet' desc: ' <p> A benchmark for semantic code search. Uses </p> <ul> <li>Documents: Code functions in python, java, go, php, ruby, and javascript</li> <li>Queries: Inferred from docstrings, or </li> <li><a href="https://arxiv.org/pdf/1909.09436.pdf">Dataset Paper</a></li> <li><a href="https://wandb.ai/github/codesearchnet/benchmark/leaderboard">Challenge Task Leaderboard</a></li> </ul> ' bibtex_ids: ['Husain2019CodeSearchNet'] train: desc: ' <p> Official train set, using queries inferred from docstrings. </p> ' bibtex_ids: ['Husain2019CodeSearchNet'] valid: desc: ' <p> Official validation set, using queries inferred from docstrings. </p> ' bibtex_ids: ['Husain2019CodeSearchNet'] test: desc: ' <p> Official test set, using queries inferred from docstrings. </p> ' bibtex_ids: ['Husain2019CodeSearchNet'] challenge: desc: ' <p> Official challenge set, with keyword queries and deep relevance assessments. </p> ' bibtex_ids: ['Husain2019CodeSearchNet'] ================================================ FILE: ir_datasets/docs/cord19.yaml ================================================ _: pretty_name: 'CORD-19' desc: ' <p> Collection of scientific articles related to COVID-19. </p> <p> Uses the 2020-07-16 version of the dataset, corresponding to the "complete" collection used for TREC COVID. </p> <p> Note that this version of the document collection only provides article meta-data. To get the full text, use <a class="ds-ref">cord19/fulltext</a>. </p> <ul> <li><a href="https://www.semanticscholar.org/cord19">Document collection site</a></li> </ul> ' bibtex_ids: ['Wang2020Cord19'] trec-covid: desc: ' <p> The Complete TREC COVID collection. Queries related to COVID-19, including deep relevance judgments. 
</p> <ul> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] fulltext: desc: ' <p> Version of <a class="ds-ref">cord19</a> dataset that includes article full texts. This dataset takes longer to load than the version that only includes article meta-data. </p> ' bibtex_ids: ['Wang2020Cord19'] fulltext/trec-covid: desc: ' <p> Version of <a class="ds-ref">cord19/trec-covid</a> dataset that includes article full texts. This dataset takes longer to load than the version that only includes article meta-data. </p> <p> Queries and qrels are the same as <a class="ds-ref">cord19/trec-covid</a>; it just uses the extended documents from <a class="ds-ref">cord19/fulltext</a>. </p> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] round1: desc: ' <p> Version of <a class="ds-ref">cord19</a> dataset from 2020-04-10 with only documents used for the first round of the TREC COVID shared task. </p> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] trec-covid/round1: desc: ' <p> Round 1 of the TREC COVID task. Includes 30 queries related to COVID-19. This uses the "2020-04-10" version of the collection. </p> <ul> <li><a href="https://ir.nist.gov/covidSubmit/round1.html">Round 1 Guidelines</a></li> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] trec-covid/round2: desc: ' <p> Round 2 of the TREC COVID task. Includes 35 queries related to COVID-19. This uses the "2020-05-01" version of the collection. </p> <p> Note that the qrels do not contain results from the prior round(s). Use the "complete" version for this setting (<a class="ds-ref">cord19/trec-covid</a>). 
</p> <ul> <li><a href="https://ir.nist.gov/covidSubmit/round2.html">Round 2 Guidelines</a></li> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] trec-covid/round3: desc: ' <p> Round 3 of the TREC COVID task. Includes 40 queries related to COVID-19. This uses the "2020-05-19" version of the collection. </p> <p> Note that the qrels do not contain results from the prior round(s). Use the "complete" version for this setting (<a class="ds-ref">cord19/trec-covid</a>). </p> <ul> <li><a href="https://ir.nist.gov/covidSubmit/round3.html">Round 3 Guidelines</a></li> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] trec-covid/round4: desc: ' <p> Round 4 of the TREC COVID task. Includes 45 queries related to COVID-19. This uses the "2020-06-19" version of the collection. </p> <p> Note that the qrels do not contain results from the prior round(s). Use the "complete" version for this setting (<a class="ds-ref">cord19/trec-covid</a>). </p> <ul> <li><a href="https://ir.nist.gov/covidSubmit/round4.html">Round 4 Guidelines</a></li> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] trec-covid/round5: desc: ' <p> Round 5 of the TREC COVID task. Includes 50 queries related to COVID-19. This uses the "2020-07-16" version of the collection. </p> <p> Note that the qrels do not contain results from the prior round(s). Use the "complete" version for this setting (<a class="ds-ref">cord19/trec-covid</a>). 
</p> <ul> <li><a href="https://ir.nist.gov/covidSubmit/round5.html">Round 5 Guidelines</a></li> <li><a href="https://ir.nist.gov/covidSubmit/index.html">Shared task site</a></li> <li><a href="https://ir.nist.gov/covidSubmit/papers/Forum_TRECCOVID1.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Voorhees2020TrecCovid', 'Wang2020Cord19'] ================================================ FILE: ir_datasets/docs/cranfield.yaml ================================================ _: pretty_name: 'Cranfield' desc: ' <p> A small corpus of 1,400 scientific abstracts. </p> <ul> <li>Documents: Scientific abstracts</li> <li>Queries: Natural language questions</li> <li><a href="http://ir.dcs.gla.ac.uk/resources/test_collections/cran/">Dataset Information</a></li> </ul> ' ================================================ FILE: ir_datasets/docs/csl.yaml ================================================ _: pretty_name: 'CSL' desc: ' <p> The CSL dataset, used for the TREC NeuCLIR technical document task. </p> ' trec-2023: desc: ' <p> The TREC NeuCLIR 2023 technical document task. </p> ' ================================================ FILE: ir_datasets/docs/disks45.yaml ================================================ _: pretty_name: 'TREC Disks 4 and 5' desc: ' <p> TREC Disks 4 and 5, including documents from the Financial Times, the Congressional Record, the Federal Register, the Foreign Broadcast Information Service, and the Los Angeles Times. </p> <p> This dataset is a placeholder for the complete collection, but at this time, only the version of the dataset without the Congressional Record (<a class="ds-ref">disks45/nocr</a>) is provided. </p> <ul> <li><a href="https://trec.nist.gov/data/cd45/">Information and access to TREC Disks 4 and 5.</a></li> </ul> ' docs_instructions: &inst "docs available from NIST" data_access: ' <p> To use this dataset, you need a copy of <a href="https://trec.nist.gov/data/cd45/index.html">TREC Disks 4 and 5</a>, provided by NIST. 
</p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Agreement". Otherwise, your organization will need to file the "Organizational agreement" with NIST. It can take some time to process, but you will end up with a password-protected download link. </p> <p> ir_datasets needs the following directories from the source: </p> <ul> <li><kbd>FBIS</kbd></li> <li><kbd>FR94</kbd></li> <li><kbd>FT</kbd></li> <li><kbd>LATIMES</kbd></li> </ul> <p> ir_datasets expects the above directories to be copied/linked under <kbd>~/.ir_datasets/disks45/corpus</kbd>. The source document files themselves can either be compressed or uncompressed (it seems they have been distributed both ways in the past.) If ir_datasets does not find the files it is expecting, it will raise an error. </p> ' nocr: desc: ' <p> A version of <a class="ds-ref">disks45</a> without the Congressional Record. This is the typical setting for tasks like TREC 7, TREC 8, and TREC Robust 2004. </p> ' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45'] nocr/trec-robust-2004: desc: ' <p> The TREC Robust retrieval task focuses on "improving the consistency of retrieval technology by focusing on poorly performing topics." </p> <p> The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. See details <a href="https://trec.nist.gov/data/cd45/index.html">here</a>. 
</p> <ul> <li>Documents: News articles</li> <li>Queries: keyword queries, descriptions, narratives</li> <li>Relevance: Deep judgments</li> <li><a href="https://trec.nist.gov/pubs/trec13/papers/ROBUST.OVERVIEW.pdf">Task Overview Paper</a></li> <li>See also: <a class="ds-ref">aquaint/trec-robust-2005</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO'] nocr/trec-robust-2004/fold1: desc: ' <p>Robust04 Fold 1 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO'] nocr/trec-robust-2004/fold2: desc: ' <p>Robust04 Fold 2 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO'] nocr/trec-robust-2004/fold3: desc: ' <p>Robust04 Fold 3 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO'] nocr/trec-robust-2004/fold4: desc: ' <p>Robust04 Fold 4 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO'] nocr/trec-robust-2004/fold5: desc: ' <p>Robust04 Fold 5 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO'] nocr/trec7: desc: ' <p> The TREC 7 Adhoc Retrieval track. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec7/papers/overview_7.pdf.gz">Task Overview Paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees1998Trec7'] nocr/trec8: desc: ' <p> The TREC 8 Adhoc Retrieval track. 
</p> <ul> <li><a href="https://trec.nist.gov/pubs/trec8/papers/overview_8.pdf">Task Overview Paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Voorhees1996Disks45', 'Voorhees1999Trec8'] ================================================ FILE: ir_datasets/docs/dpr-w100.yaml ================================================ _: pretty_name: 'DPR Wiki100' desc: ' <p> A wikipedia dump from 20 December, 2018, split into passages of 100 words. Used in experiments in the DPR paper (and other subsequent works) for retrieval experiments over Q&A collections. </p> <ul> <li><a href="https://arxiv.org/pdf/2004.04906.pdf">Dataset paper</a></li> <li><a href="https://github.com/facebookresearch/DPR">Repository</a></li> </ul> ' bibtex_ids: ['Karpukhin2020Dpr'] natural-questions/train: desc: ' <p> Training subset from the Natural Questions Q&A collection. This differs from the <a class="ds-ref">natural-questions/train</a> dataset in that it uses the full Wikipedia dump and additional filtering (described in the DPR paper) was applied. </p> <ul> <li>See also: <a class="ds-ref">natural-questions</a></li> </ul> ' bibtex_ids: ['Kwiatkowski2019Nq', 'Karpukhin2020Dpr'] natural-questions/dev: desc: ' <p> Dev subset from the Natural Questions Q&A collection. This differs from the <a class="ds-ref">natural-questions/dev</a> dataset in that it uses the full Wikipedia dump and additional filtering (described in the DPR paper) was applied. </p> <ul> <li>See also: <a class="ds-ref">natural-questions</a></li> </ul> ' bibtex_ids: ['Kwiatkowski2019Nq', 'Karpukhin2020Dpr'] trivia-qa/train: desc: ' <p> Training subset from the Trivia QA dataset. Differing from the official Trivia QA collection, this uses the DPR Wikipedia dump as the source collection. Refer to the DPR paper for more details. 
</p> <ul> <li><a href="https://www.aclweb.org/anthology/P17-1147.pdf">Dataset paper</a></li> <li><a href="http://nlp.cs.washington.edu/triviaqa/">Dataset website</a></li> </ul> ' bibtex_ids: ['Joshi2017TriviaQA', 'Karpukhin2020Dpr'] trivia-qa/dev: desc: ' <p> Dev subset from the Trivia QA dataset. Differing from the official Trivia QA collection, this uses the DPR Wikipedia dump as the source collection. Refer to the DPR paper for more details. </p> <ul> <li><a href="https://www.aclweb.org/anthology/P17-1147.pdf">Dataset paper</a></li> <li><a href="http://nlp.cs.washington.edu/triviaqa/">Dataset website</a></li> </ul> ' bibtex_ids: ['Joshi2017TriviaQA', 'Karpukhin2020Dpr'] ================================================ FILE: ir_datasets/docs/gov.yaml ================================================ _: pretty_name: 'GOV' desc: ' <p> GOV web document collection. Used for early TREC Web Tracks. Not to be confused with <a class="ds-ref">gov2</a>. </p> <p> The dataset is obtained for a fee from UoG, and is shipped as a hard drive. More information is provided <a href="http://ir.dcs.gla.ac.uk/test_collections/access_to_data.html">here</a>. </p> <ul> <li><a href="http://ir.dcs.gla.ac.uk/test_collections/gov2-summary.htm">Document collection site</a></li> </ul> ' docs_instructions: &inst "docs available from UoG" data_access: ' <p> To use this dataset, you need a copy of <a href="http://ir.dcs.gla.ac.uk/test_collections/access_to_data.html">GOV</a>, provided by the University of Glasgow. </p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Agreement". Otherwise, your organization will need to file the "Organizational agreement" and pay a fee to UoG to get a copy. The data are provided as hard drives that are shipped to you. 
</p> <p> Once you have the data, ir_datasets will need the directories that look like the following: </p> <ul> <li><kbd>G00</kbd></li> <li><kbd>G01</kbd></li> <li><kbd>...</kbd></li> </ul> <p> ir_datasets expects the above directories to be copied/linked under <kbd>~/.ir_datasets/gov/corpus</kbd>. </p> ' trec-web-2002: desc: ' <p> The TREC Web Track 2002 ad-hoc ranking benchmark. </p> <ul> <li><a href="https://trec.nist.gov/data/t11.web.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec11/papers/WEB.OVER.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Craswell2002TrecWeb'] trec-web-2002/named-page: desc: ' <p> The TREC Web Track 2002 named page ranking benchmark. </p> <ul> <li><a href="https://trec.nist.gov/data/t11.web.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec11/papers/WEB.OVER.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Craswell2002TrecWeb'] trec-web-2003: desc: ' <p> The TREC Web Track 2003 ad-hoc ranking benchmark. </p> <ul> <li><a href="https://trec.nist.gov/data/t12.web.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec12/papers/WEB.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Craswell2003TrecWeb'] trec-web-2003/named-page: desc: ' <p> The TREC Web Track 2003 named page ranking benchmark. </p> <ul> <li><a href="https://trec.nist.gov/data/t12.web.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec12/papers/WEB.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Craswell2003TrecWeb'] trec-web-2004: desc: ' <p> The TREC Web Track 2004 ad-hoc ranking benchmark. </p> <p> Queries include a combination of topic distillation, homepage finding, and named page finding. 
</p> <ul> <li><a href="https://trec.nist.gov/data/t13.web.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec12/papers/WEB.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Craswell2004TrecWeb'] ================================================ FILE: ir_datasets/docs/gov2.yaml ================================================ _: pretty_name: 'GOV2' desc: ' <p> GOV2 web document collection. Used for the TREC Terabyte Track. </p> <p> The dataset is obtained for a fee from UoG, and is shipped as a hard drive. More information is provided <a href="http://ir.dcs.gla.ac.uk/test_collections/access_to_data.html">here</a>. </p> <ul> <li><a href="http://ir.dcs.gla.ac.uk/test_collections/gov2-summary.htm">Document collection site</a></li> </ul> ' docs_instructions: &inst "docs available from UoG" data_access: ' <p> To use this dataset, you need a copy of <a href="http://ir.dcs.gla.ac.uk/test_collections/access_to_data.html">GOV2</a>, provided by the University of Glasgow. </p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Agreement". Otherwise, your organization will need to file the "Organizational agreement" and pay a fee to UoG to get a copy. The data are provided as hard drives that are shipped to you. </p> <p> Once you have the data, ir_datasets will need the <kbd>GOV2_data</kbd> directory. </p> <p> ir_datasets expects the above directory to be copied/linked under <kbd>~/.ir_datasets/gov/corpus</kbd>. </p> ' trec-tb-2004: desc: ' <p> The TREC Terabyte Track 2004 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. 
</p> <ul> <li><a href="https://trec.nist.gov/data/terabyte04.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec13/papers/TERA.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2004TrecTerabyte'] trec-tb-2005: desc: ' <p> The TREC Terabyte Track 2005 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/terabyte05.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec14/papers/TERABYTE.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2005TrecTerabyte'] trec-tb-2005/named-page: desc: ' <p> The TREC Terabyte Track 2005 named page ranking benchmark. Contains 252 queries with titles that resemble bookmark labels. Relevance judgments include near-duplicate pages and other pages that may satisfy the bookmark label. </p> <ul> <li><a href="https://trec.nist.gov/data/terabyte05.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec14/papers/TERABYTE.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2005TrecTerabyte'] trec-tb-2005/efficiency: desc: ' <p> The TREC Terabyte Track 2005 efficiency ranking benchmark. Contains 50,000 queries from a search engine, including the 50 topics from <a class="ds-ref">gov2/trec-tb-2005</a>. Only the 50 topics have judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/terabyte05.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec14/papers/TERABYTE.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Clarke2005TrecTerabyte'] trec-tb-2006: desc: ' <p> The TREC Terabyte Track 2006 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments. 
</p> <ul> <li><a href="https://trec.nist.gov/data/terabyte06.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec15/papers/TERA06.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/named-page: desc: ' <p> The TREC Terabyte Track 2006 named page ranking benchmark. Contains 181 queries with titles that resemble bookmark labels. Relevance judgments include near-duplicate pages and other pages that may satisfy the bookmark label.</p> <ul> <li><a href="https://trec.nist.gov/data/terabyte06.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec15/papers/TERA06.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/efficiency: desc: ' <p> The TREC Terabyte Track 2006 efficiency ranking benchmark. Contains 100,000 queries from a search engine, including the 50 topics from <a class="ds-ref">gov2/trec-tb-2006</a>. Only the 50 topics have judgments. </p> <ul> <li><a href="https://trec.nist.gov/data/terabyte05.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec14/papers/TERABYTE.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/efficiency/10k: desc: ' <p> Small stream from <a class="ds-ref">gov2/trec-tb-2006/efficiency</a>, with 10,000 queries. </p> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/efficiency/stream1: desc: ' <p> Stream 1 of <a class="ds-ref">gov2/trec-tb-2006/efficiency</a> (25,000 queries). </p> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/efficiency/stream2: desc: ' <p> Stream 2 of <a class="ds-ref">gov2/trec-tb-2006/efficiency</a> (25,000 queries). 
</p> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/efficiency/stream3: desc: ' <p> Stream 3 of <a class="ds-ref">gov2/trec-tb-2006/efficiency</a> (25,000 queries). </p> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-tb-2006/efficiency/stream4: desc: ' <p> Stream 4 of <a class="ds-ref">gov2/trec-tb-2006/efficiency</a> (25,000 queries). </p> ' docs_instructions: *inst bibtex_ids: ['Buttcher2006TrecTerabyte'] trec-mq-2007: desc: ' <p> TREC 2007 Million Query track. </p> <ul> <li><a href="https://trec.nist.gov/data/million.query07.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec16/papers/1MQ.OVERVIEW16.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Allen2007MQ'] trec-mq-2008: desc: ' <p> TREC 2008 Million Query track. </p> <ul> <li><a href="https://trec.nist.gov/data/million.query08.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec17/papers/MQ.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Allen2008MQ'] ================================================ FILE: ir_datasets/docs/hc4.yaml ================================================ _: pretty_name: 'HC4 (HLTCOE CLIR Common-Crawl Collection)' desc: ' <p> HC4 is a new suite of test collections for ad hoc Cross-Language Information Retrieval (CLIR), with Common Crawl News documents in Chinese, Persian, and Russian, topics in English and in the document languages, and graded relevance judgments. </p> <ul> <li>Documents: Web pages from Common Crawl in Chinese, Persian, and Russian.</li> <li> Queries: English TREC-style title/description queries. Narrative field contains an example passage for each relevance level. Human and machine translation of the titles and descriptions in the target language (i.e., document language) are provided in the query object. 
(Titles and descriptions are machine-translated into all three target languages even in the languages that they are not assessed to facilitate CLIR other than English-to-X pairs, e.g., Persian-to-Chinese. Please refer to the original dataset repository for these additional resources.) </li> <li>Report: Each query comes with an English report that is designed to be written by professional searchers prior to the search.</li> <li>Qrels: Documents are judged in three levels of relevance. Please refer to the dataset paper for the full definition of the levels. </li> <li><a href="https://github.com/hltcoe/HC4">Repository</a></li> <li><a href="https://arxiv.org/abs/2201.09992">Dataset Paper</a></li> </ul>' bibtex_ids: ['Lawrie2022HC4'] data_access: ' <p> To access the documents of this dataset, you will need to download the documents from Common Crawl. The scripts for downloading and validating the documents are in <a href="https://github.com/hltcoe/HC4">HLTCOE/HC4</a>. Please use the following command to download the documents: </p> <code> git clone https://github.com/hltcoe/HC4<br/> cd HC4<br/> pip install -r requirements.txt<br/> python download_documents.py --storage ~/.ir_datasets/hc4/ \ <br/> --zho ./resources/hc4/zho/ids.jsonl.gz \ <br/> --fas ./resources/hc4/fas/ids.jsonl.gz \ <br/> --rus ./resources/hc4/rus/ids.*.jsonl.gz \ <br/> --jobs {number of process}<br/> </code> <p> After download, please also post-process the downloaded file to verify all and only specified documents are downloaded, and modify the ordering of the collection to match the original specified ordering in the id files. </p> <code> for lang in zho fas rus; do <br/>   python fix_document_order.py --hc4_file ~/.ir_datasets/hc4/$lang/hc4_docs.jsonl \ <br/>    --id_file ./resources/hc4/$lang/ids*.jsonl.gz \ <br/>    --check_hash <br/> done </code> <p> You can also store the documents in another directory and create a soft link for <kbd>~/.ir_datasets/hc4/</kbd>. 
</p> ' zh: desc: ' <p> The Chinese collection contains English queries and Chinese documents for retrieval. Human and machine translated queries are provided in the query object for running monolingual retrieval or cross-language retrieval assuming the machine query translation into Chinese is available. </p> ' docs_instructions: &inst "docs available using https://github.com/hltcoe/HC4 package" bibtex_ids: ['Lawrie2022HC4'] zh/train: desc: ' <p> Train split of <a class="ds-ref">hc4/zh</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] zh/dev: desc: ' <p> Development split of <a class="ds-ref">hc4/zh</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] zh/test: desc: ' <p> Test split of <a class="ds-ref">hc4/zh</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] fa: desc: ' <p> The Persian collection contains English queries and Persian documents for retrieval. Human and machine translated queries are provided in the query object for running monolingual retrieval or cross-language retrieval assuming the machine query translation into Persian is available. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] fa/train: desc: ' <p> Train split of <a class="ds-ref">hc4/fa</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] fa/dev: desc: ' <p> Development split of <a class="ds-ref">hc4/fa</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] fa/test: desc: ' <p> Test split of <a class="ds-ref">hc4/fa</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] ru: desc: ' <p> The Russian collection contains English queries and Russian documents for retrieval. Human and machine translated queries are provided in the query object for running monolingual retrieval or cross-language retrieval assuming the machine query translation into Russian is available. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] ru/train: desc: ' <p> Train split of <a class="ds-ref">hc4/ru</a>. 
</p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] ru/dev: desc: ' <p> Development split of <a class="ds-ref">hc4/ru</a>. </p> ' docs_instructions: *inst bibtex_ids: ['Lawrie2022HC4'] ru/test: desc: ' <p> Test split of <a class="ds-ref">hc4/ru</a>. </p> ' bibtex_ids: ['Lawrie2022HC4'] ================================================ FILE: ir_datasets/docs/highwire.yaml ================================================ _: pretty_name: 'Highwire (TREC Genomics 2006-07)' desc: ' <p> Medical document collection from <a href="https://www.highwirepress.com/">Highwire Press</a>. Includes 162,259 scientific articles from 49 journals. </p> <p> This dataset is used for the TREC 2006-07 TREC Genomics track. </p> <p> Note that these documents are split into passages based on paragraph tags in the HTML. </p> <ul> <li>Documents: Biomedical journal articles</li> <li><a href="https://dmice.ohsu.edu/trec-gen/2006data.html#docs">Information about document collection</a></li> </ul> ' trec-genomics-2006: desc: ' <p> The TREC Genomics Track 2006 benchmark. Contains 28 queries with passage-level relevance judgments. </p> <ul> <li>Documents: Biomedical journal articles</li> <li>Queries: Natural language questions</li> <li>Qrels: deep, by passage</li> <li><a href="https://dmice.ohsu.edu/trec-gen/2006data.html">Shared task data site</a></li> <li><a href="https://trec.nist.gov/pubs/trec15/papers/GEO06.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Hersh2006TrecGenomics'] trec-genomics-2007: desc: ' <p> The TREC Genomics Track 2007 benchmark. Contains 36 queries with passage-level relevance judgments. 
</p> <ul> <li>Documents: Biomedical journal articles</li> <li>Queries: Natural language questions</li> <li>Qrels: deep, by passage</li> <li><a href="https://dmice.ohsu.edu/trec-gen/2007data.html">Shared task data site</a></li> <li><a href="https://dmice.ohsu.edu/hersh/trec-07-genomics.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Hersh2007TrecGenomics'] ================================================ FILE: ir_datasets/docs/istella22.yaml ================================================ _: pretty_name: 'Istella22' desc: ' <p> The Istella22 dataset facilitates comparisons between traditional and neural learning-to-rank by including query and document text along with LTR features (not included in ir_datasets). </p> <p> Note that to use the dataset, you must <b>read and accept</b> the <a href="https://www.istella.ai/dataset/Istella22-LicenseAgreement.txt">Istella22 License Agreement</a>. By using the dataset, you agree to be bound by the terms of the license: the <b>Istella dataset is solely for non-commercial use</b>. </p> <ul> <li><a href="https://dl.acm.org/doi/abs/10.1145/3477495.3531740">Paper</a></li> <li><a href="https://istella.ai/data/istella22-dataset/">Website</a></li> </ul> ' bibtex_ids: ['Dato2022Istella'] test: desc: ' <p> Official test query set. </p> ' test/fold1: desc: ' <p> Fold 1 of the official test query set. </p> ' test/fold2: desc: ' <p> Fold 2 of the official test query set. </p> ' test/fold3: desc: ' <p> Fold 3 of the official test query set. </p> ' test/fold4: desc: ' <p> Fold 4 of the official test query set. </p> ' test/fold5: desc: ' <p> Fold 5 of the official test query set. </p> ' ================================================ FILE: ir_datasets/docs/kilt.yaml ================================================ _: pretty_name: 'KILT' desc: ' <p> KILT is a corpus used for various "knowledge intensive language tasks". 
</p> <ul> <li>Documents: Wikipedia articles</li> <li><a href="https://github.com/facebookresearch/KILT">Repository</a></li> <li><a href="https://arxiv.org/abs/2009.02252">Paper</a></li> <li><a href="https://ai.facebook.com/tools/kilt/">Leaderboard</a></li> </ul> ' bibtex_ids: ['petroni-etal-2021-kilt'] codec: desc: ' <p> CODEC Entity Ranking sub-task. </p> <ul> <li><a href="https://github.com/grill-lab/CODEC">Task Repository</a></li> <li>See also: <a class="ds-ref">codec</a>, the document ranking subtask</li> </ul> ' bibtex_ids: ['mackie2022codec'] codec/economics: desc: ' <p> Subset of <a class="ds-ref">kilt/codec</a> that only contains topics about economics. </p> ' bibtex_ids: ['mackie2022codec'] codec/history: desc: ' <p> Subset of <a class="ds-ref">kilt/codec</a> that only contains topics about history. </p> ' bibtex_ids: ['mackie2022codec'] codec/politics: desc: ' <p> Subset of <a class="ds-ref">kilt/codec</a> that only contains topics about politics. </p> ' bibtex_ids: ['mackie2022codec'] ================================================ FILE: ir_datasets/docs/lotte.yaml ================================================ _: pretty_name: 'LoTTE' bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> LoTTE (Long-Tail Topic-stratified Evaluation) is a set of test collections focused on out-of-domain evaluation. It consists of data from several StackExchanges, with relevance assumed by either by upvotes (at least 1) or being selected as the accepted answer by the question''s author. </p> <p> Note that the dev and test corpora are disjoint to avoid leakage. 
</p> <ul> <li>Documents: Answers to StackExchange questions</li> <li>Queries: Natural language questions</li> <li><a href="https://arxiv.org/pdf/2112.01488.pdf">Dataset Paper</a></li> </ul> ' lifestyle/dev: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from lifestyle-focused forums, including bicycles, coffee, crafts, diy, gardening, lifehacks, mechanics, music, outdoors, parenting, pets, sports, and travel. </p> ' lifestyle/dev/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/lifestyle/dev</a>. </p> ' lifestyle/dev/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/lifestyle/dev</a>. </p> ' lifestyle/test: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Queries and answers from lifestyle-focused forums, including bicycles, coffee, crafts, diy, gardening, lifehacks, mechanics, music, outdoors, parenting, pets, sports, and travel. </p> ' lifestyle/test/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/lifestyle/test</a>. </p> ' lifestyle/test/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/lifestyle/test</a>. </p> ' pooled/dev: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Combined version of <a class="ds-ref">lotte/lifestyle/dev</a>, <a class="ds-ref">lotte/recreation/dev</a>, <a class="ds-ref">lotte/science/dev</a>, <a class="ds-ref">lotte/technology/dev</a>, and <a class="ds-ref">lotte/writing/dev</a>. </p> ' pooled/dev/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/pooled/dev</a>. 
</p> ' pooled/dev/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/pooled/dev</a>. </p> ' pooled/test: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Combined version of <a class="ds-ref">lotte/lifestyle/test</a>, <a class="ds-ref">lotte/recreation/test</a>, <a class="ds-ref">lotte/science/test</a>, <a class="ds-ref">lotte/technology/test</a>, and <a class="ds-ref">lotte/writing/test</a>. </p> ' pooled/test/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/pooled/test</a>. </p> ' pooled/test/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/pooled/test</a>. </p> ' recreation/dev: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from recreation-focused forums, including anime, boardgames, gaming, movies, photo, rpg, and scifi. </p> ' recreation/dev/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/recreation/dev</a>. </p> ' recreation/dev/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/recreation/dev</a>. </p> ' recreation/test: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from recreation-focused forums, including anime, boardgames, gaming, movies, photo, rpg, and scifi. </p> ' recreation/test/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/recreation/test</a>. </p> ' recreation/test/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/recreation/test</a>. 
</p> ' science/dev: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from science-focused forums, including academia, astronomy, biology, chemistry, datascience, earthscience, engineering, math, philosophy, physics, and stats. </p> ' science/dev/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/science/dev</a>. </p> ' science/dev/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/science/dev</a>. </p> ' science/test: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from science-focused forums, including academia, astronomy, biology, chemistry, datascience, earthscience, engineering, math, philosophy, physics, and stats. </p> ' science/test/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/science/test</a>. </p> ' science/test/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/science/test</a>. </p> ' technology/dev: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from technology-focused forums, including android, apple, askubuntu, electronics, networkengineering, security, serverfault, softwareengineering, superuser, unix, and webapps. </p> ' technology/dev/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/technology/dev</a>. </p> ' technology/dev/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/technology/dev</a>. 
</p> ' technology/test: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from technology-focused forums, including android, apple, askubuntu, electronics, networkengineering, security, serverfault, softwareengineering, superuser, unix, and webapps. </p> ' technology/test/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/technology/test</a>. </p> ' technology/test/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/technology/test</a>. </p> ' writing/dev: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from writing-focused forums, including ell, english, linguistics, literature, worldbuilding, and writing. </p> ' writing/dev/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/writing/dev</a>. </p> ' writing/dev/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/writing/dev</a>. </p> ' writing/test: bibtex_ids: ['Santhanam2021ColBERTv2'] desc: ' <p> Answers from writing-focused forums, including ell, english, linguistics, literature, worldbuilding, and writing. </p> ' writing/test/forum: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Forum queries for <a class="ds-ref">lotte/writing/test</a>. </p> ' writing/test/search: bibtex_ids: ['Santhanam2021ColBERTv2'] official_measures: ['Success@5'] desc: ' <p> Search queries for <a class="ds-ref">lotte/writing/test</a>. </p> ' ================================================ FILE: ir_datasets/docs/medline.yaml ================================================ _: pretty_name: 'Medline' desc: ' <p> Medical articles from <a href="https://www.nlm.nih.gov/bsd/medline.html">Medline</a>. 
This collection was used by TREC Genomics 2004-05 (2004 version of dataset) and by TREC Precision Medicine 2017-18 (2017 version). </p> ' 2004: desc: ' <p> 3M Medline articles including titles and abstracts, used for the TREC 2004-05 Genomics track. </p> <ul> <li>Documents: Biomedical article titles and abstracts</li> <li><a href="https://dmice.ohsu.edu/trec-gen/2004data.html">Information about document collection</a></li> </ul> ' trec-genomics-2004: desc: ' <p> The TREC Genomics Track 2004 benchmark. Contains 50 queries with article-level relevance judgments. </p> <ul> <li>Documents: Biomedical article titles and abstracts</li> <li>Queries: Natural language questions</li> <li>Qrels: deep, graded</li> <li><a href="https://dmice.ohsu.edu/trec-gen/2004data.html">Shared task data site</a></li> <li><a href="https://trec.nist.gov/pubs/trec13/papers/GEO.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Hersh2004TrecGenomics'] trec-genomics-2005: desc: ' <p> The TREC Genomics Track 2005 benchmark. Contains 50 queries with article-level relevance judgments. </p> <ul> <li>Documents: Biomedical article titles and abstracts</li> <li>Queries: Natural language questions</li> <li>Qrels: deep, graded</li> <li><a href="https://dmice.ohsu.edu/trec-gen/2005data.html">Shared task data site</a></li> <li><a href="https://trec.nist.gov/pubs/trec14/papers/GEO.OVERVIEW.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Hersh2005TrecGenomics'] 2017: desc: ' <p> 26M Medline and AACR/ASCO Proceedings articles including titles and abstracts. This collection is used for the TREC 2017-18 TREC Precision Medicine track. </p> <ul> <li>Documents: Biomedical article titles and abstracts</li> <li><a href="http://www.trec-cds.org/2017.html">Information about document collection</a></li> </ul> ' trec-pm-2017: desc: ' <p> The TREC Precision Medicine (PM) Track 2017 benchmark. Contains 30 queries containing disease, gene, and target demographic information. 
</p> <ul> <li>Documents: Biomedical article titles and abstracts</li> <li>Queries: Specific to TREC PM information need</li> <li>Qrels: deep, graded</li> <li><a href="http://www.trec-cds.org/2017.html">Shared task data site</a></li> <li><a href="https://trec.nist.gov/pubs/trec26/papers/Overview-PM.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Roberts2017TrecPm'] trec-pm-2018: desc: ' <p> The TREC Precision Medicine (PM) Track 2018 benchmark. Contains 50 queries containing disease, gene, and target demographic information. </p> <ul> <li>Documents: Biomedical article titles and abstracts</li> <li>Queries: Specific to TREC PM information need</li> <li>Qrels: deep, graded</li> <li><a href="http://www.trec-cds.org/2018.html">Shared task data site</a></li> <li><a href="https://trec.nist.gov/pubs/trec27/papers/Overview-PM.pdf">Shared task paper</a></li> </ul> ' bibtex_ids: ['Roberts2018TrecPm'] ================================================ FILE: ir_datasets/docs/miracl.yaml ================================================ _: pretty_name: 'MIRACL' desc: ' <p> MIRACL is a multilingual adhoc retrieval dataset covering 18 languages. The document corpora are based on Wikipedia dumps, which are split into passages. </p> <ul> <li><a href="https://arxiv.org/pdf/2210.09984.pdf">Dataset paper</a></li> <li><a href="https://project-miracl.github.io/">Website</a></li> </ul> ' bibtex_ids: ['Zhang2022Miracl'] ar: desc: ' <p> The Arabic corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] ar/train: desc: ' <p> The train set for Arabic. </p> ' bibtex_ids: ['Zhang2022Miracl'] ar/dev: desc: ' <p> The dev set for Arabic. </p> ' bibtex_ids: ['Zhang2022Miracl'] ar/test-a: desc: ' <p> The held-out test set (version a) for Arabic. </p> ' bibtex_ids: ['Zhang2022Miracl'] ar/test-b: desc: ' <p> The held-out test set (version b) for Arabic. </p> ' bibtex_ids: ['Zhang2022Miracl'] bn: desc: ' <p> The Bengali corpus. 
</p> ' bibtex_ids: ['Zhang2022Miracl'] bn/train: desc: ' <p> The train set for Bengali. </p> ' bibtex_ids: ['Zhang2022Miracl'] bn/dev: desc: ' <p> The dev set for Bengali. </p> ' bibtex_ids: ['Zhang2022Miracl'] bn/test-a: desc: ' <p> The held-out test set (version a) for Bengali. </p> ' bibtex_ids: ['Zhang2022Miracl'] bn/test-b: desc: ' <p> The held-out test set (version b) for Bengali. </p> ' bibtex_ids: ['Zhang2022Miracl'] de: desc: ' <p> The German corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] de/dev: desc: ' <p> The dev set for German. </p> ' bibtex_ids: ['Zhang2022Miracl'] de/test-b: desc: ' <p> The held-out test set (version b) for German. </p> ' bibtex_ids: ['Zhang2022Miracl'] en: desc: ' <p> The English corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] en/train: desc: ' <p> The train set for English. </p> ' bibtex_ids: ['Zhang2022Miracl'] en/dev: desc: ' <p> The dev set for English. </p> ' bibtex_ids: ['Zhang2022Miracl'] en/test-a: desc: ' <p> The held-out test set (version a) for English. </p> ' bibtex_ids: ['Zhang2022Miracl'] en/test-b: desc: ' <p> The held-out test set (version b) for English. </p> ' bibtex_ids: ['Zhang2022Miracl'] es: desc: ' <p> The Spanish corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] es/train: desc: ' <p> The train set for Spanish. </p> ' bibtex_ids: ['Zhang2022Miracl'] es/dev: desc: ' <p> The dev set for Spanish. </p> ' bibtex_ids: ['Zhang2022Miracl'] es/test-b: desc: ' <p> The held-out test set (version b) for Spanish. </p> ' bibtex_ids: ['Zhang2022Miracl'] fa: desc: ' <p> The Persian corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] fa/train: desc: ' <p> The train set for Persian. </p> ' bibtex_ids: ['Zhang2022Miracl'] fa/dev: desc: ' <p> The dev set for Persian. </p> ' bibtex_ids: ['Zhang2022Miracl'] fa/test-b: desc: ' <p> The held-out test set (version b) for Persian. </p> ' bibtex_ids: ['Zhang2022Miracl'] fi: desc: ' <p> The Finnish corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] fi/train: desc: ' <p> The train set for Finnish. 
</p> ' bibtex_ids: ['Zhang2022Miracl'] fi/dev: desc: ' <p> The dev set for Finnish. </p> ' bibtex_ids: ['Zhang2022Miracl'] fi/test-a: desc: ' <p> The held-out test set (version a) for Finnish. </p> ' bibtex_ids: ['Zhang2022Miracl'] fi/test-b: desc: ' <p> The held-out test set (version b) for Finnish. </p> ' bibtex_ids: ['Zhang2022Miracl'] fr: desc: ' <p> The French corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] fr/train: desc: ' <p> The train set for French. </p> ' bibtex_ids: ['Zhang2022Miracl'] fr/dev: desc: ' <p> The dev set for French. </p> ' bibtex_ids: ['Zhang2022Miracl'] fr/test-b: desc: ' <p> The held-out test set (version b) for French. </p> ' bibtex_ids: ['Zhang2022Miracl'] hi: desc: ' <p> The Hindi corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] hi/train: desc: ' <p> The train set for Hindi. </p> ' bibtex_ids: ['Zhang2022Miracl'] hi/dev: desc: ' <p> The dev set for Hindi. </p> ' bibtex_ids: ['Zhang2022Miracl'] hi/test-b: desc: ' <p> The held-out test set (version b) for Hindi. </p> ' bibtex_ids: ['Zhang2022Miracl'] id: desc: ' <p> The Indonesian corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] id/train: desc: ' <p> The train set for Indonesian. </p> ' bibtex_ids: ['Zhang2022Miracl'] id/dev: desc: ' <p> The dev set for Indonesian. </p> ' bibtex_ids: ['Zhang2022Miracl'] id/test-a: desc: ' <p> The held-out test set (version a) for Indonesian. </p> ' bibtex_ids: ['Zhang2022Miracl'] id/test-b: desc: ' <p> The held-out test set (version b) for Indonesian. </p> ' bibtex_ids: ['Zhang2022Miracl'] ja: desc: ' <p> The Japanese corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] ja/train: desc: ' <p> The train set for Japanese. </p> ' bibtex_ids: ['Zhang2022Miracl'] ja/dev: desc: ' <p> The dev set for Japanese. </p> ' bibtex_ids: ['Zhang2022Miracl'] ja/test-a: desc: ' <p> The held-out test set (version a) for Japanese. </p> ' bibtex_ids: ['Zhang2022Miracl'] ja/test-b: desc: ' <p> The held-out test set (version b) for Japanese. 
</p> ' bibtex_ids: ['Zhang2022Miracl'] ko: desc: ' <p> The Korean corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] ko/train: desc: ' <p> The train set for Korean. </p> ' bibtex_ids: ['Zhang2022Miracl'] ko/dev: desc: ' <p> The dev set for Korean. </p> ' bibtex_ids: ['Zhang2022Miracl'] ko/test-a: desc: ' <p> The held-out test set (version a) for Korean. </p> ' bibtex_ids: ['Zhang2022Miracl'] ko/test-b: desc: ' <p> The held-out test set (version b) for Korean. </p> ' bibtex_ids: ['Zhang2022Miracl'] ru: desc: ' <p> The Russian corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] ru/train: desc: ' <p> The train set for Russian. </p> ' bibtex_ids: ['Zhang2022Miracl'] ru/dev: desc: ' <p> The dev set for Russian. </p> ' bibtex_ids: ['Zhang2022Miracl'] ru/test-a: desc: ' <p> The held-out test set (version a) for Russian. </p> ' bibtex_ids: ['Zhang2022Miracl'] ru/test-b: desc: ' <p> The held-out test set (version b) for Russian. </p> ' bibtex_ids: ['Zhang2022Miracl'] sw: desc: ' <p> The Swahili corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] sw/train: desc: ' <p> The train set for Swahili. </p> ' bibtex_ids: ['Zhang2022Miracl'] sw/dev: desc: ' <p> The dev set for Swahili. </p> ' bibtex_ids: ['Zhang2022Miracl'] sw/test-a: desc: ' <p> The held-out test set (version a) for Swahili. </p> ' bibtex_ids: ['Zhang2022Miracl'] sw/test-b: desc: ' <p> The held-out test set (version b) for Swahili. </p> ' bibtex_ids: ['Zhang2022Miracl'] te: desc: ' <p> The Telugu corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] te/train: desc: ' <p> The train set for Telugu. </p> ' bibtex_ids: ['Zhang2022Miracl'] te/dev: desc: ' <p> The dev set for Telugu. </p> ' bibtex_ids: ['Zhang2022Miracl'] te/test-a: desc: ' <p> The held-out test set (version a) for Telugu. </p> ' bibtex_ids: ['Zhang2022Miracl'] te/test-b: desc: ' <p> The held-out test set (version b) for Telugu. </p> ' bibtex_ids: ['Zhang2022Miracl'] th: desc: ' <p> The Thai corpus. 
</p> ' bibtex_ids: ['Zhang2022Miracl'] th/train: desc: ' <p> The train set for Thai. </p> ' bibtex_ids: ['Zhang2022Miracl'] th/dev: desc: ' <p> The dev set for Thai. </p> ' bibtex_ids: ['Zhang2022Miracl'] th/test-a: desc: ' <p> The held-out test set (version a) for Thai. </p> ' bibtex_ids: ['Zhang2022Miracl'] th/test-b: desc: ' <p> The held-out test set (version b) for Thai. </p> ' bibtex_ids: ['Zhang2022Miracl'] yo: desc: ' <p> The Yoruba corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] yo/dev: desc: ' <p> The dev set for Yoruba. </p> ' bibtex_ids: ['Zhang2022Miracl'] yo/test-b: desc: ' <p> The held-out test set (version b) for Yoruba. </p> ' bibtex_ids: ['Zhang2022Miracl'] zh: desc: ' <p> The Chinese corpus. </p> ' bibtex_ids: ['Zhang2022Miracl'] zh/train: desc: ' <p> The train set for Chinese. </p> ' bibtex_ids: ['Zhang2022Miracl'] zh/dev: desc: ' <p> The dev set for Chinese. </p> ' bibtex_ids: ['Zhang2022Miracl'] zh/test-b: desc: ' <p> The held-out test set (version b) for Chinese. </p> ' bibtex_ids: ['Zhang2022Miracl'] ================================================ FILE: ir_datasets/docs/mmarco.yaml ================================================ _: pretty_name: 'mMARCO' desc: ' <p> A version of the MS MARCO passage dataset (<a class="ds-ref">msmarco-passage</a>) with the queries and documents automatically translated into several languages. </p> <ul> <li>Documents: Short passages (from web), translated from English</li> <li>Queries: Natural language questions (from query log), translated from English</li> <li><a href="https://github.com/unicamp-dl/mMARCO">Repository</a></li> <li><a href="https://arxiv.org/abs/2108.13897">Dataset Paper</a></li> </ul>' bibtex_ids: ['Bonifacio2021MMarco'] zh: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] zh/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Chinese. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] zh/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] zh/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] zh/dev/v1.1: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Chinese. </p> <p> Version 1.1 of this file includes manual corrections from the authors of the translated files. <a href="https://github.com/unicamp-dl/mMARCO/issues/8#issuecomment-992810293">See discussion here</a>. </p>' bibtex_ids: ['Bonifacio2021MMarco'] zh/dev/small/v1.1: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Chinese. </p> <p> Version 1.1 of this file includes manual corrections from the authors of the translated files. <a href="https://github.com/unicamp-dl/mMARCO/issues/8#issuecomment-992810293">See discussion here</a>. </p>' bibtex_ids: ['Bonifacio2021MMarco'] fr: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] fr/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] fr/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] fr/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] de: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into German. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] de/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] de/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] de/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] id: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] id/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] id/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] id/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] it: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] it/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] it/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] it/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Italian. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] ru: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] ru/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] ru/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] ru/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] es: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] es/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] es/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] es/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] pt: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] pt/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] pt/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Portuguese. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] pt/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] pt/dev/v1.1: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Portuguese. </p> <p> Version 1.1 of this file includes manual corrections from the authors of the translated files. <a href="https://github.com/unicamp-dl/mMARCO/issues/8#issuecomment-992810293">See discussion here</a>. It also removes some duplicated query IDs. </p>' bibtex_ids: ['Bonifacio2021MMarco'] pt/dev/small/v1.1: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Portuguese. </p> <p> Version 1.1 of this file includes manual corrections from the authors of the translated files. <a href="https://github.com/unicamp-dl/mMARCO/issues/8#issuecomment-992810293">See discussion here</a>. It also removes some duplicated query IDs. </p>' bibtex_ids: ['Bonifacio2021MMarco'] pt/train/v1.1: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Portuguese. </p> <p> Version 1.1 of this file includes manual corrections from the authors of the translated files. <a href="https://github.com/unicamp-dl/mMARCO/issues/8#issuecomment-992810293">See discussion here</a>. It also removes some duplicated query IDs. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ar: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Arabic. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ar/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Arabic. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ar/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Arabic. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ar/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Arabic. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/de: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/de/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/de/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/de/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into German. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/dt: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Dutch. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/dt/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Dutch. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/dt/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Dutch. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/dt/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Dutch. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/es: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/es/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Spanish. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/es/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/es/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Spanish. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/fr: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/fr/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/fr/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/fr/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into French. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/hi: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Hindi. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/hi/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Hindi. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/hi/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Hindi. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/hi/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Hindi. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/id: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Indonesian. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/id/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/id/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/id/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Indonesian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/it: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/it/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/it/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/it/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Italian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ja: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Japanese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ja/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Japanese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ja/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Japanese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ja/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Japanese. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/pt: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/pt/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/pt/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/pt/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Portuguese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ru: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ru/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ru/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/ru/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Russian. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/vi: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Vietnamese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/vi/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Vietnamese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/vi/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Vietnamese. 
</p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/vi/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Vietnamese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/zh: desc: ' <p> Version of <a class="ds-ref">msmarco-passage</a>, with queries and documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/zh/dev: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev</a>, with queries and documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/zh/dev/small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/dev/small</a>, with queries and documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] v2/zh/train: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, with queries and documents translated into Chinese. </p>' bibtex_ids: ['Bonifacio2021MMarco'] ================================================ FILE: ir_datasets/docs/mr-tydi.yaml ================================================ _: pretty_name: 'Mr. TyDi' bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> A multi-lingual benchmark suite constructed from the <a href="https://arxiv.org/abs/2003.05002">TyDi QA Benchmark</a>. Relevance labels are sparsely assigned based on shallow human annotation. </p> <ul> <li><a href="https://arxiv.org/pdf/2108.08787.pdf">Dataset paper</a></li> </ul> ' ar: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Arabic dataset, including all train, dev, and test queries and qrels. 
</p> ' ar/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Arabic</p> ' ar/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Arabic</p> ' ar/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Arabic</p> ' bn: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Bengali dataset, including all train, dev, and test queries and qrels. </p> ' bn/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Bengali</p> ' bn/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Bengali</p> ' bn/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Bengali</p> ' en: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete English dataset, including all train, dev, and test queries and qrels. </p> ' en/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for English</p> ' en/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for English</p> ' en/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for English</p> ' fi: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Finnish dataset, including all train, dev, and test queries and qrels. </p> ' fi/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Finnish</p> ' fi/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Finnish</p> ' fi/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Finnish</p> ' id: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Indonesian dataset, including all train, dev, and test queries and qrels. 
</p> ' id/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Indonesian</p> ' id/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Indonesian</p> ' id/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Indonesian</p> ' ja: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Japanese dataset, including all train, dev, and test queries and qrels. </p> ' ja/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Japanese</p> ' ja/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Japanese</p> ' ja/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Japanese</p> ' ko: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Korean dataset, including all train, dev, and test queries and qrels. </p> ' ko/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Korean</p> ' ko/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Korean</p> ' ko/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Korean</p> ' ru: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Russian dataset, including all train, dev, and test queries and qrels. </p> ' ru/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Russian</p> ' ru/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Russian</p> ' ru/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Russian</p> ' sw: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Swahili dataset, including all train, dev, and test queries and qrels. 
</p> ' sw/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Swahili</p> ' sw/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Swahili</p> ' sw/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Swahili</p> ' te: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Telugu dataset, including all train, dev, and test queries and qrels. </p> ' te/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Telugu</p> ' te/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Telugu</p> ' te/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Telugu</p> ' th: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p> Complete Thai dataset, including all train, dev, and test queries and qrels. </p> ' th/train: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Train set for Thai</p> ' th/dev: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Development set for Thai</p> ' th/test: bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa'] desc: ' <p>Test set for Thai</p> ' ================================================ FILE: ir_datasets/docs/msmarco-document-v2.yaml ================================================ _: pretty_name: 'MSMARCO (document, version 2)' desc: ' <p> Version 2 of the MS MARCO document ranking dataset. The corpus contains 12M documents (roughly 3x as many as version 1). </p> <ul> <li>Version 1 of dataset: <a class="ds-ref">msmarco-document</a></li> <li>Documents: Text extracted from web pages</li> <li>Queries: Natural language questions (from query log)</li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> </ul>' bibtex_ids: ['Bajaj2016Msmarco'] dev1: desc: ' <p> Official dev1 set with 4,552 queries. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] dev2: desc: ' <p> Official dev2 set with 5,000 queries. 
</p> ' bibtex_ids: ['Bajaj2016Msmarco'] train: desc: ' <p> Official train set with 322,196 queries. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] trec-dl-2019: desc: ' <p> Queries from the TREC Deep Learning (DL) 2019 shared task, which were sampled from <a class="ds-ref">msmarco-document/eval</a>. A subset of these queries were judged by NIST assessors, (filtered list available in <a class="ds-ref">msmarco-document-v2/trec-dl-2019/judged</a>). </p> <ul> <li><a href="https://arxiv.org/pdf/2003.07820.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco'] trec-dl-2019/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-document-v2/trec-dl-2019</a>, only including queries with qrels. </p> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco'] trec-dl-2020: desc: ' <p> Queries from the TREC Deep Learning (DL) 2020 shared task, which were sampled from <a class="ds-ref">msmarco-document/eval</a>. A subset of these queries were judged by NIST assessors, (filtered list available in <a class="ds-ref">msmarco-document-v2/trec-dl-2020/judged</a>). </p> <ul> <li><a href="https://arxiv.org/pdf/2102.07662.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2020TrecDl', 'Bajaj2016Msmarco'] trec-dl-2020/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-document-v2/trec-dl-2020</a>, only including queries with qrels. </p> ' bibtex_ids: ['Craswell2020TrecDl', 'Bajaj2016Msmarco'] trec-dl-2021: desc: ' <p> Official topics for the TREC Deep Learning (DL) 2021 shared task. </p> <p> Note that at this time, qrels are only available to those with TREC active participant login credentials. </p> ' official_measures: ['AP@100', 'nDCG@10', 'P@10', 'RR(rel=2)'] trec-dl-2021/judged: desc: ' <p> <a class="ds-ref">msmarco-document-v2/trec-dl-2021</a>, but filtered down to the 57 queries with qrels. </p> <p> Note that at this time, this is only available to those with TREC active participant login credentials. 
</p> ' official_measures: ['AP@100', 'nDCG@10', 'P@10', 'RR(rel=2)'] trec-dl-2022: desc: ' <p> Official topics for the TREC Deep Learning (DL) 2022 shared task. </p> <p> Note that these qrels are <i>inferred</i> from the passage ranking task; a document''s relevance label is the maximum of the labels of its passages. </p> ' trec-dl-2022/judged: desc: ' <p> <a class="ds-ref">msmarco-document-v2/trec-dl-2022</a>, but filtered down to only the queries with qrels. </p> ' trec-dl-2023: desc: ' <p> Official topics for the TREC Deep Learning (DL) 2023 shared task. </p> <p> Note that these qrels are <i>inferred</i> from the passage ranking task; a document''s relevance label is the maximum of the labels of its passages. </p> ' trec-dl-2023/judged: desc: ' <p> <a class="ds-ref">msmarco-document-v2/trec-dl-2023</a>, but filtered down to only the queries with qrels. </p> ' anchor-text: pretty_name: "Anchor Text for version 2 of MS Marco" desc: ' <p> For version 2 of MS MARCO, the anchor text collection enriches 4,821,244 documents with anchor text extracted from six Common Crawl snapshots. To keep the collection size reasonable, we sampled 1,000 anchor texts for documents with more than 1,000 anchor texts (this sampling yields that all anchor text is included for 97% of the documents). The <code>text</code> field contains the anchor texts concatenated and the <code>anchors</code> field contains the anchor texts as list. The raw dataset with additional information (roughly 100GB) is <a href="https://github.com/webis-de/ecir22-anchor-text">available online</a>. </p> ' bibtex_ids: ['Froebe2022Anchors'] ================================================ FILE: ir_datasets/docs/msmarco-document.yaml ================================================ _: pretty_name: 'MSMARCO (document)' desc: ' <p> "Based the questions in the [MS-MARCO] Question Answering Dataset and the documents which answered the questions a document ranking task was formulated. 
There are 3.2 million documents and the goal is to rank based on their relevance. Relevance labels are derived from what passages was marked as having the answer in the QnA dataset." </p> <ul> <li>See also: <a class="ds-ref">msmarco-passage</a></li> <li>Documents: Text extracted from web pages</li> <li>Queries: Natural language questions (from query log)</li> <li><a href="https://microsoft.github.io/msmarco/#docranking">Leaderboard</a></li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> </ul>' bibtex_ids: ['Bajaj2016Msmarco'] dev: desc: ' <p> Official dev set. All queries have exactly 1 (positive) relevance judgment. </p> <p> scoreddocs are the top 100 results from Indri QL. These are used for the "re-ranking" setting. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] eval: desc: ' <p> Official eval set for submission to MS MARCO leaderboard. Relevance judgments are hidden. </p> <p> scoreddocs are the top 100 results from Indri QL. These are used for the "re-ranking" setting. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] orcas: desc: ' <p> "ORCAS is a click-based dataset associated with the TREC Deep Learning Track. It covers 1.4 million of the TREC DL documents, providing 18 million connections to 10 million distinct queries." </p> <ul> <li>Queries: From query log</li> <li>Relevance Data: User clicks</li> <li>Scored docs: Indri Query Likelihood model</li> <li><a href="https://arxiv.org/abs/2006.05324">Dataset Paper</a></li> </ul> ' bibtex_ids: ['Craswell2020Orcas'] official_measures: ['RR', 'nDCG'] train: desc: ' <p> Official train set. All queries have exactly 1 (positive) relevance judgment. </p> <p> scoreddocs are the top 100 results from Indri QL. These are used for the "re-ranking" setting. 
</p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] trec-dl-2019: desc: ' <p> Queries from the TREC Deep Learning (DL) 2019 shared task, which were sampled from <a class="ds-ref">msmarco-document/eval</a>. A subset of these queries were judged by NIST assessors, (filtered list available in <a class="ds-ref">msmarco-document/trec-dl-2019/judged</a>). </p> <ul> <li><a href="https://arxiv.org/pdf/2003.07820.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR', 'MAP'] trec-dl-2019/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-document/trec-dl-2019</a>, only including queries with qrels. </p> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR', 'MAP'] trec-dl-2020: desc: ' <p> Queries from the TREC Deep Learning (DL) 2020 shared task, which were sampled from <a class="ds-ref">msmarco-document/eval</a>. A subset of these queries were judged by NIST assessors, (filtered list available in <a class="ds-ref">msmarco-document/trec-dl-2020/judged</a>). </p> <ul> <li><a href="https://arxiv.org/pdf/2102.07662.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2020TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR', 'MAP'] trec-dl-2020/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-document/trec-dl-2020</a>, only including queries with qrels. </p> ' bibtex_ids: ['Craswell2020TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR', 'MAP'] trec-dl-hard: desc: ' <p> A more challenging subset of <a class="ds-ref">msmarco-document/trec-dl-2019</a> and <a class="ds-ref">msmarco-document/trec-dl-2020</a>. 
</p> <ul> <li><a href="https://github.com/grill-lab/DL-Hard">data website</a></li> <li>See Also: <a class="ds-ref">msmarco-passage/trec-dl-hard</a></li> </ul> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold1: desc: ' <p> Fold 1 of <a class="ds-ref">msmarco-document/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold2: desc: ' <p> Fold 2 of <a class="ds-ref">msmarco-document/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold3: desc: ' <p> Fold 3 of <a class="ds-ref">msmarco-document/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold4: desc: ' <p> Fold 4 of <a class="ds-ref">msmarco-document/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold5: desc: ' <p> Fold 5 of <a class="ds-ref">msmarco-document/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] anchor-text: pretty_name: "Anchor Text for Version 1 of MS MARCO" desc: ' <p> For version 1 of MS MARCO, the anchor text collection enriches 1,703,834 documents with anchor text extracted from six Common Crawl snapshots. To keep the collection size reasonable, we sampled 1,000 anchor texts for documents with more than 1,000 anchor texts (this sampling yields that all anchor text is included for 94% of the documents). The <code>text</code> field contains the anchor texts concatenated and the <code>anchors</code> field contains the anchor texts as list. The raw dataset with additional information (roughly 100GB) is <a href="https://github.com/webis-de/ecir22-anchor-text">available online</a>. 
</p> ' bibtex_ids: ['Froebe2022Anchors'] ================================================ FILE: ir_datasets/docs/msmarco-passage-v2.yaml ================================================ _: pretty_name: 'MSMARCO (passage, version 2)' desc: ' <p> Version 2 of the MS MARCO passage ranking dataset. The corpus contains 138M passages, which can be linked up with documents in <a class="ds-ref">msmarco-document-v2</a>. </p> <ul> <li>Version 1 of dataset: <a class="ds-ref">msmarco-passage</a></li> <li>Documents: Text extracted from web pages</li> <li>Queries: Natural language questions (from query log)</li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> </ul> <p> Change Log </p> <ul> <li> On July 21, 2021, the task organizers <a href="https://github.com/microsoft/msmarco/commit/41b3a684ed8ebd4e753250c3687547a77c62e7dd"> updated the train, dev1, and dev2 qrels</a> to remove duplicate entries from the files. This should not have changed results from evaluation tools, but may result in non-repeatable results if these files were used in another process (e.g., model training). The original qrels file for <a class="ds-ref">msmarco-passage-v2/train</a> can be found <a href="https://mirror.ir-datasets.com/abf1fd024b6aca203364d2138c241a6d">here</a> to aid in result repeatability. </li> </ul>' bibtex_ids: ['Bajaj2016Msmarco'] dev1: desc: ' <p> Official dev1 set with 3,903 queries. </p> <p> Note that the qrels in this dataset are not directly human-assessed; labels from <a class="ds-ref">msmarco-passage</a> are mapped to documents via URL, these documents are re-passaged, and then the best approximate match is identified. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] dev2: desc: ' <p> Official dev2 set with 4,281 queries. 
</p> <p> Note that the qrels in this dataset are not directly human-assessed; labels from <a class="ds-ref">msmarco-passage</a> are mapped to documents via URL, these documents are re-passaged, and then the best approximate match is identified. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train: desc: ' <p> Official train set with 277,144 queries. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] trec-dl-2021: desc: ' <p> Official topics for the TREC Deep Learning (DL) 2021 shared task. </p> ' official_measures: ['AP@100', 'nDCG@10', 'P(rel=2)@10', 'RR(rel=2)'] trec-dl-2021/judged: desc: ' <p> <a class="ds-ref">msmarco-passage-v2/trec-dl-2021</a>, but filtered down to the 53 queries with qrels. </p> ' official_measures: ['AP@100', 'nDCG@10', 'P(rel=2)@10', 'RR(rel=2)'] trec-dl-2022: desc: ' <p> Official topics for the TREC Deep Learning (DL) 2022 shared task. </p> <p> Note that the officially-released qrels <i>include</i> relevance labels propagated to duplicate passages, while results presented in the notebook papers remove duplicate documents. This means that the results are not directly comparable, and extra care should be taken when making comparisons among systems to ensure that they were evaluated in the same settings. </p> ' trec-dl-2022/judged: desc: ' <p> <a class="ds-ref">msmarco-passage-v2/trec-dl-2022</a>, but filtered down to only the queries with qrels. </p> ' trec-dl-2023: desc: ' <p> Official topics for the TREC Deep Learning (DL) 2023 shared task. </p> <p> Note that the officially-released qrels <i>include</i> relevance labels propagated to duplicate passages, while results presented in the notebook papers remove duplicate documents. This means that the results are not directly comparable, and extra care should be taken when making comparisons among systems to ensure that they were evaluated in the same settings. 
</p> ' trec-dl-2023/judged: desc: ' <p> <a class="ds-ref">msmarco-passage-v2/trec-dl-2023</a>, but filtered down to only the queries with qrels. </p> ' ================================================ FILE: ir_datasets/docs/msmarco-passage.yaml ================================================ _: pretty_name: 'MSMARCO (passage)' desc: ' <p> A passage ranking benchmark with a collection of 8.8 million passages and question queries. Most relevance judgments are shallow (typically at most 1-2 per query), but the TREC Deep Learning track adds deep judgments. Evaluation typically conducted using MRR@10. </p> <p> Note that the original document source files for this collection contain a double-encoding error that causes strange sequences like "å¬" and "ðºð". These are automatically corrected (properly converting previous examples to "公" and "🇺🇸"). </p> <ul> <li>See also: <a class="ds-ref">msmarco-document</a></li> <li>Documents: Short passages (from web)</li> <li>Queries: Natural language questions (from query log)</li> <li><a href="https://microsoft.github.io/msmarco/#ranking">Leaderboard</a></li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> </ul>' bibtex_ids: ['Bajaj2016Msmarco'] dev: desc: ' <p> Official dev set. </p> <p> scoreddocs are the top 1000 results from BM25. These are used for the "re-ranking" setting. Note that these are sub-sampled to about 1/8 of the total available dev queries by the MSMARCO authors for faster evaluation. The BM25 scores from scoreddocs are not available (all have a score of 0). </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] dev/small: desc: ' <p> Official "small" version of the dev set, consisting of 6,980 queries (6.9% of the full dev set). </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] dev/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/dev</a> that only includes queries that have at least one qrel. 
</p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] dev/2: desc: ' <p> "Dev2" split of the <a class="ds-ref">msmarco-passage/dev</a> set. Originally released as part of the v2 corpus. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] eval: desc: ' <p> Official eval set for submission to MS MARCO leaderboard. Relevance judgments are hidden. </p> <p> scoreddocs are the top 1000 results from BM25. These are used for the "re-ranking" setting. Note that these are sub-sampled to about 1/8 of the total available eval queries by the MSMARCO authors for faster evaluation. The BM25 scores from scoreddocs are not available (all have a score of 0). </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] eval/small: desc: ' <p> Official "small" version of the eval set, consisting of 6,837 queries (6.8% of the full eval set). </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train: desc: ' <p> Official train set. </p> <p> Not all queries have relevance judgments. Use <a class="ds-ref">msmarco-passage/train/judged</a> for a filtered list that only includes documents that have at least one qrel. </p> <p> scoreddocs are the top 1000 results from BM25. These are used for the "re-ranking" setting. Note that these are sub-sampled to about 1/8 of the total available train queries by the MSMARCO authors for faster evaluation. The BM25 scores from scoreddocs are not available (all have a score of 0). </p> <p> docpairs provides access to the "official" sequence for pairwise training. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train/triples-v2: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, but with version 2 of the triples file. </p> <p> This version of the triples file includes rows that were accidentally missing from version 1 of the file (see discussion <a href="https://github.com/microsoft/MSMARCO-Passage-Ranking/commit/4695a71c6c76ce85c07a51c0f12690cab19abbb0">here</a>). 
</p> <p> Note that this is sorted by the IDs in the file, so you probably would not want to use it unless you first shuffle it before usage. <a href="https://github.com/microsoft/MSMARCO-Passage-Ranking/issues/21">We opened an issue</a> suggesting that a third version of the file is provided that is shuffled so that the order is consistent across groups using the data, but at this time, no such file exists in an official capacity. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train/triples-small: desc: ' <p> Version of <a class="ds-ref">msmarco-passage/train</a>, but with the "small" triples file (a 10% sample of the full file). </p> <p> Note that to save on storage space (27GB), the contents of the file are mapped to their corresponding query and document IDs. This process takes a few minutes to run the first time the triples are requested. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/train</a> that only includes queries that have at least one qrel. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train/medical: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/train</a> that only includes queries that have a layman or expert medical term. Note that this includes about 20% false matches due to terms with multiple senses. </p>' bibtex_ids: ['MacAvaney2020MedMarco', 'Bajaj2016Msmarco'] official_measures: ['RR@10'] train/split200-train: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/train</a> without 200 queries that are meant to be used as a small validation set. From various works. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] train/split200-valid: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/train</a> with only 200 queries that are meant to be used as a small validation set. From various works. 
</p> ' bibtex_ids: ['Bajaj2016Msmarco'] official_measures: ['RR@10'] trec-dl-2019: desc: ' <p> Queries from the TREC Deep Learning (DL) 2019 shared task, which were sampled from <a class="ds-ref">msmarco-passage/eval</a>. A subset of these queries were judged by NIST assessors, (filtered list available in <a class="ds-ref">msmarco-passage/trec-dl-2019/judged</a>). </p> <ul> <li><a href="https://arxiv.org/pdf/2003.07820.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)', 'AP(rel=2)'] trec-dl-2019/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/trec-dl-2019</a>, only including queries with qrels. </p> ' bibtex_ids: ['Craswell2019TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)', 'AP(rel=2)'] trec-dl-2020: desc: ' <p> Queries from the TREC Deep Learning (DL) 2020 shared task, which were sampled from <a class="ds-ref">msmarco-passage/eval</a>. A subset of these queries were judged by NIST assessors, (filtered list available in <a class="ds-ref">msmarco-passage/trec-dl-2020/judged</a>). </p> <ul> <li><a href="https://arxiv.org/pdf/2102.07662.pdf">Shared Task Paper</a></li> </ul> ' bibtex_ids: ['Craswell2020TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)', 'AP(rel=2)'] trec-dl-2020/judged: desc: ' <p> Subset of <a class="ds-ref">msmarco-passage/trec-dl-2020</a>, only including queries with qrels. </p> ' bibtex_ids: ['Craswell2020TrecDl', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)', 'AP(rel=2)'] trec-dl-hard: desc: ' <p> A more challenging subset of <a class="ds-ref">msmarco-passage/trec-dl-2019</a> and <a class="ds-ref">msmarco-document/trec-dl-2020</a>. 
</p> <ul> <li><a href="https://github.com/grill-lab/DL-Hard">data website</a></li> <li>See Also: <a class="ds-ref">msmarco-document/trec-dl-hard</a></li> </ul> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold1: desc: ' <p> Fold 1 of <a class="ds-ref">msmarco-passage/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold2: desc: ' <p> Fold 2 of <a class="ds-ref">msmarco-passage/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold3: desc: ' <p> Fold 3 of <a class="ds-ref">msmarco-passage/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold4: desc: ' <p> Fold 4 of <a class="ds-ref">msmarco-passage/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] trec-dl-hard/fold5: desc: ' <p> Fold 5 of <a class="ds-ref">msmarco-passage/trec-dl-hard</a> </p> ' bibtex_ids: ['Mackie2021DlHard', 'Bajaj2016Msmarco'] official_measures: ['nDCG@10', 'RR(rel=2)'] ================================================ FILE: ir_datasets/docs/msmarco-qna.yaml ================================================ _: pretty_name: 'MSMARCO (QnA)' desc: ' <p> The MS MARCO Question Answering dataset. This is the source collection of <a class="ds-ref">msmarco-passage</a> and <a class="ds-ref">msmarco-document</a>. </p> <div class="warn"> It is prohibited to use information from this dataset for submissions to the MS MARCO passage and document leaderboards or the TREC DL shared task. </div> <p> Query IDs in this collection align with those found in <a class="ds-ref">msmarco-passage</a> and <a class="ds-ref">msmarco-document</a>. 
The collection does not provide doc_ids, so these are assigned in the following format: <code>[msmarco_passage_id]-[url_seq]</code>, where <code>[msmarco_passage_id]</code> is the document from <a class="ds-ref">msmarco-passage</a> that has matching contents and <code>[url_seq]</code> is assigned sequentially for each URL encountered. In other words, all documents with the same prefix have the same text; they only differ in the originating document. </p> <p> Doc <code>msmarco_passage_id</code> fields are assigned by matching passage contents in <a class="ds-ref">msmarco-passage</a>, and this field is provided for every document. Doc <code>msmarco_document_id</code> fields are assigned by matching the URL to the one found in <a class="ds-ref">msmarco-document</a>. Due to how <a class="ds-ref">msmarco-document</a> was constructed, there is not necessarily a match (value will be <code class="kwd">None</code> if no match). </p> <ul> <li>Documents: Short passages (from web)</li> <li>Queries: Natural language questions (from query log), including type and natural-language answers.</li> <li><a href="https://microsoft.github.io/msmarco/#qna">Leaderboard</a></li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> <li><a href="https://github.com/microsoft/MSMARCO-Question-Answering">More information</a></li> </ul>' bibtex_ids: ['Bajaj2016Msmarco'] train: desc: ' <p> Official train set. </p> <p> The scoreddocs provides the roughly 10 passages presented to the user for annotation, where the score indicates the order presented. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] dev: desc: ' <p> Official dev set. </p> <p> The scoreddocs provides the roughly 10 passages presented to the user for annotation, where the score indicates the order presented. </p> ' bibtex_ids: ['Bajaj2016Msmarco'] eval: desc: ' <p> Official eval set. </p> <p> The scoreddocs provides the roughly 10 passages presented to the user for annotation, where the score indicates the order presented. 
</p> ' bibtex_ids: ['Bajaj2016Msmarco'] ================================================ FILE: ir_datasets/docs/nano-beir.yaml ================================================ _: pretty_name: 'Nano Beir (benchmark suite)' desc: ' <p> Nano Beir is a smaller version (max 50 queries per benchmark) of the Beir suite of benchmarks to test zero-shot transfer. </p> <ul> <li><a href="https://arxiv.org/abs/2104.08663">Paper</a></li> <li><a href="https://github.com/UKPLab/beir/blob/main/README.md">GitHub</a></li> </ul> ' bibtex_ids: ['Thakur2021Beir'] arguana: desc: ' <p> A version of the ArguAna Counterargs dataset, for argument retrieval. </p> <ul> <li><a href="https://www.aclweb.org/anthology/P18-1023.pdf">Dataset paper</a></li> <li><a href="http://argumentation.bplaced.net/arguana/data">Dataset website</a></li> </ul> ' bibtex_ids: ['Wachsmuth2018Arguana', 'Thakur2021Beir'] climate-fever: desc: ' <p> A version of the CLIMATE-FEVER dataset, for fact verification on claims about climate. </p> <ul> <li><a href="https://arxiv.org/pdf/2012.00614.pdf">Dataset paper</a></li> <li><a href="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html">Dataset website</a></li> </ul> ' bibtex_ids: ['Diggelmann2020CLIMATEFEVERAD', 'Thakur2021Beir'] dbpedia-entity: desc: ' <p> A version of the DBPedia-Entity-v2 dataset for entity retrieval. </p> <ul> <li><a href="http://hasibi.com/files/sigir2017-dbpedia_entity.pdf">Dataset paper</a></li> <li><a href="https://github.com/iai-group/DBpedia-Entity">Dataset website</a></li> </ul> ' bibtex_ids: ['Hasibi2017DBpediaEntityVA', 'Thakur2021Beir'] fever: desc: ' <p> A version of the FEVER dataset for fact verification. </p> <ul> <li><a href="https://www.aclweb.org/anthology/N18-1074.pdf">Dataset paper</a></li> <li><a href="https://fever.ai/resources.html">Dataset website</a></li> </ul> ' bibtex_ids: ['Thorne2018Fever', 'Thakur2021Beir'] fiqa: desc: ' <p> A version of the FIQA-2018 dataset (financial opinion question answering). 
</p> <ul> <li><a href="https://dl.acm.org/doi/10.1145/3184558.3192301">Dataset paper</a></li> <li><a href="https://sites.google.com/view/fiqa/home">Dataset site</a></li> </ul> ' bibtex_ids: ['Maia2018Fiqa', 'Thakur2021Beir'] hotpotqa: desc: ' <p> A version of the Hotpot QA dataset for multi-hop question answering. </p> <ul> <li><a href="https://www.aclweb.org/anthology/D18-1259">Dataset paper</a></li> <li><a href="https://github.com/hotpotqa/hotpot">Dataset website</a></li> </ul> ' bibtex_ids: ['Yang2018Hotpotqa', 'Thakur2021Beir'] msmarco: desc: ' <p> A version of the MS MARCO passage ranking dataset. </p> <p> Note that this version differs from <a class="ds-ref">msmarco-passage</a>, in that it does not correct the encoding problems in the source documents. </p> <ul> <li><a href="https://microsoft.github.io/msmarco/#ranking">Leaderboard</a></li> <li><a href="https://arxiv.org/abs/1611.09268">Dataset Paper</a></li> <li>See also: <a class="ds-ref">msmarco-passage</a></li> </ul> ' bibtex_ids: ['Bajaj2016Msmarco', 'Thakur2021Beir'] nfcorpus: desc: ' <p> A version of the NF Corpus (Nutrition Facts). </p> <p> Data pre-processing may be different than what is done in <a class="ds-ref">nfcorpus</a>. </p> <ul> <li><a href="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/">Dataset website</a></li> <li><a href="https://link.springer.com/chapter/10.1007/978-3-319-30671-1_58">Dataset paper</a></li> <li>See also: <a class="ds-ref">nfcorpus</a></li> </ul> ' bibtex_ids: ['Boteva2016Nfcorpus', 'Thakur2021Beir'] nq: desc: ' <p> A version of the Natural Questions dev dataset. </p> <p> Data pre-processing differs both from what is done in <a class="ds-ref">natural-questions</a> and <a class="ds-ref">dpr-w100/natural-questions</a>, especially with respect to the document collection and filtering conducted on the queries. See the Beir paper for details. 
</p> <ul> <li><a href="https://ai.google.com/research/NaturalQuestions">Dataset website</a></li> <li><a href="https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf">Dataset paper</a></li> <li>See also: <a class="ds-ref">natural-questions</a>, <a class="ds-ref">dpr-w100/natural-questions</a></li> </ul> ' bibtex_ids: ['Kwiatkowski2019Nq', 'Thakur2021Beir'] quora: desc: ' <p> A version of the Quora duplicate question detection dataset (QQP). </p> <ul> <li><a href="https://www.kaggle.com/c/quora-question-pairs">Dataset website</a></li> </ul> ' bibtex_ids: ['Thakur2021Beir'] scidocs: desc: ' <p> A version of the SciDocs dataset, used for citation retrieval. </p> <ul> <li><a href="https://www.aclweb.org/anthology/2020.acl-main.207.pdf">Dataset paper</a></li> <li><a href="https://allenai.org/data/scidocs">Dataset website</a></li> </ul> ' bibtex_ids: ['Cohan2020Scidocs', 'Thakur2021Beir'] scifact: desc: ' <p> A version of the SciFact dataset, for fact verification. </p> <ul> <li><a href="https://www.aclweb.org/anthology/2020.emnlp-main.609.pdf">Dataset paper</a></li> <li><a href="https://www.aclweb.org/anthology/2020.emnlp-main.609.pdf">Dataset website</a></li> </ul> ' bibtex_ids: ['Wadden2020Scifact', 'Thakur2021Beir'] webis-touche2020: desc: ' <p> Original version of the Touché-2020 dataset, for argument retrieval. </p> <div class="warn"> Consider using <a class="ds-ref">beir/webis-touche2020/v2</a> instead; it uses an updated, more complete version of the qrels. </div> <ul> <li><a href="https://link.springer.com/chapter/10.1007%2F978-3-030-58219-7_26">Dataset paper</a></li> <li><a href="https://webis.de/events/touche-20/">Dataset website</a></li> </ul> ' bibtex_ids: ['Bondarenko2020Tuche', 'Thakur2021Beir'] webis-touche2020/v2: desc: ' <p> Version 2 of the Touché-2020 dataset, for argument retrieval. This version uses the "corrected" version of the qrels, mapped to version 1 of the corpus. 
</p> <ul> <li><a href="https://link.springer.com/chapter/10.1007%2F978-3-030-58219-7_26">Dataset paper</a></li> <li><a href="https://webis.de/events/touche-20/">Dataset website</a></li> </ul> ' bibtex_ids: ['Bondarenko2020Tuche', 'Thakur2021Beir'] ================================================ FILE: ir_datasets/docs/natural-questions.yaml ================================================ _: pretty_name: 'Natural Questions' desc: ' <p> Google Natural Questions is a Q&A dataset containing long, short, and Yes/No answers from Wikipedia. <kbd>ir_datasets</kbd> frames this around an ad-hoc ranking setting by building a collection of all long answer candidate passages. However, short and Yes/No annotations are also available in the <kbd>qrels</kbd>, as are the passages presented to the annotators (via <kbd>scoreddocs</kbd>). </p> <p> Importantly, the document collection does not consist of all Wikipedia passages, but instead a union of the candidate passages presented to the annotators (akin to MS MARCO). <a class="ds-ref">dpr-w100/natural-questions/train</a> and <a class="ds-ref">dpr-w100/natural-questions/dev</a> contain a filtered set of the questions in this dataset and a full Wikipedia dump (which is a more realistic retrieval setting). </p> <ul> <li><a href="https://ai.google.com/research/NaturalQuestions">Dataset website</a></li> <li><a href="https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf">Dataset paper</a></li> <li>See also: <a class="ds-ref">dpr-w100</a></li> </ul> ' bibtex_ids: ['Kwiatkowski2019Nq'] train: desc: ' <p> Official train set. </p> ' bibtex_ids: ['Kwiatkowski2019Nq'] dev: desc: ' <p> Official dev set. 
</p> ' bibtex_ids: ['Kwiatkowski2019Nq'] ================================================ FILE: ir_datasets/docs/neuclir.yaml ================================================ _: pretty_name: 'NeuCLIR Corpus' desc: ' <p> This is the dataset created for <a href="https://neuclir.github.io/">TREC 2022 NeuCLIR Track</a>. Topics will be developed and released by June 2022 by NIST. Relevance judgements will be available after the evaluation (around November). </p> <p> The collection is designed to be similar to [HC4] and a large portion of documents from HC4 are ported to this collection. Users can conduct experiments on this collection with queries and qrels in HC4 for development. </p> <ul> <li>Documents: Web pages from Common Crawl in Chinese, Persian, and Russian.</li> <li> Queries: (To be released) English TREC-style title/description queries. Narrative field contains an example passage for each relevance level. Human and machine translation of the titles and descriptions in the target language (i.e., document language) are provided in the query object. </li> <li>Qrels: (To be released) Documents are judged in three levels of relevance. Please refer to the dataset paper for the full definition of the levels. </li> <li>See also: <a class="ds-ref">hc4</a></li> <li><a href="https://neuclir.github.io/">NeuCLIR Track Website</a></li> <li><a href="https://github.com/NeuCLIR/download-collection">Collection Repository</a></li> </ul>' 1: desc: ' <p> Version 1 of the NeuCLIR corpus. </p> ' 1/zh: desc: ' <p> The Chinese collection contains English queries (to be released) and Chinese documents for retrieval. Human and machine translated queries will be provided in the query object for running monolingual retrieval or cross-language retrieval assuming the machine query translation into Chinese is available. </p> ' 1/zh/hc4-filtered: desc: ' <p> Subset of the Chinese collection that intersects with HC4. 
The 60 queries are the <a class="ds-ref">hc4/zh/dev</a> and <a class="ds-ref">hc4/zh/test</a> sets combined. </p> ' bibtex_ids: ['Lawrie2022HC4'] 1/fa: desc: ' <p> The Persian collection contains English queries (to be released) and Persian documents for retrieval. Human and machine translated queries will be provided in the query object for running monolingual retrieval or cross-language retrieval assuming the machine query translation into Persian is available. </p> ' 1/fa/hc4-filtered: desc: ' <p> Subset of the Persian collection that intersects with HC4. The 60 queries are the <a class="ds-ref">hc4/fa/dev</a> and <a class="ds-ref">hc4/fa/test</a> sets combined. </p> ' bibtex_ids: ['Lawrie2022HC4'] 1/ru: desc: ' <p> The Russian collection contains English queries (to be released) and Russian documents for retrieval. Human and machine translated queries will be provided in the query object for running monolingual retrieval or cross-language retrieval assuming the machine query translation into Russian is available. </p> ' 1/ru/hc4-filtered: desc: ' <p> Subset of the Russian collection that intersects with HC4. The 54 queries are the <a class="ds-ref">hc4/ru/dev</a> and <a class="ds-ref">hc4/ru/test</a> sets combined. </p> ' bibtex_ids: ['Lawrie2022HC4'] 1/fa/trec-2022: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2022 (Persian language CLIR). </p> ' 1/fa/trec-2023: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2023 (Persian language CLIR). </p> ' 1/ru/trec-2022: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2022 (Russian language CLIR). </p> ' 1/ru/trec-2023: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2023 (Russian language CLIR). </p> ' 1/zh/trec-2022: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2022 (Chinese language CLIR). </p> ' 1/zh/trec-2023: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2023 (Chinese language CLIR). 
</p> ' 1/multi: desc: ' <p> A combined corpus of NeuCLIR v1 including all Persian, Russian, and Chinese documents. </p> ' 1/multi/trec-2023: desc: ' <p> Topics and assessments for the TREC NeuCLIR 2023 multi-language retrieval task. </p> ' ================================================ FILE: ir_datasets/docs/neumarco.yaml ================================================ _: pretty_name: "neuMARCO" desc: ' <p> A version of <a class="ds-ref">msmarco-passage</a> for cross-language information retrieval, provided by <a href="https://hltcoe.jhu.edu/">JHU HLTCOE</a> with documents translated to other languages using a <a href="https://www.amazon.science/publications/sockeye-2-a-toolkit-for-neural-machine-translation"> Sockeye 2</a> translation model. </p> <ul> <li>Documents: Web passages using machine translation to English</li> <li>Queries: Natural-language web queries in English</li> </ul> ' fa: desc: ' <p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Persian (Farsi).</p> ' fa/dev: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Persian (Farsi).</p> ' fa/dev/judged: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Persian (Farsi).</p> ' fa/dev/small: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Persian (Farsi).</p> ' fa/train: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Persian (Farsi).</p> ' fa/train/judged: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Persian (Farsi).</p> ' ru: desc: ' <p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Russian.</p> ' ru/dev: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Russian.</p> ' ru/dev/judged: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with 
the corpus translated to Russian.</p> ' ru/dev/small: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Russian.</p> ' ru/train: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Russian.</p> ' ru/train/judged: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Russian.</p> ' zh: desc: ' <p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Chinese.</p> ' zh/dev: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Chinese.</p> ' zh/dev/judged: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Chinese.</p> ' zh/dev/small: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Chinese.</p> ' zh/train: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Chinese.</p> ' zh/train/judged: desc: ' <p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Chinese.</p> ' ================================================ FILE: ir_datasets/docs/nfcorpus.yaml ================================================ _: pretty_name: 'NFCorpus (NutritionFacts)' desc: ' <p> "NFCorpus is a full-text English retrieval data set for Medical Information Retrieval. It contains a total of 3,244 natural language queries (written in non-technical English, harvested from the NutritionFacts.org site) with 169,756 automatically extracted relevance judgments for 9,964 medical documents (written in a complex terminology-heavy language), mostly from PubMed." 
</p> <ul> <li><a href="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/">Dataset website</a></li> <li><a href="https://link.springer.com/chapter/10.1007/978-3-319-30671-1_58">Dataset paper</a></li> </ul> ' bibtex_ids: ['Boteva2016Nfcorpus'] train: desc: ' <p> Official train set. Queries include both title and combined "all" text field (titles, descriptions, topics, transcripts and comments) </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] train/nontopic: desc: ' <p> Official train set, filtered to exclude queries from topic pages. </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] train/video: desc: ' <p> Official train set, filtered to only include queries from video pages. </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] dev: desc: ' <p> Official dev set. Queries include both title and combined "all" text field (titles, descriptions, topics, transcripts and comments) </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] dev/nontopic: desc: ' <p> Official dev set, filtered to exclude queries from topic pages. </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] dev/video: desc: ' <p> Official dev set, filtered to only include queries from video pages. </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] test: desc: ' <p> Official test set. Queries include both title and combined "all" text field (titles, descriptions, topics, transcripts and comments) </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] test/nontopic: desc: ' <p> Official test set, filtered to exclude queries from topic pages. </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] test/video: desc: ' <p> Official test set, filtered to only include queries from video pages. </p> ' bibtex_ids: ['Boteva2016Nfcorpus'] ================================================ FILE: ir_datasets/docs/nyt.yaml ================================================ _: pretty_name: 'NYT' desc: ' <p> The New York Times Annotated Corpus. Consists of articles published between 1987 and 2007. 
It is used in TREC Core 2017 and it is also useful for transferring relevance signals in cases where training data is in short supply. </p> <p> Uses data from <a href="https://catalog.ldc.upenn.edu/LDC2008T19">LDC2008T19</a>. The source collection can be downloaded from the LDC. </p>' docs_instructions: &inst "docs available from LDC" bibtex_ids: ['Sandhaus2008Nyt'] data_access: ' <p> To use this dataset, you need a copy of the source corpus, provided by the Linguistic Data Consortium. The specific resource needed is <a href="https://catalog.ldc.upenn.edu/LDC2008T19">LDC2008T19</a>. </p> <p> Many organizations already have a subscription to the LDC, so access to the collection can be as easy as confirming the data usage agreement and downloading the corpus. Check with your library for access details. </p> <p> The source file is: <kbd>nyt_corpus_LDC2008T19.tgz</kbd>. </p> <p> ir_datasets expects this file to be copied/linked as <kbd>~/.ir_datasets/nyt/nyt.tgz</kbd>. </p> ' trec-core-2017: desc: ' <p> The TREC Common Core 2017 benchmark. </p> <p> Note that this dataset only contains the 50 queries assessed by NIST. </p> <ul> <li>Queries: TREC-style (keyword, description, narrative)</li> <li>Relevance: Deeply-annotated</li> <li><a href="https://github.com/trec-core/2017">Shared Task Website</a></li> <li><a href="https://trec.nist.gov/pubs/trec26/papers/Overview-CC.pdf">Shared Task Paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Allan2017TrecCore', 'Sandhaus2008Nyt'] wksup: desc: ' <p> Weak supervision from the NYT document collection. This is useful for transferring relevance signals in cases where training data is in short supply. 
</p> <ul> <li>Queries: Headlines</li> <li>Relevance: Assumed headline is relevant to article</li> <li><a href="https://arxiv.org/abs/1707.00189">Paper</a></li> </ul>' docs_instructions: *inst queries_instructions: &inst_queries "queries available from LDC" qrels_instructions: &inst_qrels "qrels available from LDC" bibtex_ids: ['MacAvaney2019Wksup', 'Sandhaus2008Nyt'] wksup/train: desc: ' <p> Training set (without held-out <a class="ds-ref">nyt/wksup/valid</a>) for transferring relevance signals from NYT corpus. </p>' docs_instructions: *inst queries_instructions: *inst_queries qrels_instructions: *inst_qrels bibtex_ids: ['MacAvaney2019Wksup', 'Sandhaus2008Nyt'] wksup/valid: desc: ' <p> Held-out validation set for transferring relevance signals from NYT corpus (see <a class="ds-ref">nyt/wksup/train</a>). </p>' docs_instructions: *inst queries_instructions: *inst_queries qrels_instructions: *inst_qrels bibtex_ids: ['MacAvaney2019Wksup', 'Sandhaus2008Nyt'] ================================================ FILE: ir_datasets/docs/pmc.yaml ================================================ _: pretty_name: 'PubMed Central (TREC CDS)' desc: ' <p> Bio-medical articles from <a href="https://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a>. Right now, only includes subsets used for the TREC Clinical Decision Support (CDS) 2014-16 tasks. </p>' v1: desc: ' <p> Subset of PMC articles used for the TREC 2014 and 2015 tasks (v1). Includes titles, abstracts, full text. Collected from the open access segment on January 21, 2014. </p> <ul> <li><a href="http://www.trec-cds.org/2014.html#documents">Information on documents</a></li> </ul>' v2: desc: ' <p> Subset of PMC articles used for the TREC 2016 task (v2). Includes titles, abstracts, full text. Collected from the open access segment on March 28, 2016. 
</p> <ul> <li><a href="http://www.trec-cds.org/2016.html#documents">Information on documents</a></li> </ul>' v1/trec-cds-2014: desc: ' <p> The TREC Clinical Decision Support (CDS) track from 2014. </p> <ul> <li><a href="http://www.trec-cds.org/2014.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec23/papers/overview-clinical.pdf">Task Overview Paper</a></li> </ul>' bibtex_ids: ['Simpson2014TrecCds'] v1/trec-cds-2015: desc: ' <p> The TREC Clinical Decision Support (CDS) track from 2015. </p> <ul> <li><a href="http://www.trec-cds.org/2015.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec24/papers/Overview-CL.pdf">Task Overview Paper</a></li> </ul>' bibtex_ids: ['Roberts2015TrecCds'] v2/trec-cds-2016: desc: ' <p> The TREC Clinical Decision Support (CDS) track from 2016. </p> <ul> <li><a href="http://www.trec-cds.org/2016.html">Shared task site</a></li> <li><a href="https://trec.nist.gov/pubs/trec25/papers/Overview-CL.pdf">Task Overview Paper</a></li> </ul>' bibtex_ids: ['Roberts2016TrecCds'] ================================================ FILE: ir_datasets/docs/sara.yaml ================================================ _: # matches documentation key above pretty_name: 'SARA' # a more human-readable way to present this dataset than the dataset-id desc: ' <p> A set of sensitivity-aware relevance assessments. More information is available here: </p> <ul> <li><a href="https://github.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments">SARA</a></li> </ul> ' ================================================ FILE: ir_datasets/docs/touche-image.yaml ================================================ _: pretty_name: "Touché Image Search" desc: | <p> Focused crawl of about 23 841 images (and associated web pages) as document collection. </p> <p> This collection is licensed with the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. 
Individual rights to the content still apply. </p> <ul> <li><a href="https://zenodo.org/record/6873575">Zenodo</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/image-retrieval-for-arguments.html">Touché 2022 task 3 website</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/">Touché 2022 lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-99739-7_43">Overview paper</a></li> </ul> bibtex_ids: - Bondarenko2022Touche - Kiesel2021Image - Dimitrov2021SemEval - Yanai2007Image 2022-06-13: pretty_name: "Touché Image Search" desc: | <p> Corpus version 2022-06-13 with 23 841 images. It was released on June 13, 2022 on <a href="https://zenodo.org/record/3734893">Zenodo</a>. </p> <p> This collection is licensed with the <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. Individual rights to the content still apply. </p> <ul> <li><a href="https://zenodo.org/record/6873575">Zenodo</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/image-retrieval-for-arguments.html">Touché 2022 task 3 website</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/">Touché 2022 lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-99739-7_43">Overview paper</a></li> </ul> bibtex_ids: - Bondarenko2022Touche - Kiesel2021Image - Dimitrov2021SemEval - Yanai2007Image ================================================ FILE: ir_datasets/docs/touche.yaml ================================================ 2020/task-1: pretty_name: "Touché 2020 Task 1: Argument Retrieval for Controversial Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, eventually come to a point where one side will challenge the other with a why-question, which is a prompt to justify one's stance. Thus, technologies for argument mining and argumentation processing are maturing at a rapid pace, giving rise for the first time to argument retrieval. 
Touché 2020 is the first lab on Argument Retrieval at CLEF 2020 featuring two tasks. </p> <p> Given a question on a controversial topic, retrieve relevant arguments from a focused crawl of online debate portals (<a class="ds-ref">argsme/2020-04-01</a>). </p> <p> Documents are judged based on their general topical relevance. </p> <ul> <li><a href="https://webis.de/events/touche-20/shared-task-1.html">Task 1 website</a></li> <li><a href="https://webis.de/events/touche-20/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-58219-7_26">Overview paper</a></li> <li><a href="https://www.youtube.com/playlist?list=PLgD1TOdHQCI90NnCLg9f4g32KLuOfPXR4">Workshop videos</a></li> </ul> bibtex_ids: - Bondarenko2020Touche - Wachsmuth2017Quality 2020/task-1/argsme-1.0/uncorrected: pretty_name: "Touché 2020 Task 1: Argument Retrieval for Controversial Questions (Uncorrected Labels, args.me 1.0)" desc: | <p> Version of <a class="ds-ref">argsme/2020-04-01/touche-2020-task-1</a> that uses the <a class="ds-ref">argsme/1.0</a> corpus with uncorrected relevance judgements derived from crowdworkers. This dataset's relevance judgements should <em>not</em> be used without preprocessing. </p> <ul> <li><a href="https://webis.de/events/touche-20/shared-task-1.html">Task 1 website</a></li> <li><a href="https://webis.de/events/touche-20/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-58219-7_26">Overview paper</a></li> <li><a href="https://www.youtube.com/playlist?list=PLgD1TOdHQCI90NnCLg9f4g32KLuOfPXR4">Workshop videos</a></li> </ul> bibtex_ids: - Bondarenko2020Touche - Wachsmuth2017Quality 2020/task-1/argsme-2020-04-01/uncorrected: pretty_name: "Touché 2020 Task 1: Argument Retrieval for Controversial Questions (Uncorrected Labels, args.me 2020-04-01)" desc: | <p> Version of <a class="ds-ref">argsme/2020-04-01/touche-2020-task-1</a> that uses uncorrected relevance judgements derived from crowdworkers. 
This dataset's relevance judgements should <em>not</em> be used without preprocessing. </p> <ul> <li><a href="https://webis.de/events/touche-20/shared-task-1.html">Task 1 website</a></li> <li><a href="https://webis.de/events/touche-20/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-58219-7_26">Overview paper</a></li> <li><a href="https://www.youtube.com/playlist?list=PLgD1TOdHQCI90NnCLg9f4g32KLuOfPXR4">Workshop videos</a></li> </ul> bibtex_ids: - Bondarenko2020Touche - Wachsmuth2017Quality 2020/task-2: pretty_name: "Touché 2020 Task 2: Argument Retrieval for Comparative Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, eventually come to a point where one side will challenge the other with a why-question, which is a prompt to justify one's stance. Thus, technologies for argument mining and argumentation processing are maturing at a rapid pace, giving rise for the first time to argument retrieval. Touché 2020 is the first lab on Argument Retrieval at CLEF 2020 featuring two tasks. </p> <p> Given a comparative question, retrieve and rank documents from the ClueWeb12 that help to answer the comparative question. </p> <p> Documents are judged based on their general topical relevance. 
</p> <ul> <li><a href="https://webis.de/events/touche-20/shared-task-2.html">Task 2 website</a></li> <li><a href="https://webis.de/events/touche-20/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-58219-7_26">Overview paper</a></li> <li><a href="https://www.youtube.com/playlist?list=PLgD1TOdHQCI90NnCLg9f4g32KLuOfPXR4">Workshop videos</a></li> </ul> bibtex_ids: - Bondarenko2020Touche - Braunstain2016Support - Rafalak2014Credibility 2021/task-1: pretty_name: "Touché 2021 Task 1: Argument Retrieval for Controversial Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, often come to a point where one side challenges the other with a why-question, which is a prompt to justify some stance based on arguments. Since technologies for argument mining are maturing at a rapid pace, also ad-hoc argument retrieval becomes a feasible task in reach. Touché 2021 is the second lab on argument retrieval at CLEF 2021 featuring two tasks. </p> <p> Given a question on a controversial topic, retrieve relevant arguments from a focused crawl of online debate portals (<a class="ds-ref">argsme/2020-04-01</a>). </p> <p> Documents are judged based on their general topical relevance and for rhetorical quality, i.e., "well-writtenness" of the document: (1) whether the text has a good style of speech (formal language is preferred over informal), (2) whether the text has a proper sentence structure and is easy to read, (3) whether it includes profanity, has typos, and makes use of other detrimental style choices. 
</p> <ul> <li><a href="https://webis.de/events/touche-21/shared-task-1.html">Task 1 website</a></li> <li><a href="https://webis.de/events/touche-21/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-85251-1_28">Overview paper</a></li> <li><a href="https://www.youtube.com/playlist?list=PLgD1TOdHQCI8FDfYnzcjbsf26RIatNgM3">Workshop videos</a></li> </ul> bibtex_ids: - Bondarenko2021Touche 2021/task-2: pretty_name: "Touché 2021 Task 2: Argument Retrieval for Comparative Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, often come to a point where one side challenges the other with a why-question, which is a prompt to justify some stance based on arguments. Since technologies for argument mining are maturing at a rapid pace, also ad-hoc argument retrieval becomes a feasible task in reach. Touché 2021 is the second lab on argument retrieval at CLEF 2021 featuring two tasks. </p> <p> Given a comparative question, retrieve and rank documents from the ClueWeb12 that help to answer the comparative question. </p> <p> Documents are judged based on their general topical relevance and for rhetorical quality, i.e., "well-writtenness" of the document: (1) whether the text has a good style of speech (formal language is preferred over informal), (2) whether the text has a proper sentence structure and is easy to read, (3) whether it includes profanity, has typos, and makes use of other detrimental style choices. 
</p> <ul> <li><a href="https://webis.de/events/touche-21/shared-task-2.html">Task 2 website</a></li> <li><a href="https://webis.de/events/touche-21/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-85251-1_28">Overview paper</a></li> <li><a href="https://www.youtube.com/playlist?list=PLgD1TOdHQCI8FDfYnzcjbsf26RIatNgM3">Workshop videos</a></li> </ul> bibtex_ids: - Bondarenko2021Touche 2022/task-1: pretty_name: "Touché 2022 Task 1: Argument Retrieval for Controversial Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, often come to a point where one side challenges the other with a why-question, which is a prompt to justify some stance based on arguments. Since technologies for argument mining are maturing at a rapid pace, also ad-hoc argument retrieval becomes a feasible task in reach. Touché 2022 is the third lab on argument retrieval at CLEF 2022 featuring three tasks. </p> <p> Given a query about a controversial topic, retrieve and rank a relevant pair of sentences from a collection of arguments (<a class="ds-ref">argsme/2020-04-01-processed</a>). </p> <p> Documents are judged based on their general topical relevance and for rhetorical quality, i.e., "well-writtenness" of the document: (1) whether the text has a good style of speech (formal language is preferred over informal), (2) whether the text has a proper sentence structure and is easy to read, (3) whether it includes profanity, has typos, and makes use of other detrimental style choices. 
</p> <ul> <li><a href="https://touche.webis.de/clef22/touche22-web/argument-retrieval-for-controversial-questions.html">Task 1 website</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-99739-7_43">Overview paper</a></li> </ul> bibtex_ids: - Bondarenko2022Touche 2022/task-2: pretty_name: "Touché 2022 Task 2: Argument Retrieval for Comparative Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, often come to a point where one side challenges the other with a why-question, which is a prompt to justify some stance based on arguments. Since technologies for argument mining are maturing at a rapid pace, also ad-hoc argument retrieval becomes a feasible task in reach. Touché 2022 is the third lab on argument retrieval at CLEF 2022 featuring three tasks. </p> <p> Given a comparative topic and a collection of documents, the task is to retrieve relevant argumentative passages for either compared object or for both and to detect their respective stances with respect to the object they talk about. </p> <p> Documents are judged based on their general topical relevance and for rhetorical quality, i.e., "well-writtenness" of the document: (1) whether the text has a good style of speech (formal language is preferred over informal), (2) whether the text has a proper sentence structure and is easy to read, (3) whether it includes profanity, has typos, and makes use of other detrimental style choices. </p> <p> Additionally, classify the stance of the retrieved text passages towards the compared objects in questions. For instance, in the question <i>Who is a better friend, a cat or a dog?</i> the terms <i>cat</i> and <i>dog</i> are the comparison objects. 
An answer candidate like <i>Cats can be quite affectionate and attentive, and thus are good friends</i> should be classified as pro the <i>cat</i> object, while <i>Cats are less faithful than dogs</i> as supporting the <i>dog</i> object. </p> <ul> <li><a href="https://touche.webis.de/clef22/touche22-web/argument-retrieval-for-comparative-questions.html">Task 2 website</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-99739-7_43">Overview paper</a></li> </ul> bibtex_ids: - Bondarenko2022Touche 2022/task-2/expanded-doc-t5-query: pretty_name: "Touché 2022 Task 2: Argument Retrieval for Comparative Questions (Expanded)" desc: | <p> Pre-processed version of <a class="ds-ref">clueweb12/touche-2022-task-2</a> where each passage has been expanded with queries generated using DocT5Query. </p> bibtex_ids: - Bondarenko2022Touche 2022/task-3: pretty_name: "Touché 2022 Task 3: Argument Retrieval for Comparative Questions" desc: | <p> Decision making processes, be it at the societal or at the personal level, often come to a point where one side challenges the other with a why-question, which is a prompt to justify some stance based on arguments. Since technologies for argument mining are maturing at a rapid pace, also ad-hoc argument retrieval becomes a feasible task in reach. Touché 2022 is the third lab on argument retrieval at CLEF 2022 featuring three tasks. </p> <p> Given a controversial topic, the task is to retrieve images (from <a class="ds-ref">touche-image/2022-06-13</a>) for each stance (pro/con) that show support for that stance. </p> <p> Systems are evaluated on Touché topics 1-50 by the ratio of images among the 20 retrieved images for each topic (10 images for each stance) that are all three: relevant to the topic, argumentative, and have the associated stance. 
</p> <ul> <li><a href="https://touche.webis.de/clef22/touche22-web/image-retrieval-for-arguments.html">Task 3 website</a></li> <li><a href="https://touche.webis.de/clef22/touche22-web/">Lab website</a></li> <li><a href="https://doi.org/10.1007/978-3-030-99739-7_43">Overview paper</a></li> </ul> bibtex_ids: - Bondarenko2022Touche - Kiesel2021Image - Dimitrov2021SemEval - Yanai2007Image ================================================ FILE: ir_datasets/docs/trec-arabic.yaml ================================================ _: pretty_name: 'TREC Arabic' desc: ' <p> A collection of news articles in Arabic, used for multi-lingual evaluation in TREC 2001 and TREC 2002. </p> <p> Document collection from <a href="https://catalog.ldc.upenn.edu/LDC2001T55">LDC2001T55</a>. </p> ' docs_instructions: &inst "docs available from LDC" bibtex_ids: ['Graff2001Arabic'] data_access: ' <p> To use this dataset, you need a copy of the source corpus, provided by the Linguistic Data Consortium. The specific resource needed is <a href="https://catalog.ldc.upenn.edu/LDC2001T55">LDC2001T55</a>. </p> <p> Many organizations already have a subscription to the LDC, so access to the collection can be as easy as confirming the data usage agreement and downloading the corpus. Check with your library for access details. </p> <p> The source file is: <kbd>arabic_newswire_a_LDC2001T55.tgz</kbd>. </p> <p> ir_datasets expects this file to be copied/linked as <kbd>~/.ir_datasets/trec-arabic/corpus.tgz</kbd>. </p> ' ar2001: desc: ' <p> Arabic benchmark from TREC 2001. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec10/papers/clirtrack.pdf">Task Overview Paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Gey2001Arabic', 'Graff2001Arabic'] ar2002: desc: ' <p> Arabic benchmark from TREC 2002. 
</p> <ul> <li><a href="https://trec.nist.gov/pubs/trec11/papers/OVERVIEW.gey.ps.gz">Task Overview Paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Gey2002Arabic', 'Graff2001Arabic'] ================================================ FILE: ir_datasets/docs/trec-cast.yaml ================================================ _: pretty_name: 'TREC CAsT (Conversational Assistance)' desc: ' <p> The TREC Conversational Assistance Track (CAsT) is a benchmark for Conversational Information Seeking (CIS) models. </p> <ul> <li>Documents: Passages from Wikipedia (TREC CAR or KILT), MS MARCO, and/or the Washington Post (depending on year)</li> <li>Queries: raw utterances in sequence, manual/automatic re-writing of queries (depending on year)</li> <li>Relevance: Deep judgments</li> <li><a href="https://www.treccast.ai/">Track Website</a></li> </ul>' data_access: ' <p> To use version 0 of the corpus, you need a copy of the <a href="https://trec.nist.gov/data/wapost/">Washington Post Collection</a>, provided by NIST. </p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Agreement". Otherwise, your organization will need to file the "Organizational agreement" with NIST. It can take some time to process, but you will end up with a password-protected download link. </p> <p> For the v0 corpus, the source file required is <kbd>WashingtonPost.v2.tar.gz</kbd>. ir_datasets expects the above file to be copied/linked under <kbd>~/.ir_datasets/wapo/WashingtonPost.v2.tar.gz</kbd>. </p> <!--<p> For the v2 corpus, the source file required is <kbd>WashingtonPost.v4.tar.gz</kbd>. ir_datasets expects the above file to be copied/linked under <kbd>~/.ir_datasets/wapo/WashingtonPost.v4.tar.gz</kbd>. </p>--> ' v0: desc: ' <p> Version 0 of the TREC CAsT corpus. This version uses documents from the Washington Post (version 2), TREC CAR (version 2), and MS MARCO passage (version 1). 
</p> <p> This corpus was originally meant to be used for evaluation of the 2019 task, but the Washington Post corpus was not included for scoring in the final version due to "an error in the process led to ambiguous document ids," and Washington Post documents were removed from participating systems. As such, <a class="ds-ref">trec-cast/v1</a> (which doesn''t include the Washington Post) should be used for the 2019 version of the task. However, this version still can be used for the training set (<a class="ds-ref">trec-cast/v0/train</a>) or for replicating the original submissions to the track (prior to the removal of Washington Post documents). </p> <ul> <li><a href="https://arxiv.org/pdf/2003.13624.pdf">Task Overview Paper</a></li> </ul>' docs_instructions: &inst "WaPo docs available from NIST" bibtex_ids: ['Dalton2019Cast'] v0/train: desc: ' <p> Training set provided by TREC CAsT 2019. </p>' docs_instructions: *inst bibtex_ids: ['Dalton2019Cast'] v0/train/judged: desc: ' <p> <a class="ds-ref">trec-cast/2019/train</a>, but with queries that do not appear in the qrels removed. </p>' docs_instructions: *inst bibtex_ids: ['Dalton2019Cast'] v1: desc: ' <p> Version 1 of the TREC CAsT corpus. This version uses documents from the TREC CAR (version 2) and MS MARCO passage (version 1). This version of the corpus was used for TREC CAsT 2019 and 2020. </p> <ul> <li><a href="https://arxiv.org/pdf/2003.13624.pdf">Task Overview Paper</a></li> </ul>' bibtex_ids: ['Dalton2019Cast'] v1/2019: desc: ' <p> Official evaluation set for TREC CAsT 2019. </p>' bibtex_ids: ['Dalton2019Cast'] v1/2019/judged: desc: ' <p> <a class="ds-ref">trec-cast/v1/2019</a>, but with queries that do not appear in the qrels removed. </p>' bibtex_ids: ['Dalton2019Cast'] v1/2020: desc: ' <p> Official evaluation set for TREC CAsT 2020. 
</p> <ul> <li><a href="https://trec.nist.gov/pubs/trec29/papers/OVERVIEW.C.pdf">Task Overview Paper</a></li> </ul>' bibtex_ids: ['Dalton2020Cast'] v1/2020/judged: desc: ' <p> <a class="ds-ref">trec-cast/v1/2020</a>, but with queries that do not appear in the qrels removed. </p>' bibtex_ids: ['Dalton2020Cast'] v2: desc: ' <p> Version 2 of the TREC CAsT corpus. This version uses documents from the Washington Post (2017-20), KILT and MS Marco V1 (documents). This version of the corpus was used for TREC CAsT 2021. Segmented passages versions are also provided (using the 2021 script). </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec30/papers/Overview-CAsT.pdf">Task Overview Paper</a></li> </ul>' v3: desc: ' <p> Version 3 of the TREC CAsT corpus. This version uses segmented documents from the Washington Post (2017-20), KILT and MS Marco V2 (documents). This version of the corpus was used for TREC CAsT 2022. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec31/papers/Overview_cast.pdf">Task Overview Paper</a></li> </ul>' ================================================ FILE: ir_datasets/docs/trec-fair.yaml ================================================ _: pretty_name: "TREC Fair Ranking" desc: ' <p> The TREC Fair Ranking track evaluates systems according to how well they fairly rank documents. </p> <ul> <li><a href="https://fair-trec.github.io/">Website</a></li> </ul> ' 2021: desc: ' <p> The TREC Fair Ranking 2021 track focuses on fairly prioritising Wikimedia articles for editing to provide a fair exposure to articles from different groups. </p> <ul> <li><a href="https://fair-trec.github.io/2021/">2021 Track Website</a></li> </ul> ' 2021/train: desc: ' <p> Official TREC Fair Ranking 2021 train set. </p> ' 2021/eval: desc: ' <p> Official TREC Fair Ranking 2021 evaluation set. </p> ' 2022: desc: ' <p> The TREC Fair Ranking 2022 track focuses on fairly prioritising Wikimedia articles for editing to provide a fair exposure to articles from different groups. 
</p> <ul> <li><a href="https://fair-trec.github.io/">2022 Track Website</a></li> </ul> ' 2022/train: desc: ' <p> Official TREC Fair Ranking 2022 train set. </p> ' ================================================ FILE: ir_datasets/docs/trec-mandarin.yaml ================================================ _: pretty_name: 'TREC Mandarin' desc: ' <p> A collection of news articles in Mandarin in Simplified Chinese, used for multi-lingual evaluation in TREC 5 and TREC 6. </p> <p> Document collection from <a href="https://catalog.ldc.upenn.edu/LDC2000T52">LDC2000T52</a>. </p> ' docs_instructions: &inst "docs available from LDC" bibtex_ids: ['Rogers2000Mandarin'] data_access: ' <p> To use this dataset, you need a copy of the source corpus, provided by the Linguistic Data Consortium. The specific resource needed is <a href="https://catalog.ldc.upenn.edu/LDC2000T52">LDC2000T52</a>. </p> <p> Many organizations already have a subscription to the LDC, so access to the collection can be as easy as confirming the data usage agreement and downloading the corpus. Check with your library for access details. </p> <p> The source file is: <kbd>LDC2000T52.tgz</kbd>. </p> <p> ir_datasets expects this file to be copied/linked as <kbd>~/.ir_datasets/trec-mandarin/corpus.tgz</kbd>. </p> ' trec5: desc: ' <p> Mandarin Chinese benchmark from TREC 5. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec5/papers/multilingual_track.ps.gz">Task Overview Paper</a></li> </ul> ' docs_instructions: *inst bibtex_ids: ['Harman1997Chinese', 'Rogers2000Mandarin'] trec6: desc: ' <p> Mandarin Chinese benchmark from TREC 6. 
</p> <ul> <li><a href="https://trec.nist.gov/pubs/trec6/papers/csiro.chinese.ps.gz">Task Overview Paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Wilkinson1998Chinese', 'Rogers2000Mandarin'] ================================================ FILE: ir_datasets/docs/trec-robust04.yaml ================================================ _: pretty_name: 'TREC Robust 2004' desc: ' <p> The TREC Robust retrieval task focuses on "improving the consistency of retrieval technology by focusing on poorly performing topics." </p> <p> The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. See details <a href="https://trec.nist.gov/data/cd45/index.html">here</a>. </p> <ul> <li>Documents: News articles</li> <li>Queries: keyword queries, descriptions, narratives</li> <li>Relevance: Deep judgments</li> <li><a href="https://trec.nist.gov/pubs/trec13/papers/ROBUST.OVERVIEW.pdf">Task Overview Paper</a></li> <li>See also: <a class="ds-ref">aquaint/trec-robust-2005</a></li> </ul>' docs_instructions: &inst "docs available from NIST" bibtex_ids: ['Voorhees2004Robust'] data_access: ' <p> To use this dataset, you need a copy of <a href="https://trec.nist.gov/data/cd45/index.html">TREC disks 4 and 5</a>, provided by NIST. </p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Argeement". Otherwise, your organization will need to file the "Organizational agreement" with NIST. It can take some time to process, but you will end up with a password-protected download link. </p> <p> ir_datasets needs the following directories from the source: </p> <ul> <li><kbd>FBIS</kbd></li> <li><kbd>FR94</kbd></li> <li><kbd>FT</kbd></li> <li><kbd>LATIMES</kbd></li> </ul> <p> ir_datasets expects the above directories to be copied/linked under <kbd>~/.ir_datasets/trec-robust04/trec45</kbd>. 
The source document files themselves can either be compressed or uncompressed (it seems they have been distributed both ways in the past.) If ir_datasets does not find the files it is expecting, it will raise an error. </p> <p> </p> ' fold1: desc: ' <p>Robust04 Fold 1 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO'] fold2: desc: ' <p>Robust04 Fold 2 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO'] fold3: desc: ' <p>Robust04 Fold 3 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO'] fold4: desc: ' <p>Robust04 Fold 4 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO'] fold5: desc: ' <p>Robust04 Fold 5 (Title) proposed by Huston & Croft (2014) and used in numerous works</p>' docs_instructions: *inst bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO'] ================================================ FILE: ir_datasets/docs/trec-spanish.yaml ================================================ _: pretty_name: 'TREC Spanish' desc: ' <p> A collection of news articles in Spanish, used for multi-lingual evaluation in TREC 3 and TREC 4. </p> <p> Document collection from <a href="https://catalog.ldc.upenn.edu/LDC2000T51">LDC2000T51</a>. </p> ' docs_instructions: &inst "docs available from LDC" bibtex_ids: ['Rogers2000Spanish'] data_access: ' <p> To use this dataset, you need a copy of the source corpus, provided by the the Linguistic Data Consortium. The specific resource needed is <a href="https://catalog.ldc.upenn.edu/LDC2000T51">LDC2000T51</a>. 
</p> <p> Many organizations already have a subscription to the LDC, so access to the collection can be as easy as confirming the data usage agreement and downloading the corpus. Check with your library for access details. </p> <p> The source file is: <kbd>LDC2000T51.tgz</kbd>. </p> <p> ir_datasets expects this file to be copied/linked as <kbd>~/.ir_datasets/trec-spanish/corpus.tgz</kbd>. </p> ' trec3: desc: ' <p> Spanish benchmark from TREC 3. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec3/papers/overview.pdf">Task Overview Paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Harman1994Trec3', 'Rogers2000Spanish'] trec4: desc: ' <p> Spanish benchmark from TREC 4. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec4/overview.ps.gz">Task Overview Paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Harman1995Trec4', 'Rogers2000Spanish'] ================================================ FILE: ir_datasets/docs/trec-tot-2025.yaml ================================================ _: pretty_name: 'TREC Tip-of-the-Tongue' desc: ' <p> Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>. </p> ' 2025: desc: ' <p> Corpus for the TREC 2025 tip-of-the-tongue search track. </p> ' 2025/train: desc: ' <p> Train query set for TREC 2025 tip-of-the-tongue search track. </p> ' 2025/dev1: desc: ' <p> Dev-1 query set for TREC 2025 tip-of-the-tongue search track (the original 2023 dev set). </p> ' 2025/dev2: desc: ' <p> Dev-2 query set for TREC 2025 tip-of-the-tongue search track (the original 2023 test set). </p> ' 2025/dev3: desc: ' <p> Dev-3 query set for TREC 2025 tip-of-the-tongue search track (the original 2024 test set). 
</p> ' ================================================ FILE: ir_datasets/docs/trec-tot.yaml ================================================ _: pretty_name: 'TREC Tip-of-the-Tongue' desc: ' <p> Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>. </p> ' 2023: desc: ' <p> Corpus for the TREC 2023 tip-of-the-tongue search track. </p> ' 2023/train: desc: ' <p> Train query set for TREC 2023 tip-of-the-tongue search track. </p> ' 2023/dev: desc: ' <p> Dev query set for TREC 2023 tip-of-the-tongue search track. </p> ' 2024: desc: ' <p> Corpus for the TREC 2024 tip-of-the-tongue search track. </p> ' 2024/test: desc: ' <p> Test query set for TREC 2024 tip-of-the-tongue search track. </p> ' ================================================ FILE: ir_datasets/docs/tripclick.yaml ================================================ _: pretty_name: 'TripClick' desc: ' <p> TripClick is a large collection from the <a href="https://www.tripdatabase.com/">Trip Database</a>. Relevance is inferred from click signals. </p> <p> A copy of this dataset can be obtained from the Trip Database through the process described <a href="https://tripdatabase.github.io/tripclick/#getting-the-data">here</a>. Documents, queries, and qrels require the "TripClick IR Benchmark"; for scoreddocs and docpairs, you will also need to request the "TripClick Training Package for Deep Learning Models". 
</p> <ul> <li>Documents: <a href="https://www.nlm.nih.gov/medline/medline_overview.html">Medline</a> article titles and abstracts</li> <li>Queries: user queries issued to the <a href="https://www.tripdatabase.com/">Trip Database</a></li> <li>Qrels: Inferred from clicks</li> <li><a href="https://docs.google.com/document/d/1RHVxVnZsPBDDZMDcSvbB8VyNZDl2cn6KpeeSvIu6g_c/edit?usp=sharing">Dataset request form</a></li> <li><a href="https://tripdatabase.github.io/tripclick/">Dataset website</a></li> <li><a href="https://arxiv.org/abs/2103.07901">Dataset paper</a></li> </ul> ' docs_instructions: &docs_inst "docs available from the Trip Database" queries_instructions: &queries_inst "queries available from the Trip Database" qrels_instructions: &qrels_inst "qrels available from the Trip Database" scoreddocs_instructions: &scoreddocs_inst "scoreddocs available from the Trip Database" docpairs_instructions: &docpairs_inst "docpairs available from the Trip Database" bibtex_ids: ['Rekabsaz2021TripClick'] data_access: ' <p> To use this dataset, you need a copy of the source files, provided by the Trip Database. </p> <p> A copy of the source files can be requested through the procedure detailed <a href="https://tripdatabase.github.io/tripclick/#getting-the-data">here</a>. Documents, queries, and qrels require the "TripClick IR Benchmark"; for scoreddocs and docpairs, you will also need to request the "TripClick Training Package for Deep Learning Models". If you want the raw query logs, you will need to request the "Logs Dataset". </p> <p> The source files you will need are: <ul> <li><kbd>benchmark.tar.gz</kbd> (for docs, queries, and qrels)</li> <li><kbd>dlfiles.tar.gz</kbd> (for docpairs and scoreddocs)</li> <li><kbd>dlfiles_runs_test.tar.gz</kbd> (for scoreddocs on the test set)</li> <li><kbd>logs.tar.gz</kbd> (for raw qlogs)</li> </ul> <p> ir_datasets expects these files to be copied/linked in <kbd>~/.ir_datasets/tripclick/</kbd>. 
</p> ' logs: desc: ' <p> Raw query logs from TripClick. </p> <p> Note that this subset includes a broader set of documents than the main collection, but they only provide the title and URL. </p> ' docs_instructions: *docs_inst qlogs_instructions: &qlogs_inst "qlogs available from the Trip Database" bibtex_ids: ['Rekabsaz2021TripClick'] train: desc: ' <p> Training subset of <a class="ds-ref">tripclick</a>, including all queries from <a class="ds-ref">tripclick/train/head</a>, <a class="ds-ref">tripclick/train/torso</a>, and <a class="ds-ref">tripclick/train/tail</a>. </p> <p> The dataset provides docpairs in a full text format; we map this text back to the query and doc IDs. A small number of docpairs could not be mapped back, so they are skipped. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] train/hofstaetter-triples: desc: ' <p> A version of <a class="ds-ref">tripclick/train</a> that replaces the original (noisy) training triples (docpairs) with those sampled from BM25 instead, as suggested by Hofstätter et al (2022). </p> <ul> <li><a href="https://arxiv.org/pdf/2201.00365.pdf">Paper</a></li> </ul> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst bibtex_ids: ['Rekabsaz2021TripClick', 'Hofstaetter2022TripClick'] train/head: desc: ' <p> The most frequent queries in the train set. This represents 20% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] train/head/dctr: desc: ' <p> The same as <a class="ds-ref">tripclick/train/head</a>, but using qrels scaled by the Document Click Through Rate (DCTR). 
</p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] train/torso: desc: ' <p> The moderately frequent queries in the train set. This represents 30% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] train/tail: desc: ' <p> The least frequent queries in the train set. This represents 50% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] val: desc: ' <p> Validation subset of <a class="ds-ref">tripclick</a>, including all queries from <a class="ds-ref">tripclick/val/head</a>, <a class="ds-ref">tripclick/val/torso</a>, and <a class="ds-ref">tripclick/val/tail</a>. </p> <p> The scoreddocs are the official BM25 results from Anserini. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] val/head: desc: ' <p> The most frequent queries in the validation set. This represents 20% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] val/head/dctr: desc: ' <p> The same as <a class="ds-ref">tripclick/val/head</a>, but using qrels scaled by the Document Click Through Rate (DCTR). 
</p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] val/torso: desc: ' <p> The moderately frequent queries in the validation set. This represents 30% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] val/tail: desc: ' <p> The least frequent queries in the validation set. This represents 50% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] test: desc: ' <p> Test subset of <a class="ds-ref">tripclick</a>, including all queries from <a class="ds-ref">tripclick/test/head</a>, <a class="ds-ref">tripclick/test/torso</a>, and <a class="ds-ref">tripclick/test/tail</a>. </p> <p> The scoreddocs are the official BM25 results from Anserini. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] test/head: desc: ' <p> The most frequent queries in the test set. This represents 20% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] test/torso: desc: ' <p> The moderately frequent queries in the test set. This represents 30% of the search engine traffic. 
</p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] test/tail: desc: ' <p> The least frequent queries in the test set. This represents 50% of the search engine traffic. </p> ' docs_instructions: *docs_inst queries_instructions: *queries_inst qrels_instructions: *qrels_inst scoreddocs_instructions: *scoreddocs_inst docpairs_instructions: *docpairs_inst bibtex_ids: ['Rekabsaz2021TripClick'] ================================================ FILE: ir_datasets/docs/tweets2013-ia.yaml ================================================ _: pretty_name: 'Tweets 2013 (Internet Archive)' desc: ' <p> A collection of tweets from a 2-month window archived by the Internet Archive. This collection can be a stand-in document collection for the TREC Microblog 2013-14 tasks. (Even though it is not exactly the same collection, <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Sequiera_Lin_SIGIR2017.pdf">Sequiera and Lin</a> show that it is close enough.) </p> <p> This collection is automatically downloaded from the Internet Archive, though download speeds are often slow so it takes some time. ir_datasets constructs a new directory hierarchy during the download process to facilitate fast lookups and slices. </p> <ul> <li>Documents: Tweets</li> <li><a href="https://cs.uwaterloo.ca/~jimmylin/publications/Sequiera_Lin_SIGIR2017.pdf">Information about dataset (paper)</a></li> <li><a href="https://github.com/castorini/Tweets2013-IA">Information about dataset (repository)</a></li> </ul>' bibtex_ids: ['Sequiera2017TweetsIA'] trec-mb-2013: desc: ' <p> TREC Microblog 2013 test collection. 
</p> <ul> <li><a href="https://trec.nist.gov/pubs/trec22/papers/MB.OVERVIEW.pdf">Shared Task Paper</a></li> <li><a href="https://github.com/lintool/twitter-tools/wiki/TREC-2013-Track-Guidelines">Shared Task Site</a></li> </ul> ' bibtex_ids: ['Lin2013Microblog', 'Sequiera2017TweetsIA'] trec-mb-2014: desc: ' <p> TREC Microblog 2014 test collection. </p> <ul> <li><a href="https://trec.nist.gov/pubs/trec23/papers/overview-microblog.pdf">Shared Task Paper</a></li> <li><a href="https://github.com/lintool/twitter-tools/wiki/TREC-2014-Track-Guidelines">Shared Task Site</a></li> </ul> ' bibtex_ids: ['Lin2014Microblog', 'Sequiera2017TweetsIA'] ================================================ FILE: ir_datasets/docs/vaswani.yaml ================================================ _: pretty_name: 'Vaswani' desc: ' <p> A small corpus of roughly 11,000 scientific abstracts. </p> <ul> <li>Documents: Scientific abstracts</li> <li>Queries: Natural language keywords</li> <li><a href="http://ir.dcs.gla.ac.uk/resources/test_collections/npl/">Dataset Information</a></li> </ul> ' ================================================ FILE: ir_datasets/docs/wapo.yaml ================================================ _: pretty_name: 'Washington Post' desc: ' <p> The Washington Post collection. </p>' docs_instructions: &inst "docs available from NIST" data_access: ' <p> To use this dataset, you need a copy of <a href="https://trec.nist.gov/data/wapost/">Washington Post Collection</a>, provided by NIST. </p> <p> Your organization may already have a copy. If this is the case, you may only need to complete a new "Individual Agreement". Otherwise, your organization will need to file the "Organizational agreement" with NIST. It can take some time to process, but you will end up with a password-protected download link. </p> <p> The source file required is <kbd>WashingtonPost.v2.tar.gz</kbd>. 
<p> ir_datasets expects the above file to be copied/linked under <kbd>~/.ir_datasets/wapo/WashingtonPost.v2.tar.gz</kbd>. </p> <p> </p> ' v2: desc: ' <p> Version 2 of the Washington Post collection, consisting of articles published between 2012-2017. </p> <p> The collection is obtained from NIST by requesting it from NIST <a href="https://trec.nist.gov/data/wapost/">here</a>. </p> <p> body contains all body text in plain text format, including paragraphs and multi-media captions. body_paras_html contains only source paragraphs and contains HTML markup. body_media contains images, videos, tweets, and galleries, along with a link to the content and a textual caption. </p> <ul> <li><a href="https://trec.nist.gov/data/wapost/">Collection Website</a></li> </ul> ' docs_instructions: *inst v2/trec-core-2018: desc: ' <p> The TREC Common Core 2018 benchmark. </p> <ul> <li>Queries: TREC-style (keyword, description, narrative)</li> <li>Relevance: Deeply-annotated</li> <li><a href="https://github.com/trec-core/2018">Shared Task Website</a></li> </ul>' docs_instructions: *inst v2/trec-news-2018: desc: ' <p> The TREC News 2018 Background Linking task. The task is to find relevant background information for the provided articles. </p> <ul> <li>Queries: Articles via the doc_id field</li> <li><a href="http://trec-news.org/">Shared Task Website</a></li> <li><a href="https://trec.nist.gov/pubs/trec27/papers/Overview-News.pdf">Shared task paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Soboroff2018News'] v2/trec-news-2019: desc: ' <p> The TREC News 2019 Background Linking task. The task is to find relevant background information for the provided articles. 
</p> <ul> <li>Queries: Articles via the doc_id field</li> <li><a href="http://trec-news.org/">Shared Task Website</a></li> <li><a href="https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.N.pdf">Shared task paper</a></li> </ul>' docs_instructions: *inst bibtex_ids: ['Soboroff2019News'] v3/trec-news-2020: desc: ' <p> The TREC News 2020 Background Linking task. The task is to find relevant background information for the provided articles. </p> <p> If you have a copy of the v3 dataset, we would appreciate a pull request to add support! </p> <ul> <li>Queries: Articles via the doc_id field</li> <li><a href="http://trec-news.org/">Shared Task Website</a></li> </ul>' docs_instructions: *inst ================================================ FILE: ir_datasets/docs/wikiclir.yaml ================================================ _: pretty_name: "WikiCLIR" desc: ' <p> A Cross-Language IR (CLIR) collection between English queries and other language documents, built from Wikipedia. </p> <ul> <li><a href="https://www.cs.jhu.edu/~kevinduh/papers/sasaki18letor.pdf">Dataset Paper</a></li> <li><a href="https://www.cs.jhu.edu/~kevinduh/a/wikiclir2018/">Dataset Information</a></li> </ul> ' bibtex_ids: - 'sasaki-etal-2018-cross' ar: desc: ' <p> WikiCLIR with Arabic documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' ca: desc: ' <p> WikiCLIR with Catalan documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' zh: desc: ' <p> WikiCLIR with Chinese documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' cs: desc: ' <p> WikiCLIR with Czech documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' nl: desc: ' <p> WikiCLIR with Dutch documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' fi: desc: ' <p> WikiCLIR with Finnish documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' fr: desc: ' <p> WikiCLIR with French documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' de: desc: ' <p> WikiCLIR with German documents. 
</p> ' bibtex_ids: - 'sasaki-etal-2018-cross' it: desc: ' <p> WikiCLIR with Italian documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' ja: desc: ' <p> WikiCLIR with Japanese documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' ko: desc: ' <p> WikiCLIR with Korean documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' no: desc: ' <p> WikiCLIR with Norwegian (Bokmål) documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' nn: desc: ' <p> WikiCLIR with Norwegian (Nynorsk) documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' pl: desc: ' <p> WikiCLIR with Polish documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' pt: desc: ' <p> WikiCLIR with Portuguese documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' ro: desc: ' <p> WikiCLIR with Romanian documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' ru: desc: ' <p> WikiCLIR with Russian documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' en-simple: desc: ' <p> WikiCLIR with Simple English documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' es: desc: ' <p> WikiCLIR with Spanish documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' sw: desc: ' <p> WikiCLIR with Swahili documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' sv: desc: ' <p> WikiCLIR with Swedish documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' tl: desc: ' <p> WikiCLIR with Tagalog documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' tr: desc: ' <p> WikiCLIR with Turkish documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' uk: desc: ' <p> WikiCLIR with Ukrainian documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' vi: desc: ' <p> WikiCLIR with Vietnamese documents. </p> ' bibtex_ids: - 'sasaki-etal-2018-cross' ================================================ FILE: ir_datasets/docs/wikir.yaml ================================================ _: pretty_name: "WikIR" desc: ' <p> A suite of IR benchmarks in multiple languages built from Wikipedia. 
</p> <ul> <li><a href="https://www.aclweb.org/anthology/2020.lrec-1.237.pdf">WikIR Paper</a></li> <li><a href="http://ceur-ws.org/Vol-2621/CIRCLE20_22.pdf">MLWikIR Paper</a></li> <li><a href="https://github.com/getalp/wikIR">GitHub Repository</a></li> </ul> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en1k: desc: ' <p> A small version of WikIR for English. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en1k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/en1k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en1k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/en1k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en1k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/en1k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en59k: desc: ' <p> WikIR for English. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en59k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/en59k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en59k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/en59k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en59k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/en59k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en78k: desc: ' <p> WikIR for English. This is one of the two versions used in <a href="https://aclanthology.org/2020.lrec-1.237.pdf">Frej2020Wikir</a>. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en78k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/en78k</a>. Scoreddocs are the provided BM25 run. 
</p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en78k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/en78k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] en78k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/en78k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] ens78k: desc: ' <p> WikIR for English, using the first sentences of articles as queries. This is one of the two versions used in <a href="https://aclanthology.org/2020.lrec-1.237.pdf">Frej2020Wikir</a>. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] ens78k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/ens78k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] ens78k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/ens78k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] ens78k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/ens78k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] fr14k: desc: ' <p> WikIR for French. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] fr14k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/fr14k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] fr14k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/fr14k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] fr14k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/fr14k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] es13k: desc: ' <p> WikIR for Spanish. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] es13k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/es13k</a>. Scoreddocs are the provided BM25 run. 
</p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] es13k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/es13k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] es13k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/es13k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] it16k: desc: ' <p> WikIR for Italian. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] it16k/training: desc: ' <p> Training set of <a class="ds-ref">wikir/it16k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] it16k/validation: desc: ' <p> Validation set of <a class="ds-ref">wikir/it16k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] it16k/test: desc: ' <p> Test set of <a class="ds-ref">wikir/it16k</a>. Scoreddocs are the provided BM25 run. </p> ' bibtex_ids: ['Frej2020Wikir', 'Frej2020MlWikir'] ================================================ FILE: ir_datasets/etc/downloads.json ================================================ { "antique": { "docs": { "url": "https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt", "size_hint": 93608031, "expected_md5": "684f7015aff377062a758e478476aac8", "cache_path": "collection.tsv" }, "train/qrels": { "url": "https://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel", "size_hint": 625622, "expected_md5": "bac76531a3313a2d1debf5f1602d88ab", "cache_path": "train/qrels" }, "train/queries": { "url": "https://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt", "size_hint": 136512, "expected_md5": "7684bd977d2682177b559d8da714f45a", "cache_path": "train/queries.txt" }, "test/qrels": { "url": "https://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel", "size_hint": 149838, "expected_md5": "c93ab0f0ce7937c84270c1eef172db4e", "cache_path": "test/qrels" }, "test/queries": { "url": 
"https://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt", "size_hint": 11434, "expected_md5": "d09c5d9ad14368c23c853f6be81e7f2e", "cache_path": "test/queries.txt" }, "disallow_list": { "url": "https://ciir.cs.umass.edu/downloads/Antique/test-queries-blacklist.txt", "size_hint": 184, "expected_md5": "4ca64485dabf26221b90cf96ae2997f9", "cache_path": "test-disallow-list.txt" } }, "aol-ia": { "logs": { "url": "http://www.cim.mcgill.ca/~dudek/206/Logs/AOL-user-ct-collection/aol-data.tar.gz", "expected_md5": "31cd27ce12c3a3f2df62a38050ce4c0a", "size_hint": 460331537, "cache_path": "aol-data.tar.gz" }, "id2wb": { "url": "https://macavaney.us/aol.id2wb.tsv.gz", "expected_md5": "afbf9b03e1a0fabc9f3fdd5105e6ae5a", "size_hint": 40099187, "cache_path": "aol.id2wb.tsv.gz" } }, "aquaint": { "docs": { "instructions": "The AQUAINT corpus is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2002T31>.\nMore details about the procedure can be found here: <https://ir-datasets.com/aquaint.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "ac623257d8dd35326c9d500d5f6834e5", "cache_path": "aquaint_comp_LDC2002T31.tgz" }, "trec-robust-2005/queries": { "url": "https://trec.nist.gov/data/robust/05/05.50.topics.txt", "irds_mirror": true, "size_hint": 25116, "expected_md5": "c2e722e6bdfd00f088c6f6517db564ce", "cache_path": "trec-robust-2005/queries" }, "trec-robust-2005/qrels": { "url": "https://trec.nist.gov/data/robust/05/TREC2005.qrels.txt", "irds_mirror": true, "size_hint": 944950, "expected_md5": "9186021c74090464c50f577d4826e2e2", "cache_path": "trec-robust-2005/qrels" } }, "argsme": { "1.0": { "url": "https://zenodo.org/record/3274636/files/argsme.zip", "size_hint": 238078064, "expected_md5": "c2512648f46a403f8e5e1dc96779e357" }, "1.0-cleaned": { "url": "https://zenodo.org/record/4139439/files/argsme-1.0-cleaned.zip", "size_hint": 236787622, "expected_md5": "fb0837103a4860e1d4536174f55b12c3" }, "2020-04-01/debateorg": { 
"url": "https://zenodo.org/record/3734893/files/debateorg.zip", "size_hint": 1150846141, "expected_md5": "0368ee47ce0ec8bed837c7e22c024493" }, "2020-04-01/debatepedia": { "url": "https://zenodo.org/record/3734893/files/debatepedia.zip", "size_hint": 184347726, "expected_md5": "bde8e3ed832c19ca5ed8ed1506a862e8" }, "2020-04-01/debatewise": { "url": "https://zenodo.org/record/3734893/files/debatewise.zip", "size_hint": 77388912, "expected_md5": "5e5c498a5f657ed7d02e06016e9ce3b1" }, "2020-04-01/idebate": { "url": "https://zenodo.org/record/3734893/files/idebate.zip", "size_hint": 20241730, "expected_md5": "5b888c94cce740f1216c063e5e47c74c" }, "2020-04-01/parliamentary": { "url": "https://zenodo.org/record/3734893/files/parliamentary.zip", "size_hint": 27319, "expected_md5": "c80d932c953b64fb300f13d0d93096bb" }, "2020-04-01/processed": { "url": "https://zenodo.org/record/6873574/files/args_processed_04_01.tar.gz", "size_hint": 1547009833, "expected_md5": "43bfce957df69bf59b3d59744eb73ded" } }, "beir": { "msmarco": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/msmarco.zip", "size_hint": 1082258632, "expected_md5": "444067daf65d982533ea17ebd59501e4", "cache_path": "msmarco/source.zip" }, "trec-covid": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip", "size_hint": 73876720, "expected_md5": "ce62140cb23feb9becf6270d0d1fe6d1", "cache_path": "trec-covid/source.zip" }, "nfcorpus": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip", "size_hint": 2448432, "expected_md5": "a89dba18a62ef92f7d323ec890a0d38d", "cache_path": "nfcorpus/source.zip" }, "nq": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nq.zip", "size_hint": 498307926, "expected_md5": "d4d3d2e48787a744b6f6e691ff534307", "cache_path": "nq/source.zip" }, "hotpotqa": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/hotpotqa.zip", "size_hint": 
654025350, "expected_md5": "f412724f78b0d91183a0e86805e16114", "cache_path": "hotpotqa/source.zip" }, "fiqa": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip", "size_hint": 17948027, "expected_md5": "17918ed23cd04fb15047f73e6c3bd9d9", "cache_path": "fiqa/source.zip" }, "arguana": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/arguana.zip", "size_hint": 3773617, "expected_md5": "8ad3e3c2a5867cdced806d6503f29b99", "cache_path": "arguana/source.zip" }, "webis-touche2020": { "url": "https://macavaney.us/beir-webis-touche2020-v1.zip", "size_hint": 227137373, "expected_md5": "5ec7f8b18481fc2e9b3964ad1b22dd28", "cache_path": "webis-touche2020/source.zip" }, "webis-touche2020/v2": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/webis-touche2020.zip", "size_hint": 227132363, "expected_md5": "46f650ba5a527fc69e0a6521c5a23563", "cache_path": "webis-touche2020/v2/source.zip" }, "cqadupstack": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/cqadupstack.zip", "size_hint": 5343728040, "expected_md5": "4e41456d7df8ee7760a7f866133bda78", "cache_path": "cqadupstack/source.zip" }, "quora": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/quora.zip", "size_hint": 15853968, "expected_md5": "18fb154900ba42a600f84b839c173167", "cache_path": "quora/source.zip" }, "dbpedia-entity": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/dbpedia-entity.zip", "size_hint": 639285131, "expected_md5": "c2a39eb420a3164af735795df012ac2c", "cache_path": "dbpedia-entity/source.zip" }, "scidocs": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scidocs.zip", "size_hint": 142471588, "expected_md5": "38121350fc3a4d2f48850f6aff52e4a9", "cache_path": "scidocs/source.zip" }, "fever": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fever.zip", "size_hint": 1236988269, 
"expected_md5": "5a818580227bfb4b35bb6fa46d9b6c03", "cache_path": "fever/source.zip" }, "climate-fever": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/climate-fever.zip", "size_hint": 1228666652, "expected_md5": "8b66f0a9126c521bae2bde127b4dc99d", "cache_path": "climate-fever/source.zip" }, "scifact": { "url": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip", "size_hint": 2816079, "expected_md5": "5f7d1de60b170fc8027bb7898e2efca1", "cache_path": "scifact/source.zip" } }, "c4": { "en-noclean/sources": { "url": "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/ir-datasets/c4/en.noclean.sources.json.gz", "size_hint": 240518, "expected_md5": "3faf0f3aaf3f0e5bca573e118f815991", "cache_path": "en.noclean.sources.json.gz" }, "en-noclean/checkpoints": { "url": "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/ir-datasets/c4/en.noclean.checkpoints.tar.gz", "size_hint": 8983526491, "expected_md5": "eab00c3b5202564da998466198a01298" }, "trec-misinfo-2021/queries": { "instructions": "download file from Active Participants section of TREC website here: <https://trec.nist.gov/act_part/tracks/misinfo/misinfo-2021-topics.xml>.\nLink the file here: {path}", "size_hint": 50721, "expected_md5": "c8fd82d3ffea802a1029720b03de443b", "cache_path": "misinfo-2021-topics.xml" } }, "car": { "docs": { "url": "http://trec-car.cs.unh.edu/datareleases/v1.5/paragraphcorpus-v1.5.tar.xz", "size_hint": 5114258812, "expected_md5": "4d006dd67cbc11541ed7f87b875cb990", "cache_path": "docs.tar.xz" }, "docs/v2.0": { "url": "http://trec-car.cs.unh.edu/datareleases/v2.0/paragraphCorpus.v2.0.tar.xz", "size_hint": 5085726092, "expected_md5": "a404e9256d763ddcacc3da1e34de466a", "cache_path": "paragraphCorpus.v2.0.tar.xz" }, "trec-y1/queries": { "url": "http://trec-car.cs.unh.edu/datareleases/v1.5/benchmarkY1test.public-v1.5.tar.xz", "size_hint": 40508, "expected_md5": "6ab490517accd2a2cb4848c0f160bc8d", "cache_path": 
"trec-y1/qrels.tar.xz" }, "trec-y1/qrels": { "url": "http://trec-car.cs.unh.edu/datareleases/v1.5/trec-car-2017-qrels.tar.gz", "size_hint": 4334569, "expected_md5": "1ab7cf01c341757af1bb3db2aedd020f", "cache_path": "trec-y1/qrels.tar.gz" }, "test200": { "url": "http://trec-car.cs.unh.edu/datareleases/v1.5/test200-v1.5.tar.xz", "size_hint": 1307336, "expected_md5": "a7d8ea41f933b2ef49f06d782e908d13", "cache_path": "test200.tar.xz" }, "train": { "url": "http://trec-car.cs.unh.edu/datareleases/v1.5/train-v1.5.tar.xz", "size_hint": 2591721692, "expected_md5": "70eb3cf1d9358614f9d96dcd2565dc2b", "cache_path": "train.tar.xz" } }, "clinicaltrials": { "docs/2017": { "url": "https://bionlp.nlm.nih.gov/trec2017precisionmedicine/clinicaltrials_xml.tar.gz", "size_hint": 724731456, "expected_md5": "e5d333ceed0cbbbe513504c96148ab1a", "cache_path": "docs_2017.tar.gz" }, "docs/2019/0": { "url": "http://www.trec-cds.org/clinical_trials.0.tar.gz", "size_hint": 277495873, "expected_md5": "d57fbafa63520c45faceedec3de801b7", "cache_path": "docs_2019_0.tar.gz" }, "docs/2019/1": { "url": "http://www.trec-cds.org/clinical_trials.1.tar.gz", "size_hint": 266267182, "expected_md5": "d32a632fc72c68d63309732af667b1ee", "cache_path": "docs_2019_1.tar.gz" }, "docs/2019/2": { "url": "http://www.trec-cds.org/clinical_trials.2.tar.gz", "size_hint": 242899908, "expected_md5": "055980d685164cb1daefa2b4cc1e9a2f", "cache_path": "docs_2019_2.tar.gz" }, "docs/2019/3": { "url": "http://www.trec-cds.org/clinical_trials.3.tar.gz", "size_hint": 218698307, "expected_md5": "fa8e76c85ab7204c0294f03fa76f6ed2", "cache_path": "docs_2019_3.tar.gz" }, "docs/2021/1": { "url": "http://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part1.zip", "size_hint": 382792518, "expected_md5": "e12eb9a0d21452503b0ef8874c69f490", "cache_path": "docs_2021_1.zip" }, "docs/2021/2": { "url": "http://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part2.zip", "size_hint": 378478271, "expected_md5": 
"f6986125506434887a162f144ca4d9a2", "cache_path": "docs_2021_2.zip" }, "docs/2021/3": { "url": "http://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part3.zip", "size_hint": 375998752, "expected_md5": "9b7fb528b22edfcf4535154cc3d98111", "cache_path": "docs_2021_3.zip" }, "docs/2021/4": { "url": "http://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part4.zip", "size_hint": 360825058, "expected_md5": "4fd98d209e7b62cee87af211c0c281f6", "cache_path": "docs_2021_4.zip" }, "docs/2021/5": { "url": "http://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part5.zip", "size_hint": 296625845, "expected_md5": "a747f09ac5d4f3cd0cc75957ad9f32d8", "cache_path": "docs_2021_5.zip" }, "trec-pm-2017/qrels": { "url": "https://trec.nist.gov/data/precmed/qrels-final-trials.txt", "irds_mirror": true, "size_hint": 243723, "expected_md5": "3c35f9e62abf64c873250ac8022d5a51", "cache_path": "trec-pm-2017/qrels" }, "trec-pm-2018/qrels": { "url": "https://trec.nist.gov/data/precmed/qrels-treceval-clinical_trials-2018-v2.txt", "irds_mirror": true, "size_hint": 267669, "expected_md5": "a6c9efbecb5f32a19c5ac37f1f98c951", "cache_path": "trec-pm-2018/qrels" }, "trec-pm-2019/qrels": { "url": "https://trec.nist.gov/data/precmed/qrels-treceval-trials.38.txt", "irds_mirror": true, "size_hint": 243548, "expected_md5": "fc4b0cf6007b2dc2a7e81536add65a8c", "cache_path": "trec-pm-2019/qrels" }, "trec-pm-2019/queries": { "url": "http://www.trec-cds.org/topics2019.xml", "size_hint": 6302, "expected_md5": "bc26cef87e842837daf2e6680d4652c0", "cache_path": "trec-pm-2019/queries.xml" }, "trec-ct-2021/queries": { "url": "http://www.trec-cds.org/topics2021.xml", "size_hint": 64618, "expected_md5": "6d842b40387d760274447c1f8d7396a8", "cache_path": "trec-ct-2021/queries.xml" }, "trec-ct-2021/qrels": { "url": "https://trec.nist.gov/data/trials/qrels2021.txt", "irds_mirror": true, "size_hint": 676496, "expected_md5": "0335d95c58d5f5fd9bc730bccb60ca90", "cache_path": "trec-ct-2021/qrels" }, 
"trec-ct-2022/queries": { "url": "https://www.trec-cds.org/topics2022.xml", "size_hint": 32423, "expected_md5": "73bcb3985a17fd60d786f5d8f5a0bb2e", "cache_path": "trec-ct-2022/queries.xml" } }, "clirmatrix": { "downloads": { "url": "http://www.cs.jhu.edu/~shuosun/clirmatrix/data/downloads.json.gz", "size_hint": 5143717, "expected_md5": "371cc532aca236759bd3602eb6ce2181", "cache_path": "clirmatrix_downloads.json.gz" }, "metadata": { "url": "https://macavaney.us/clirmatrix-metadata.json.lz4", "size_hint": 6517585, "expected_md5": "537510770a139b25dd12684c6711c91a", "cache_path": "clirmatrix-metadata.json.lz4" } }, "clueweb09": { "docs": { "instructions": "ClueWeb09 is available by hard drives from CMU here: <https://lemurproject.org/clueweb09/>\nMore details about the procedure can be found here: <https://ir-datasets.com/ClueWeb09.html#DataAccess>.\nLink the ClueWeb09 source files here: {path}\nShould contain directories like ClueWeb09_English_1", "cache_path": "corpus" }, "docs.chk": { "url": "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/ir-datasets/clueweb09/clueweb09-source-chk.tar.gz", "size_hint": 3582668561, "expected_md5": "74328d9c743c52ddef434ce41a4e6dc1" }, "trec-web-2009/queries": { "url": "https://trec.nist.gov/data/web/09/wt09.topics.full.xml", "irds_mirror": true, "size_hint": 35853, "expected_md5": "52e4a03d32718fa11290286e8e8dff47", "cache_path": "trec-web-2009/queries.xml" }, "trec-web-2009/qrels.adhoc": { "url": "https://trec.nist.gov/data/web/09/prels.1-50.gz", "irds_mirror": true, "size_hint": 171396, "expected_md5": "3afdef86adf3211629182e3380f9e751", "cache_path": "trec-web-2009/prels.gz" }, "trec-web-2009/qrels.all": { "url": "https://trec.nist.gov/data/web/09/qrels.diversity.gz", "irds_mirror": true, "size_hint": 166538, "expected_md5": "0a3fb04bfdaa1551d8960d862e925c9e", "cache_path": "trec-web-2009/qrels.all.gz" }, "trec-web-2010/queries": { "url": "https://trec.nist.gov/data/web/10/wt2010-topics.xml", "irds_mirror": true, 
"size_hint": 32661, "expected_md5": "8f084cc90c13e4cd66192d3a9585235e", "cache_path": "trec-web-2010/queries.xml" }, "trec-web-2010/qrels.adhoc": { "url": "https://trec.nist.gov/data/web/10/10.adhoc-qrels.final", "irds_mirror": true, "size_hint": 837288, "expected_md5": "8a22083b0370d6ac799e1e779110de06", "cache_path": "trec-web-2010/qrels" }, "trec-web-2010/qrels.all": { "url": "https://trec.nist.gov/data/web/10/10.diversity-qrels.final", "irds_mirror": true, "size_hint": 297198, "expected_md5": "0a78c8bf7a809039a1fc9013a4bfe4eb", "cache_path": "trec-web-2010/qrels.all" }, "trec-web-2011/queries": { "url": "https://trec.nist.gov/data/web/11/full-topics.xml", "irds_mirror": true, "size_hint": 29693, "expected_md5": "23914875d80a5d24571d4f458a83c7fa", "cache_path": "trec-web-2011/queries.xml" }, "trec-web-2011/qrels.adhoc": { "url": "https://trec.nist.gov/data/web/11/qrels.adhoc", "irds_mirror": true, "size_hint": 659973, "expected_md5": "7844c8a9cc3a4b6f740d45e56013693d", "cache_path": "trec-web-2011/qrels" }, "trec-web-2011/qrels.all": { "url": "https://trec.nist.gov/data/web/11/qrels.diversity", "irds_mirror": true, "size_hint": 2208947, "expected_md5": "b88c1d42afbbbc5a4a776dd3f0b905c2", "cache_path": "trec-web-2011/qrels.all" }, "trec-web-2012/queries": { "url": "https://trec.nist.gov/data/web/12/full-topics.xml", "irds_mirror": true, "size_hint": 29353, "expected_md5": "a0b8ee33da312a284fda379582b0bc2a", "cache_path": "trec-web-2012/queries.xml" }, "trec-web-2012/qrels.adhoc": { "url": "https://trec.nist.gov/data/web/12/qrels.adhoc", "irds_mirror": true, "size_hint": 610948, "expected_md5": "079723ba3e955269f0de6254c4bec180", "cache_path": "trec-web-2012/qrels" }, "trec-web-2012/qrels.all": { "url": "https://trec.nist.gov/data/web/12/qrels.diversity", "irds_mirror": true, "size_hint": 2124769, "expected_md5": "bbfde42fc4bc502b19aec5dcc6922faa", "cache_path": "trec-web-2012/qrels.all" }, "trec-mq-2009/queries": { "url": 
"https://trec.nist.gov/data/million.query/09/09.mq.topics.20001-60000.gz", "irds_mirror": true, "size_hint": 437150, "expected_md5": "6347147d4d6c847f0423709140a7b10d", "cache_path": "trec-mq-2009/queries.txt.gz" }, "trec-mq-2009/qrels": { "url": "https://trec.nist.gov/data/million.query/09/prels.20001-60000.gz", "irds_mirror": true, "size_hint": 323113, "expected_md5": "e67f45d3060e20667596f37e34d696c8", "cache_path": "trec-mq-2009/prels.gz" } }, "clueweb12": { "docs": { "instructions": "ClueWeb12 is available by hard drives from CMU here: <https://lemurproject.org/clueweb12/>\nMore details about the procedure can be found here: <https://ir-datasets.com/ClueWeb12.html#DataAccess>.\nLink the ClueWeb12 source files here: {path}\nShould contain ClueWeb12_00--19 directories", "cache_path": "corpus" }, "docs.chk": { "url": "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/ir-datasets/clueweb12/clueweb12-source-chk.tar.gz", "size_hint": 3883120643, "expected_md5": "fb92d1f8ed1436839313d2eb47f628a5" }, "cw12b-info": { "url": "http://lemurproject.org/clueweb12/ClueWeb12-CreateB13.tgz", "size_hint": 1310407043, "expected_md5": "8175ce74a97e46be80c2127d965da200", "cache_path": "ClueWeb12-CreateB13.tgz" }, "trec-web-2013/queries": { "url": "https://trec.nist.gov/data/web/2013/trec2013-topics.xml", "irds_mirror": true, "size_hint": 23143, "expected_md5": "4c0ecdddc8632d3fa8fecb507f19801d", "cache_path": "trec-web-2013/queries.xml" }, "trec-web-2013/qrels.adhoc": { "url": "https://trec.nist.gov/data/web/2013/qrels.adhoc.txt", "irds_mirror": true, "size_hint": 492350, "expected_md5": "44aa6300f9df4a77f7205c574afb9c2d", "cache_path": "trec-web-2013/qrels.adhoc" }, "trec-web-2013/qrels.all": { "url": "https://trec.nist.gov/data/web/2013/qrels.all.txt", "irds_mirror": true, "size_hint": 1598265, "expected_md5": "741e76258543ad47ae75030363be13a9", "cache_path": "trec-web-2013/qrels.all" }, "trec-web-2014/queries": { "url": 
"https://trec.nist.gov/data/web/2014/trec2014-topics.xml", "irds_mirror": true, "size_hint": 22873, "expected_md5": "b1bf5c7aa9f6e7026e1558686330744f", "cache_path": "trec-web-2014/queries.xml" }, "trec-web-2014/qrels.adhoc": { "url": "https://trec.nist.gov/data/web/2014/qrels.adhoc.txt", "irds_mirror": true, "size_hint": 491247, "expected_md5": "afa1db71680acf71283adc7846282a44", "cache_path": "trec-web-2014/qrels.adhoc" }, "trec-web-2014/qrels.all": { "url": "https://trec.nist.gov/data/web/2014/qrels.all.txt", "irds_mirror": true, "size_hint": 1492061, "expected_md5": "085256d18544cd3e34b9fa9cc29ae513", "cache_path": "trec-web-2014/qrels.all" }, "ntcir-www-1/queries": { "url": "http://www.thuir.cn/ntcirwww/files/eng.queries.xml.zip", "size_hint": 1611, "expected_md5": "ed43ba82791bb20776c049421525a055" }, "ntcir-www-1/qrels": { "url": "https://macavaney.us/misc/ntcir-www-1.qrels", "size_hint": 865810, "expected_md5": "634464456437bf378725958822910242", "cache_path": "ntcir-www-1/qrels" }, "ntcir-www-2/queries": { "url": "http://www.thuir.cn/ntcirwww2/qEng.zip", "size_hint": 3466, "expected_md5": "d4b108b52b2e2c8bedc2e12540414735" }, "ntcir-www-2/qrels": { "url": "http://www.thuir.cn/ntcirwww3/www2e.qrels", "size_hint": 939318, "expected_md5": "155b515dd9fc05e1aeb9c116c9147bb0", "cache_path": "ntcir-www-2/qrels" }, "ntcir-www-3/queries": { "url": "http://www.thuir.cn/ntcirwww3/www2www3topics-E.xml", "size_hint": 28823, "expected_md5": "1ecd1380b20894014a54eb6cb8064587", "cache_path": "ntcir-www-3/queries.xml" }, "trec-misinfo-2019/queries": { "url": "https://trec.nist.gov/data/misinfo/2019topics.xml", "irds_mirror": true, "size_hint": 30028, "expected_md5": "e46bb8ff3058bbcc1bd73a0ecbda1621", "cache_path": "trec-misinfo-2019/queries.xml" }, "trec-misinfo-2019/qrels": { "url": "https://trec.nist.gov/data/misinfo/2019qrels_raw.txt", "irds_mirror": true, "size_hint": 878952, "expected_md5": "faf86b2ac5fcca52b189a3ad408fd019", "cache_path": "trec-misinfo-2019/qrels" 
}, "clef-ehealth/queries": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2016Task3/master/eng_queries/queries2016_with_url.xml", "size_hint": 47427, "expected_md5": "ed0289056e21643baa07dfc7bee9574b", "cache_path": "clef-ehealth/queries.xml" }, "clef-ehealth/queries/cs": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/queries/multilingual/queries2016cs.xml", "size_hint": 33079, "expected_md5": "f7940a8961fae713742b935c83b6118d", "cache_path": "clef-ehealth/queries.cs.xml" }, "clef-ehealth/queries/de": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/queries/multilingual/queries2016de.xml", "size_hint": 31575, "expected_md5": "f59e000328a8cbe357ff94d5d5e82474", "cache_path": "clef-ehealth/queries.de.xml" }, "clef-ehealth/queries/fr": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/queries/multilingual/queries2016fr.xml", "size_hint": 35744, "expected_md5": "7362ccb451a606af7a47e359894f524c", "cache_path": "clef-ehealth/queries.fr.xml" }, "clef-ehealth/queries/hu": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/queries/multilingual/queries2016hu.xml", "size_hint": 31670, "expected_md5": "17603aecae72eeb70ada73700956ab3f", "cache_path": "clef-ehealth/queries.hu.xml" }, "clef-ehealth/queries/pl": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/queries/multilingual/queries2016pl.xml", "size_hint": 32741, "expected_md5": "e6d442846786793c235bf927289af4c3", "cache_path": "clef-ehealth/queries.pl.xml" }, "clef-ehealth/queries/sv": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/queries/multilingual/queries2016sv.xml", "size_hint": 30669, "expected_md5": "bc9066f128a391c9f5c282dbcca3e44f", "cache_path": "clef-ehealth/queries.sv.xml" }, "clef-ehealth/2016.qrels": { "url": 
"https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2016Task3/master/qrels/task1.qrels", "size_hint": 5550000, "expected_md5": "5392a6f7cdbb0cab56c34656ab100684", "cache_path": "clef-ehealth/2016.qrels" }, "clef-ehealth/2016.qtrust": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2016Task3/master/qrels/task1.qtrust", "size_hint": 5669328, "expected_md5": "14cf266961f686b49d8430b802064ac6", "cache_path": "clef-ehealth/2016.qtrust" }, "clef-ehealth/2016.qunder": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2016Task3/master/qrels/task1.qunder", "size_hint": 5695332, "expected_md5": "4314d1a0db76a50204e5e900ffa2d4e3", "cache_path": "clef-ehealth/2016.qunder" }, "clef-ehealth/2017.qrels": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/assessments/2017/clef2017_qrels.txt", "size_hint": 4411584, "expected_md5": "b9909f2fa7f2a0ceca1033fc92729482", "cache_path": "clef-ehealth/2017.qrels" }, "clef-ehealth/2017.qtrust": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/assessments/2017/clef2017_qtrust.txt", "size_hint": 4528752, "expected_md5": "3a43cc9a49a781b13f0b8438bceed2f5", "cache_path": "clef-ehealth/2017.qtrust" }, "clef-ehealth/2017.qreads": { "url": "https://raw.githubusercontent.com/CLEFeHealth/CLEFeHealth2017IRtask/master/assessments/2017/clef2017_qreads.txt", "size_hint": 4531566, "expected_md5": "75302c012adf0126ab93e330ffb6aaab", "cache_path": "clef-ehealth/2017.qreads" } }, "codec": { "documents": { "instructions": "The CODEC document collection is available on request (see here: <https://github.com/grill-lab/CODEC>). 
Once downloaded, move/link the comets_documents.jsonl file to {path}.", "cache_path": "v1/comets_documents.jsonl" }, "topics": { "url": "https://raw.githubusercontent.com/grill-lab/CODEC/main/topics/topics.json", "size_hint": 47192, "expected_md5": "f75e4733693588449f68f7fdceb02ec9", "cache_path": "v1/topics.json" }, "qrels": { "url": "https://raw.githubusercontent.com/grill-lab/CODEC/main/raw_judgments/raw_document_judgments.txt", "size_hint": 306976, "expected_md5": "7200606d6dc573abe2dd93160d5a5ab5", "cache_path": "v1/document.qrels" } }, "codesearchnet": { "python": { "url": "https://huggingface.co/datasets/macavaney/codesearchnet-mirror/resolve/main/v2/python.zip", "size_hint": 940909997, "expected_md5": "07b49dd01fbac894fbdae22da6462e4f" }, "java": { "url": "https://huggingface.co/datasets/macavaney/codesearchnet-mirror/resolve/main/v2/java.zip", "size_hint": 1060569153, "expected_md5": "fea180077275d8f98f42a3386f492837" }, "go": { "url": "https://huggingface.co/datasets/macavaney/codesearchnet-mirror/resolve/main/v2/go.zip", "size_hint": 487525935, "expected_md5": "c0288db91f067c95bb952577949e7b13" }, "php": { "url": "https://huggingface.co/datasets/macavaney/codesearchnet-mirror/resolve/main/v2/php.zip", "size_hint": 851894048, "expected_md5": "62373f85cfae2f5d7422dc1e55fbbb50" }, "ruby": { "url": "https://huggingface.co/datasets/macavaney/codesearchnet-mirror/resolve/main/v2/ruby.zip", "size_hint": 111758028, "expected_md5": "6847c0149666334cc937909b0e2297ae" }, "javascript": { "url": "https://huggingface.co/datasets/macavaney/codesearchnet-mirror/resolve/main/v2/javascript.zip", "size_hint": 1664713350, "expected_md5": "7649178a02f7b5c8fbccb64abe7946b6" }, "challenge/queries": { "url": "https://raw.githubusercontent.com/github/CodeSearchNet/master/resources/queries.csv", "size_hint": 2493, "expected_md5": "6041a0c32dff4286859ca76d420d76f4", "cache_path": "queries.csv" }, "challenge/qrels": { "url": 
"https://raw.githubusercontent.com/github/CodeSearchNet/master/resources/annotationStore.csv", "size_hint": 677798, "expected_md5": "9e0a57ae90b3dd0144d59064d0751abd", "cache_path": "qrels.csv" } }, "cord19": { "docs/2020-07-16": { "url": "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz", "size_hint": 3662861028, "expected_md5": "018c4bc4d76d4ae072a26ac28c8b456b", "cache_path": "2020-07-16.tar.gz" }, "docs/2020-07-16/metadata": { "url": "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv", "size_hint": 269219095, "expected_md5": "80d664e496b8b7e50a39c6f6bb92e0ef", "cache_path": "2020-07-16/metadata.csv" }, "docs/2020-04-10/metadata": { "url": "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv", "size_hint": 77323567, "expected_md5": "42a21f386be86c24647a41bedde34046", "cache_path": "2020-04-10/metadata.csv" }, "docs/2020-05-01/metadata": { "url": "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/metadata.csv", "size_hint": 89290114, "expected_md5": "b1d2e409026494e0c8034278bacd1248", "cache_path": "2020-05-01/metadata.csv" }, "docs/2020-05-19/metadata": { "url": "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-19/metadata.csv", "size_hint": 189687667, "expected_md5": "e3c5c8af3a078e19cb179e630c345959", "cache_path": "2020-05-19/metadata.csv" }, "docs/2020-06-19/metadata": { "url": "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-06-19/metadata.csv", "size_hint": 228730850, "expected_md5": "4e8788b6e44f3428ff9ab1d4bfdfb6ab", "cache_path": "2020-06-19/metadata.csv" }, "trec-covid/queries": { "url": "https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml", "irds_mirror": true, "size_hint": 18707, "expected_md5": "0307a37b6b9f1a5f233340a769d538ea", "cache_path": "trec-covid/queries.xml" }, "trec-covid/qrels": { "url": 
"https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt", "irds_mirror": true, "size_hint": 1142244, "expected_md5": "8138424a59daea0aba751c8a891e5f54", "cache_path": "trec-covid/qrels" }, "trec-covid/round1/queries": { "url": "https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml", "irds_mirror": true, "size_hint": 10348, "expected_md5": "cf1b605222f45f7dbc90ca8e4d9b2c31", "cache_path": "trec-covid/round1/queries.xml" }, "trec-covid/round1/qrels": { "url": "https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt", "irds_mirror": true, "size_hint": 150110, "expected_md5": "d58586df5823e7d1d0b3619a73b31518", "cache_path": "trec-covid/round1/qrels" }, "trec-covid/round2/queries": { "url": "https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml", "irds_mirror": true, "size_hint": 12291, "expected_md5": "550129e71c83de3fb4d6d29a172c5842", "cache_path": "trec-covid/round2/queries.xml" }, "trec-covid/round2/qrels": { "url": "https://ir.nist.gov/covidSubmit/data/qrels-rnd2.txt", "irds_mirror": true, "size_hint": 212662, "expected_md5": "157df01d5a084b09be089407f41cf51b", "cache_path": "trec-covid/round2/qrels" }, "trec-covid/round3/queries": { "url": "https://ir.nist.gov/covidSubmit/data/topics-rnd3.xml", "irds_mirror": true, "size_hint": 14271, "expected_md5": "aa42a15c107e74488c8189a16a311358", "cache_path": "trec-covid/round3/queries.xml" }, "trec-covid/round3/qrels": { "url": "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j2.5-3.txt", "irds_mirror": true, "size_hint": 223360, "expected_md5": "2a534a42b5b6b43dd8ae7d9433249006", "cache_path": "trec-covid/round3/qrels" }, "trec-covid/round4/queries": { "url": "https://ir.nist.gov/covidSubmit/data/topics-rnd4.xml", "irds_mirror": true, "size_hint": 16327, "expected_md5": "202ba3155b1e390115ae13f34d80d4fc", "cache_path": "trec-covid/round4/queries.xml" }, "trec-covid/round4/qrels": { "url": "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j3.5-4.txt", "irds_mirror": true, "size_hint": 232379, "expected_md5": 
"b86dd338d6b0a41e62f18e566e541b96", "cache_path": "trec-covid/round4/qrels" }, "trec-covid/round5/qrels": { "url": "https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j4.5-5.txt", "irds_mirror": true, "size_hint": 402401, "expected_md5": "6111c00ac9adac774f5b51e4f9a2a25b", "cache_path": "trec-covid/round5/qrels" } }, "cranfield": { "main": { "url": "http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz", "size_hint": 506960, "expected_md5": "1730f7be572d95a5a4b56c59a7b900a5", "cache_path": "cran.tar.gz" } }, "csl": { "docs": { "url": "https://huggingface.co/datasets/neuclir/csl/resolve/main/data/csl.jsonl.gz?download=true", "size_hint": 115749077, "expected_md5": "4198f7b442187320e2351b3b473c1883", "cache_path": "docs.jsonl.gz" }, "trec-2023/queries": { "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-technical_topics.0719.jsonl", "size_hint": 86519, "expected_md5": "0dd5ba173c695362a8705056edca481b", "cache_path": "trec-2023/topics.jsonl", "irds_mirror": true }, "trec-2023/qrels": { "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-qrels.final.tar.gz", "size_hint": 6023886, "expected_md5": "cea4ff3d9eba612c7119e6490217d4e1", "cache_path": "trec-2023/qrels.tar.gz", "irds_mirror": true } }, "disks45": { "docs": { "instructions": "The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. 
See details here: <https://trec.nist.gov/data/cd45/index.html>.\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-robust04.html#DataAccess>.\nOnce completed, place the uncompressed source here: {path}\nThis should contain directories like NEWS_data/FBIS, NEWS_data/FR94, etc.", "cache_path": "corpus" }, "robust04-queries": { "url": "https://trec.nist.gov/data/robust/04.testset.gz", "irds_mirror": true, "size_hint": 34293, "expected_md5": "5eac3d774a2f87da61c08a94f945beff", "cache_path": "04.testset.gz" }, "robust04-qrels": { "url": "https://trec.nist.gov/data/robust/qrels.robust2004.txt", "irds_mirror": true, "size_hint": 6543541, "expected_md5": "123c2a0ba2ec31178cb1050995dcfdfa", "cache_path": "qrels.robust2004.txt" }, "trec8-qrels": { "url": "https://trec.nist.gov/data/qrels_eng/qrels.trec8.adhoc.parts1-5.tar.gz", "irds_mirror": true, "size_hint": 325935, "expected_md5": "ce1cfa80b29746d2a5eeddab268d4f6a", "cache_path": "qrels.trec8.adhoc.parts1-5.tar.gz" }, "trec8-queries": { "url": "https://trec.nist.gov/data/topics_eng/topics.401-450.gz", "irds_mirror": true, "size_hint": 6946, "expected_md5": "daaafb700eed76f61a6e9e4b0dcc40c8", "cache_path": "topics.401-450.gz" }, "trec7-qrels": { "url": "https://trec.nist.gov/data/qrels_eng/qrels.trec7.adhoc.parts1-5.tar.gz", "irds_mirror": true, "size_hint": 307120, "expected_md5": "43def30d4f4b33a830ae67e3dce19023", "cache_path": "qrels.trec7.adhoc.parts1-5.tar.gz" }, "trec7-queries": { "url": "https://trec.nist.gov/data/topics_eng/topics.351-400.gz", "irds_mirror": true, "size_hint": 7400, "expected_md5": "fdee3f7e37e173fd6fcdc00fbe1fc671", "cache_path": "topics.351-400.gz" } }, "dpr-w100": { "docs": { "url": "https://dl.fbaipublicfiles.com/dpr/wikipedia_split/psgs_w100.tsv.gz", "size_hint": 4694541059, "expected_md5": "612fe66e0b6b41ee28f806140226c563", "cache_path": "docs.tsv.gz" }, "nq-train": { "url": "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz", 
"size_hint": 2314892908, "expected_md5": "a1c927b5adae71388eb064329387709f", "cache_path": "nq-train.json.gz" }, "nq-dev": { "url": "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz", "size_hint": 256239282, "expected_md5": "2640483dbe0df7ae29c6da419c551a80", "cache_path": "nq-dev.json.gz" }, "tqa-train": { "url": "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-trivia-train.json.gz", "size_hint": 1848559940, "expected_md5": "5aa4d3577c91425cd20e239ed89a252b", "cache_path": "tqa-train.json.gz" }, "tqa-dev": { "url": "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-trivia-dev.json.gz", "size_hint": 207271749, "expected_md5": "d559dffe09acfe5a6370adea55b8abf2", "cache_path": "tqa-dev.json.gz" } }, "trec-fair": { "2021/docs": { "url": "https://data.boisestate.edu/library/Ekstrand-2021/TRECFairRanking2021/trec_corpus.json.gz", "size_hint": 15575740862, "expected_md5": "4c1e81d120566a493d5fa90b6114bd49", "cache_path": "2021/corpus.json.gz" }, "2021/train/topics": { "url": "https://data.boisestate.edu/library/Ekstrand-2021/TRECFairRanking2021/trec_topics.json.gz", "size_hint": 7271598, "expected_md5": "bdb72f896833d0c87421b6415d895846", "cache_path": "2021/train/topics.json.gz" }, "2021/eval/topics": { "url": "https://drive.google.com/uc?export=download&id=1jGyjB7qOt45jakb32ZtroSkxs5sq5gvU", "size_hint": 6055, "expected_md5": "2e153903c375596914ee9ffdbcefd6a5", "cache_path": "2021/eval/topics.json.gz" }, "2021/eval/qrels": { "url": "https://trec.nist.gov/data/fair/2021-eval-topics-with-qrels.json.gz", "size_hint": 120050, "expected_md5": "50068634036c00adb54e8be9314bf37c", "cache_path": "2021/eval/topics-with-qrels.json.gz", "irds_mirror": true }, "2021/metadata": { "url": "https://data.boisestate.edu/library/Ekstrand-2021/TRECFairRanking2021/trec_metadata.json.gz", "size_hint": 56827296, "expected_md5": "ae251e9ae0c9fb3a58c3b12e216dcea7", "cache_path": "2021/metadata.json.gz" }, "2022/docs": { "url": 
"https://data.boisestate.edu/library/Ekstrand/TRECFairRanking/corpus/trec_corpus_20220301_plain.json.gz", "size_hint": 7677063809, "expected_md5": "54661197940765ed5129f0bb0d459a99", "cache_path": "2022/trec_corpus_20220301_plain.json.gz" }, "2022/metadata": { "url": "https://data.boisestate.edu/library/Ekstrand/TRECFairRanking/2022/trec_2022_articles_discrete.json.gz", "size_hint": 236812182, "expected_md5": "af48525886bae53205f4b64435ae81f2", "cache_path": "2022/trec_2022_articles_discrete.json.gz" }, "2022/train/topics": { "url": "https://data.boisestate.edu/library/Ekstrand/TRECFairRanking/2022/trec_2022_train_reldocs.jsonl", "size_hint": 18018410, "expected_md5": "d132b4cc8c6c75525479728321db5176", "cache_path": "2022/trec_2022_train_reldocs.jsonl" } }, "gov": { "docs": { "instructions": "GOV is available by hard drive from UoG here: <http://ir.dcs.gla.ac.uk/test_collections/access_to_data.html>\nMore details about the procedure can be found here: <https://ir-datasets.com/gov.html#DataAccess>.\nLink the GOV source files here: {path}\nShould contain G00, G01, G02, ...", "cache_path": "corpus" }, "trec-web-2002/queries": { "url": "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "irds_mirror": true, "size_hint": 6528, "expected_md5": "133e5d1628684f7a044df86ad08907f0", "cache_path": "trec-web-2002/queries.txt.gz" }, "trec-web-2002/qrels": { "url": "https://trec.nist.gov/data/qrels_eng/qrels.distillation.txt.gz", "irds_mirror": true, "size_hint": 402641, "expected_md5": "313d1cab9a37aa9b76b6c647cf7151a8", "cache_path": "trec-web-2002/qrels.txt.gz" }, "trec-web-2002/named-page/queries": { "url": "https://trec.nist.gov/data/topics_eng/webnamed_page_topics.1-150.txt.gz", "irds_mirror": true, "size_hint": 3168, "expected_md5": "00422f1c1f5109d7f609708de071e527", "cache_path": "trec-web-2002/named-page/queries.txt.gz" }, "trec-web-2002/named-page/qrels": { "url": "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz", "irds_mirror": true, 
"size_hint": 1649, "expected_md5": "ed7e69528faddd1baece4cae41c6f613", "cache_path": "trec-web-2002/named-page/qrels.txt.gz" }, "trec-web-2003/queries": { "url": "https://trec.nist.gov/data/topics_eng/2003.distillation_topics.1-50.txt", "irds_mirror": true, "size_hint": 8221, "expected_md5": "409e5d16eb8c795945715850c7d26a8e", "cache_path": "trec-web-2003/queries.txt" }, "trec-web-2003/qrels": { "url": "https://trec.nist.gov/data/qrels_eng/qrels.distillation.2003.txt", "irds_mirror": true, "size_hint": 1113881, "expected_md5": "ce11fa22c6f7f5d8048bdc0d104986e5", "cache_path": "trec-web-2003/qrels.txt" }, "trec-web-2003/named-page/queries": { "url": "https://trec.nist.gov/data/topics_eng/2003.named_page_topics.151-450.txt", "irds_mirror": true, "size_hint": 26337, "expected_md5": "0b9bbe2bce309c5bf5754536abaaa0b6", "cache_path": "trec-web-2003/named-page/queries.txt" }, "trec-web-2003/named-page/qrels": { "url": "https://trec.nist.gov/data/qrels_eng/qrels.named-page.2003.txt", "irds_mirror": true, "size_hint": 8096, "expected_md5": "e7b05e05fab39862d5f8ad6ebc0c36fd", "cache_path": "trec-web-2003/named-page/qrels.txt" }, "trec-web-2004/queries": { "url": "https://trec.nist.gov/data/web/Web2004.query.stream.trecformat.txt", "irds_mirror": true, "size_hint": 15657, "expected_md5": "10821f7a000b8bec058097ede39570be", "cache_path": "trec-web-2004/queries.txt" }, "trec-web-2004/qrels": { "url": "https://trec.nist.gov/data/web/04.qrels.web.mixed.txt", "irds_mirror": true, "size_hint": 1996931, "expected_md5": "93daa0e4b4190c84e30d2cce78a0f674", "cache_path": "trec-web-2004/qrels.txt" }, "trec-web-2004/types": { "url": "https://trec.nist.gov/data/web/04.topic-map.official.txt", "irds_mirror": true, "size_hint": 2592, "expected_md5": "79737768b3be1aa07b14691aa54802c5", "cache_path": "trec-web-2004/types.txt" } }, "gov2": { "docs": { "instructions": "GOV2 is available by hard drive from UoG here: <http://ir.dcs.gla.ac.uk/test_collections/access_to_data.html>\nMore details 
about the procedure can be found here: <https://ir-datasets.com/gov2.html#DataAccess>.\nLink the GOV2 source files here: {path}\nShould contain GOV2_data", "cache_path": "corpus" }, "trec-tb-2004/queries": { "url": "https://trec.nist.gov/data/terabyte/04/04topics.701-750.txt", "irds_mirror": true, "size_hint": 21236, "expected_md5": "18b390335e440d099f3d64bef81708be", "cache_path": "trec-tb-2004/queries.txt" }, "trec-tb-2004/qrels": { "url": "https://trec.nist.gov/data/terabyte/04/04.qrels.12-Nov-04", "irds_mirror": true, "size_hint": 1475219, "expected_md5": "228e4b0c466b1778a01b3337f8774fb6", "cache_path": "trec-tb-2004/qrels.txt" }, "trec-tb-2005/queries": { "url": "https://trec.nist.gov/data/terabyte/05/05.topics.751-800.txt", "irds_mirror": true, "size_hint": 24822, "expected_md5": "f0fb2603c7d89425965e5aaa104ddca6", "cache_path": "trec-tb-2005/queries.txt" }, "trec-tb-2005/qrels": { "url": "https://trec.nist.gov/data/terabyte/05/05.adhoc_qrels", "irds_mirror": true, "size_hint": 1150486, "expected_md5": "87f2af26215f092c948249771c8607f6", "cache_path": "trec-tb-2005/qrels.txt" }, "trec-tb-2005/named-page/queries": { "url": "https://trec.nist.gov/data/terabyte/05/05.np_topics.601-872.final.txt", "irds_mirror": true, "size_hint": 20987, "expected_md5": "266444c58e3567250f56df5c6a79670d", "cache_path": "trec-tb-2005/named-page/queries.txt" }, "trec-tb-2005/named-page/qrels": { "url": "https://trec.nist.gov/data/terabyte/05/05.np_qrels", "irds_mirror": true, "size_hint": 297947, "expected_md5": "0b0f73650d1297a7e5572576a4b93d28", "cache_path": "trec-tb-2005/named-page/qrels.txt" }, "trec-tb-2005/efficiency/queries": { "url": "https://trec.nist.gov/data/terabyte/05/05.efficiency_topics.gz", "irds_mirror": true, "size_hint": 554590, "expected_md5": "034a21c9dd956f3b7fb4f162782c9909", "cache_path": "trec-tb-2005/efficiency/queries.txt.gz" }, "trec-tb-2006/queries": { "url": "https://trec.nist.gov/data/terabyte/06/06.topics.801-850.txt", "irds_mirror": true, 
"size_hint": 27791, "expected_md5": "6e23a748c060ef5be64dbcc65245072f", "cache_path": "trec-tb-2006/queries.txt" }, "trec-tb-2006/qrels": { "url": "https://trec.nist.gov/data/terabyte/06/qrels.tb06.top50", "irds_mirror": true, "size_hint": 812484, "expected_md5": "1b1dfd769ff00d9e8ec4530c64221543", "cache_path": "trec-tb-2006/qrels.txt" }, "trec-tb-2006/named-page/queries": { "url": "https://trec.nist.gov/data/terabyte/06/06.np_topics.901-1081.txt", "irds_mirror": true, "size_hint": 13224, "expected_md5": "811a53107b4445a9955e7376d90a1eec", "cache_path": "trec-tb-2006/named-page/queries.txt" }, "trec-tb-2006/named-page/qrels": { "url": "https://trec.nist.gov/data/terabyte/06/qrels.tb06.np", "irds_mirror": true, "size_hint": 60528, "expected_md5": "f9f7d07de3070eafc08989dd98d1fab8", "cache_path": "trec-tb-2006/named-page/qrels.txt" }, "trec-tb-2006/efficiency/queries": { "url": "https://trec.nist.gov/data/terabyte/06/06.efficiency_topics.tar.gz", "irds_mirror": true, "size_hint": 3015007, "expected_md5": "e8599a08af5b3f036c203957f5b82de8", "cache_path": "trec-tb-2006/efficiency/queries.tar.gz" }, "trec-mq-2007/queries": { "url": "https://trec.nist.gov/data/million.query/07/07-million-query-topics.1-10000.gz", "irds_mirror": true, "size_hint": 143691, "expected_md5": "db64470f08450c6b15a9ee2c7eac2f9b", "cache_path": "trec-mq-2007/queries.txt.gz" }, "trec-mq-2007/qrels": { "url": "https://trec.nist.gov/data/million.query/07/07.prels", "irds_mirror": true, "size_hint": 2749725, "expected_md5": "4f930d85442ac74cc00b56a4252f1f4a", "cache_path": "trec-mq-2007/prels" }, "trec-mq-2008/queries": { "url": "https://trec.nist.gov/data/million.query/08/08.million-query-topics.10001-20000.gz", "irds_mirror": true, "size_hint": 162791, "expected_md5": "fc8fc0e92ae9bc1d16756534ac682058", "cache_path": "trec-mq-2008/queries.txt.gz" }, "trec-mq-2008/qrels": { "url": "https://trec.nist.gov/data/million.query/08/2008.RC1.tgz", "irds_mirror": true, "size_hint": 459172, "expected_md5": 
"dae403e1834e87cba1babbef71b73714", "cache_path": "trec-mq-2008/prels.tar.gz" } }, "hc4": { "train/topics": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/train.topics.v1-0.jsonl", "size_hint": 104898, "expected_md5": "cf3a43c4085e28ce03f37704771e0e36", "cache_path": "train.topics.v1-0.jsonl" }, "dev/topics": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/dev.topics.v1-0.jsonl", "size_hint": 81444, "expected_md5": "4c34c546c3e90de4733c3c9411ce9c6d", "cache_path": "dev.topics.v1-0.jsonl" }, "test/topics": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/test.topics.v1-0.jsonl", "size_hint": 964434, "expected_md5": "a311237913a7335d45fe261e9ff7f11e", "cache_path": "test.topics.v1-0.jsonl" }, "fa/docs": { "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/hltcoe/HC4>.\nTo proceed, symlink the source file here: {path}", "cache_path": "fas/hc4_docs.jsonl" }, "fa/docs/ids": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/fas/ids.jsonl.gz", "size_hint": 21890148, "expected_md5": "553e510633c30ce783c22ed37471ed3a", "cache_path": "fas/ids.jsonl.gz" }, "fa/train/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/fas/train.qrels.v1-0.txt", "size_hint": 5152, "expected_md5": "4a343957837ce996a7275a71a30ed806", "cache_path": "fas/train.qrels.v1-0.txt" }, "fa/dev/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/fas/dev.qrels.v1-0.txt", "size_hint": 24574, "expected_md5": "d66b62e3733a66b151f895e2f57ee144", "cache_path": "fas/dev.qrels.v1-0.txt" }, "fa/test/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/fas/test.qrels.v1-0.txt", "size_hint": 113490, "expected_md5": "842c307e3e01a688897fadefc7a9672f", "cache_path": "fas/test.qrels.v1-0.txt" }, "zh/docs": {
"instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/hltcoe/HC4>.\nTo proceed, symlink the source file here: {path}", "cache_path": "zho/hc4_docs.jsonl" }, "zh/docs/ids": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/zho/ids.jsonl.gz", "size_hint": 29051136, "expected_md5": "d4ee35f9ca55c0416fe439d4f41a9e2a", "cache_path": "zho/ids.jsonl.gz" }, "zh/train/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/zho/train.qrels.v1-0.txt", "size_hint": 15686, "expected_md5": "67d956dcc5b373ae8b6ce5f360b03987", "cache_path": "zho/train.qrels.v1-0.txt" }, "zh/dev/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/zho/dev.qrels.v1-0.txt", "size_hint": 20126, "expected_md5": "f252fb5edeee1fa38ccc8ee1c2a6e6f0", "cache_path": "zho/dev.qrels.v1-0.txt" }, "zh/test/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/zho/test.qrels.v1-0.txt", "size_hint": 123795, "expected_md5": "1bc5cfcefc49805884142b8b32f1a6ea", "cache_path": "zho/test.qrels.v1-0.txt" }, "ru/docs": { "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/hltcoe/HC4>.\nTo proceed, symlink the source file here: {path}", "cache_path": "rus/hc4_docs.jsonl" }, "ru/docs/ids/0": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.0.jsonl.gz", "size_hint": 26794412, "expected_md5": "4763df966f6ea953c731ef2d572044e5", "cache_path": "rus/ids.0.jsonl.gz" }, "ru/docs/ids/1": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.1.jsonl.gz", "size_hint": 26784412, "expected_md5": "c19fb0dd1aceb0f6fd02f92818fa55b7", "cache_path": "rus/ids.1.jsonl.gz" }, "ru/docs/ids/2": { "url": 
"https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.2.jsonl.gz", "size_hint": 26771449, "expected_md5": "41d6db2ae68b8a4a1e2b371e4f5fe7a8", "cache_path": "rus/ids.2.jsonl.gz" }, "ru/docs/ids/3": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.3.jsonl.gz", "size_hint": 26765684, "expected_md5": "e3d20167c9fdce77e633b3ea0421cb51", "cache_path": "rus/ids.3.jsonl.gz" }, "ru/docs/ids/4": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.4.jsonl.gz", "size_hint": 26790863, "expected_md5": "54db61aec1a4585ce172c39111725be7", "cache_path": "rus/ids.4.jsonl.gz" }, "ru/docs/ids/5": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.5.jsonl.gz", "size_hint": 26802522, "expected_md5": "ba8a7bace2df0be82f80f7ae84f736d5", "cache_path": "rus/ids.5.jsonl.gz" }, "ru/docs/ids/6": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.6.jsonl.gz", "size_hint": 26793985, "expected_md5": "9ada14526d375e2c7aaf95be80f8a043", "cache_path": "rus/ids.6.jsonl.gz" }, "ru/docs/ids/7": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.7.jsonl.gz", "size_hint": 23266384, "expected_md5": "8555423b846aaf097527017cf8eda94c", "cache_path": "rus/ids.7.jsonl.gz" }, "ru/train/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/train.qrels.v1-0.txt", "size_hint": 4232, "expected_md5": "2e2c52e404a6ee0fe676cb88d63c26bb", "cache_path": "rus/train.qrels.v1-0.txt" }, "ru/dev/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/dev.qrels.v1-0.txt", "size_hint": 11532, "expected_md5": "24305749fd39fb0392be171e76ef2510", "cache_path": "rus/dev.qrels.v1-0.txt" }, "ru/test/qrels": { "url": "https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/test.qrels.v1-0.txt", "size_hint": 133650, "expected_md5": "1d1112350a11289496426ea558c321a2", "cache_path": 
"rus/test.qrels.v1-0.txt" } }, "highwire": { "ajepidem": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajepidem.zip", "size_hint": 25454608, "expected_md5": "d7db27233b28245724f3212a5b3cd659", "cache_path": "corpus/ajepidem.zip" }, "ajpcell": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajpcell.zip", "size_hint": 64758393, "expected_md5": "5d5cdc8e6cdaeeb924bc5af603e4478e", "cache_path": "corpus/ajpcell.zip" }, "ajpendometa": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajpendometa.zip", "size_hint": 50030877, "expected_md5": "2ce8f78e813e60e424136f010f9e0af0", "cache_path": "corpus/ajpendometa.zip" }, "ajpgastro": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajpgastro.zip", "size_hint": 50853597, "expected_md5": "0fe9e5091ac4f3adcc2325ed6d881964", "cache_path": "corpus/ajpgastro.zip" }, "ajpheart": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajpheart.zip", "size_hint": 104053949, "expected_md5": "4152b0904cc66e95f953a5c097d7ad77", "cache_path": "corpus/ajpheart.zip" }, "ajplung": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajplung.zip", "size_hint": 50558558, "expected_md5": "ca5a42357e4d8413ac32ffc21cc7a25f", "cache_path": "corpus/ajplung.zip" }, "ajprenal": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ajprenal.zip", "size_hint": 41011735, "expected_md5": "6c21485c1858eda002766ec364af576a", "cache_path": "corpus/ajprenal.zip" }, "alcohol": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/alcohol.zip", "size_hint": 10152433, "expected_md5": "ca34997e36b919e64b99e12606c511cc", "cache_path": "corpus/alcohol.zip" }, "andrology": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/andrology.zip", "size_hint": 7454866, "expected_md5": "d715933ff1c96689e9647bbecdaaf599", "cache_path": "corpus/andrology.zip" }, "annonc": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/annonc.zip", "size_hint": 16794376, 
"expected_md5": "32234a3d00ea71b923006dd1f90c2dd5", "cache_path": "corpus/annonc.zip" }, "bjanast": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/bjanast.zip", "size_hint": 21847467, "expected_md5": "5a9c47c19d6bd33b8e30c27be9e41d25", "cache_path": "corpus/bjanast.zip" }, "bjp": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/bjp.zip", "size_hint": 17371510, "expected_md5": "93707a10817b15773e67bf1a2d3b2e24", "cache_path": "corpus/bjp.zip" }, "blood": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/blood.zip", "size_hint": 219220876, "expected_md5": "2216234fc6d66445026cc38c9c714855", "cache_path": "corpus/blood.zip" }, "carcinogenesis": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/carcinogenesis.zip", "size_hint": 38205239, "expected_md5": "7f4d797e2b779c9490eb9e523ebf628f", "cache_path": "corpus/carcinogenesis.zip" }, "cercor": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/cercor.zip", "size_hint": 23218757, "expected_md5": "60cfd9eebf42e1602cd69805de409e04", "cache_path": "corpus/cercor.zip" }, "development": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/development.zip", "size_hint": 64783714, "expected_md5": "f71ca283a55848a51a53c73a5af7db3c", "cache_path": "corpus/development.zip" }, "diabetes": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/diabetes.zip", "size_hint": 39020405, "expected_md5": "e6fcc28d779833b5b07fc8f2d2b4077e", "cache_path": "corpus/diabetes.zip" }, "endocrinology": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/endocrinology.zip", "size_hint": 109222542, "expected_md5": "85cf5f2870ccf333114e59ce1ab4d1db", "cache_path": "corpus/endocrinology.zip" }, "euroheartj": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/euroheartj.zip", "size_hint": 15462924, "expected_md5": "099dd344c7a98e3b6d3dbed2cdb17fb7", "cache_path": "corpus/euroheartj.zip" }, "glycobiology": { "url": 
"https://dmice.ohsu.edu/trec-gen/data/2006/documents/glycobiology.zip", "size_hint": 15210749, "expected_md5": "4e390a903cbf2381c2eb530a3e7b3b68", "cache_path": "corpus/glycobiology.zip" }, "humanrep": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/humanrep.zip", "size_hint": 52745668, "expected_md5": "b30e1d36c8c551f33c0a8cf4cc09dfce", "cache_path": "corpus/humanrep.zip" }, "humolgen": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/humolgen.zip", "size_hint": 61054993, "expected_md5": "81ae00345d24ec20a542e9c151181c93", "cache_path": "corpus/humolgen.zip" }, "ijepidem": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/ijepidem.zip", "size_hint": 14035365, "expected_md5": "d9e1ea6aaabcc04fb57f26a756a49b40", "cache_path": "corpus/ijepidem.zip" }, "intimm": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/intimm.zip", "size_hint": 23787976, "expected_md5": "17ebda5809cd22a23ac92310a103fb9c", "cache_path": "corpus/intimm.zip" }, "jantichemo": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jantichemo.zip", "size_hint": 30906187, "expected_md5": "e8b6088e655078ff40647af9a3cbd30f", "cache_path": "corpus/jantichemo.zip" }, "jappliedphysio": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jappliedphysio.zip", "size_hint": 109740124, "expected_md5": "7a5406c65b0385a5cc493fede1424b27", "cache_path": "corpus/jappliedphysio.zip" }, "jbc-1995": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-1995.zip", "size_hint": 77817763, "expected_md5": "690f25229a7040627d32be56bdcf4556", "cache_path": "corpus/jbc-1995.zip" }, "jbc-1996": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-1996.zip", "size_hint": 34630282, "expected_md5": "646a254c22f40861966fccacea0b1b5b", "cache_path": "corpus/jbc-1996.zip" }, "jbc-1997": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-1997.zip", "size_hint": 62855350, "expected_md5": "5dfb2a85548e3da5305bd5d7e051c2fa", 
"cache_path": "corpus/jbc-1997.zip" }, "jbc-1998": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-1998.zip", "size_hint": 61361036, "expected_md5": "43cc8a9797700d99d17cd14596a08028", "cache_path": "corpus/jbc-1998.zip" }, "jbc-1999": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-1999.zip", "size_hint": 51850260, "expected_md5": "d5972b628653f5719a7ca3d04d238c10", "cache_path": "corpus/jbc-1999.zip" }, "jbc-2000": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-2000.zip", "size_hint": 116888541, "expected_md5": "d00f5eb17beb680752fa1711c6fe2cb8", "cache_path": "corpus/jbc-2000.zip" }, "jbc-2001": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-2001.zip", "size_hint": 72164976, "expected_md5": "a8e8e08a530754a5c2de9197c1d4259c", "cache_path": "corpus/jbc-2001.zip" }, "jbc-2002": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-2002.zip", "size_hint": 125104356, "expected_md5": "93e8ae6565afc20affcf785fb6a34646", "cache_path": "corpus/jbc-2002.zip" }, "jbc-2003": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-2003.zip", "size_hint": 80012227, "expected_md5": "7ddf90fede68fc0a94a674557e457063", "cache_path": "corpus/jbc-2003.zip" }, "jbc-2004": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-2004.zip", "size_hint": 138441109, "expected_md5": "d50a6264161f1aed2545737ace31ad32", "cache_path": "corpus/jbc-2004.zip" }, "jbc-2005": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jbc-2005.zip", "size_hint": 113908404, "expected_md5": "4bac0f72d3845b6266a5bdf448d215b4", "cache_path": "corpus/jbc-2005.zip" }, "jcb": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jcb.zip", "size_hint": 98000404, "expected_md5": "dba03aa0b7c8e1b72817afc880f5ff6c", "cache_path": "corpus/jcb.zip" }, "jclinicalendometa": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jclinicalendometa.zip", "size_hint": 7262318, "expected_md5": 
"fdc5a8b41be0b58beabc822dc28660e2", "cache_path": "corpus/jclinicalendometa.zip" }, "jcs": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jcs.zip", "size_hint": 57046291, "expected_md5": "cd506b2d653c41916450f7f46a78d402", "cache_path": "corpus/jcs.zip" }, "jexpbio": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jexpbio.zip", "size_hint": 43249960, "expected_md5": "2072dad39b3fcdc58cb5a522c35f1b59", "cache_path": "corpus/jexpbio.zip" }, "jexpmed": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jexpmed.zip", "size_hint": 73328992, "expected_md5": "6a6b6fd5d883c99be91d84e8663b6293", "cache_path": "corpus/jexpmed.zip" }, "jgenphysio": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jgenphysio.zip", "size_hint": 25699101, "expected_md5": "4cb4bbb2c979e992e2668c0a6585d9b4", "cache_path": "corpus/jgenphysio.zip" }, "jgenviro": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jgenviro.zip", "size_hint": 42041939, "expected_md5": "ba0fa54e2b8e7101c2c5008cb89f4c47", "cache_path": "corpus/jgenviro.zip" }, "jhistocyto": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jhistocyto.zip", "size_hint": 25544265, "expected_md5": "def37c9b86e4ddbffd6a845c642ef009", "cache_path": "corpus/jhistocyto.zip" }, "jnci": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jnci.zip", "size_hint": 36174794, "expected_md5": "649621e050ab0f0e73eb626002cf2a91", "cache_path": "corpus/jnci.zip" }, "jneuro": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/jneuro.zip", "size_hint": 71785901, "expected_md5": "cfd69effb5d2e0eb845173095e21888c", "cache_path": "corpus/jneuro.zip" }, "mcp": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/mcp.zip", "size_hint": 9910430, "expected_md5": "198e85d2c65d3c3964195e955660409d", "cache_path": "corpus/mcp.zip" }, "microbio": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/microbio.zip", "size_hint": 48411405, "expected_md5": 
"06840bddca76f1c7d8644ca989189368", "cache_path": "corpus/microbio.zip" }, "molbiolevol": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/molbiolevol.zip", "size_hint": 26619856, "expected_md5": "8b1eb9eceaea7da237f177d235a058a5", "cache_path": "corpus/molbiolevol.zip" }, "molendo": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/molendo.zip", "size_hint": 38090065, "expected_md5": "a2f55efa0e3bffe5d736316585540b62", "cache_path": "corpus/molendo.zip" }, "molhumanrep": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/molhumanrep.zip", "size_hint": 14601617, "expected_md5": "125a49b8f11ad1a78eda3ce03f867ef4", "cache_path": "corpus/molhumanrep.zip" }, "nar": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/nar.zip", "size_hint": 132453207, "expected_md5": "63c34a05440cd0259b06aef3edb22ea3", "cache_path": "corpus/nar.zip" }, "nephrodiatransp": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/nephrodiatransp.zip", "size_hint": 39541854, "expected_md5": "914e51d8ac94e85d2bbdc8afb9206731", "cache_path": "corpus/nephrodiatransp.zip" }, "peds": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/peds.zip", "size_hint": 15511622, "expected_md5": "db8266c186e902715beb511f12bcd16a", "cache_path": "corpus/peds.zip" }, "physiogenomics": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/physiogenomics.zip", "size_hint": 13767348, "expected_md5": "eeda5cc0b14ad36402cdc6ffd964e3c1", "cache_path": "corpus/physiogenomics.zip" }, "rheumatolgy": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/rheumatolgy.zip", "size_hint": 22015632, "expected_md5": "3e840dfb598dbefe630cdb1b412f3c57", "cache_path": "corpus/rheumatolgy.zip" }, "rna": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/rna.zip", "size_hint": 11853002, "expected_md5": "f48dd7fe09ae06609d6a8dc5e77aae2f", "cache_path": "corpus/rna.zip" }, "toxsci": { "url": 
"https://dmice.ohsu.edu/trec-gen/data/2006/documents/toxsci.zip", "size_hint": 34838564, "expected_md5": "0093349b4f41e5431b1d832d13738010", "cache_path": "corpus/toxsci.zip" }, "legalspans": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/documents/legalspans.txt", "size_hint": 236522265, "expected_md5": "24c7bbed8eb3bdd2daf3f3ab2c1963b2", "cache_path": "legalspans.txt" }, "trec-genomics-2006/queries": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/topics/2006topics.txt", "size_hint": 2056, "expected_md5": "fd458f5398350e59831745e51854b2b0", "cache_path": "trec-genomics-2006/queries.txt" }, "trec-genomics-2006/qrels": { "url": "https://dmice.ohsu.edu/trec-gen/data/2006/trec2006.raw.relevance.tsv.txt", "size_hint": 1323494, "expected_md5": "a133e38bcd03c8b6509f46506cae753b", "cache_path": "trec-genomics-2006/qrels" }, "trec-genomics-2007/queries": { "url": "https://dmice.ohsu.edu/trec-gen/data/2007/2007topics.txt", "size_hint": 2576, "expected_md5": "be5fc2d4e984003da6aa9dfab9eb67a3", "cache_path": "trec-genomics-2007/queries.txt" }, "trec-genomics-2007/qrels": { "url": "https://dmice.ohsu.edu/trec-gen/data/2007/trecgen2007.all.judgments.tsv.txt", "size_hint": 1288196, "expected_md5": "5be6b6eea10d8ec0dac25bbe21af38a0", "cache_path": "trec-genomics-2007/qrels" } }, "istella22": { "source": { "url": "https://www.istella.ai/dataset/istella22.tar.gz", "size_hint": 26499490813, "expected_md5": "c2e49dca9730fbb14164ed890756dc1d", "cache_path": "istella22.tar.gz" } }, "kilt": { "knowledgesource": { "url": "http://dl.fbaipublicfiles.com/KILT/kilt_knowledgesource.json", "size_hint": 37318876722, "expected_md5": "d1dca62aa6ba889d2e842182e3114af5", "cache_path": "kilt_knowledgesource.json" }, "codec/qrels": { "url": "https://raw.githubusercontent.com/grill-lab/CODEC/main/raw_judgments/raw_entity_judgments.txt", "size_hint": 282367, "expected_md5": "51781fd0de5f7ca6b537222e4001e8ba", "cache_path": "codec/v1/raw_entity_judgments.txt" } }, "lotte": { "source": { "url": 
"https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/lotte.tar.gz", "size_hint": 3576167599, "expected_md5": "3b2e88b1d66933627462950b4c3f5d0f", "cache_path": "lotte.tar.gz" } }, "medline": { "2004/a": { "url": "https://dmice.ohsu.edu/trec-gen/data/2004/XML/2004_TREC_XML_MEDLINE_A.gz", "size_hint": 579470012, "expected_md5": "7858e5b908c25b88e30965b770e9780f", "cache_path": "2004/2004_TREC_XML_MEDLINE_A.gz" }, "2004/b": { "url": "https://dmice.ohsu.edu/trec-gen/data/2004/XML/2004_TREC_XML_MEDLINE_B.gz", "size_hint": 623531586, "expected_md5": "d4f8b510716d71612dc84129b7bc86a8", "cache_path": "2004/2004_TREC_XML_MEDLINE_B.gz" }, "2004/c": { "url": "https://dmice.ohsu.edu/trec-gen/data/2004/XML/2004_TREC_XML_MEDLINE_C.gz", "size_hint": 593454144, "expected_md5": "155c77b6f75b549a810863dae03058d1", "cache_path": "2004/2004_TREC_XML_MEDLINE_C.gz" }, "2004/d": { "url": "https://dmice.ohsu.edu/trec-gen/data/2004/XML/2004_TREC_XML_MEDLINE_D.gz", "size_hint": 599821183, "expected_md5": "cb1570f0212f8b737c757b6177788f36", "cache_path": "2004/2004_TREC_XML_MEDLINE_D.gz" }, "trec-genomics-2004/queries": { "url": "https://dmice.ohsu.edu/trec-gen/data/2004/rest.zip", "size_hint": 227203, "expected_md5": "3f252c59774fe8e74337637d73f8afc6", "cache_path": "trec-genomics-2004/rest.zip" }, "trec-genomics-2004/qrels": { "url": "https://dmice.ohsu.edu/trec-gen/data/2004/04.qrels.txt", "size_hint": 128056, "expected_md5": "1cb017045d7909102476bcb17fb19878", "cache_path": "trec-genomics-2004/qrels" }, "trec-genomics-2005/queries": { "url": "https://dmice.ohsu.edu/trec-gen/data/2005/adhoc2005narrative.txt", "size_hint": 5551, "expected_md5": "71e8044cb65458731f4496fdc2aad94a", "cache_path": "trec-genomics-2005/queries.txt" }, "trec-genomics-2005/qrels": { "url": "https://dmice.ohsu.edu/trec-gen/data/2005/genomics.qrels.large.txt", "size_hint": 661626, "expected_md5": "fd6ac71dcd337c0c0cddf0ffc0528cc6", "cache_path": "trec-genomics-2005/qrels" }, "2017/part1": { "url": 
"https://bionlp.nlm.nih.gov/trec2017precisionmedicine/medline_xml.part1.tar.gz", "size_hint": 5257751264, "expected_md5": "04d14a46af586faf9306580291758c29", "cache_path": "2017/part1.tar.gz" }, "2017/part2": { "url": "https://bionlp.nlm.nih.gov/trec2017precisionmedicine/medline_xml.part2.tar.gz", "size_hint": 5257075322, "expected_md5": "19740bcde4e5e3bcfc583b347cd59d17", "cache_path": "2017/part2.tar.gz" }, "2017/part3": { "url": "https://bionlp.nlm.nih.gov/trec2017precisionmedicine/medline_xml.part3.tar.gz", "size_hint": 5249034853, "expected_md5": "7e7d3cfb452c6e4260704f0a6cc7932e", "cache_path": "2017/part3.tar.gz" }, "2017/part4": { "url": "https://bionlp.nlm.nih.gov/trec2017precisionmedicine/medline_xml.part4.tar.gz", "size_hint": 5245054502, "expected_md5": "1b22d0932d319d127be326358435c5e5", "cache_path": "2017/part4.tar.gz" }, "2017/part5": { "url": "https://bionlp.nlm.nih.gov/trec2017precisionmedicine/medline_xml.part5.tar.gz", "size_hint": 1187092702, "expected_md5": "d71b9bb9e11d017f3f77ffe47dbf8aa9", "cache_path": "2017/part5.tar.gz" }, "2017/aacr_asco_extra": { "url": "https://bionlp.nlm.nih.gov/trec2017precisionmedicine/extra_abstracts.tar.gz", "size_hint": 61150087, "expected_md5": "d91bb4ca9b50cbbd5986bb5c43082afb", "cache_path": "2017/extra_abstracts.tar.gz" }, "trec-pm-2017/queries": { "url": "https://trec.nist.gov/data/precmed/topics2017.xml", "irds_mirror": true, "size_hint": 5660, "expected_md5": "16d69bf9119aaaf4b8545c24dde4156d", "cache_path": "trec-pm-2017/queries.xml" }, "trec-pm-2017/qrels": { "url": "https://trec.nist.gov/data/precmed/qrels-final-abstracts.txt", "irds_mirror": true, "size_hint": 362092, "expected_md5": "0a302cb9cd580709d9e3db9881a25d47", "cache_path": "trec-pm-2017/qrels" }, "trec-pm-2018/queries": { "url": "http://www.trec-cds.org/topics2018.xml", "size_hint": 7515, "expected_md5": "6c59de8eaf3bd9925a567c50dfab6936", "cache_path": "trec-pm-2018/queries.xml" }, "trec-pm-2018/qrels": { "url": 
"https://trec.nist.gov/data/precmed/qrels-treceval-abstracts-2018-v2.txt", "irds_mirror": true, "size_hint": 364910, "expected_md5": "a09754dec58ee90458ff8e0e7f2cb934", "cache_path": "trec-pm-2018/qrels" } }, "miracl": { "v1.0/ar/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ar/topics/topics.miracl-v1.0-ar-dev.tsv", "expected_md5": "4ef84df620f1b5521ce8e7ebb438f344", "size_hint": 572724, "cache_path": "v1.0/ar/dev/topics.tsv" }, "v1.0/ar/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ar/topics/topics.miracl-v1.0-ar-test-a.tsv", "expected_md5": "8b2a7f0ca268ff4c15dd7efd5d8f59cd", "size_hint": 510548, "cache_path": "v1.0/ar/test-a/topics.tsv" }, "v1.0/ar/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ar/topics/topics.miracl-v1.0-ar-test-b.tsv", "expected_md5": "3bbc0e8ed604b12b4591d1eb6be574bc", "size_hint": 174692, "cache_path": "v1.0/ar/test-b/topics.tsv" }, "v1.0/ar/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ar/topics/topics.miracl-v1.0-ar-train.tsv", "expected_md5": "38c6afa5a27b8c3565205bbc23756d8f", "size_hint": 58270, "cache_path": "v1.0/ar/train/topics.tsv" }, "v1.0/bn/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-bn/topics/topics.miracl-v1.0-bn-dev.tsv", "expected_md5": "17ff030d33d678547626c66fb9f52b4c", "size_hint": 95063, "cache_path": "v1.0/bn/dev/topics.tsv" }, "v1.0/bn/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-bn/topics/topics.miracl-v1.0-bn-test-a.tsv", "expected_md5": "1c94d4bf5b07fa8eda2b8b5f1b2ae42c", "size_hint": 212525, "cache_path": "v1.0/bn/test-a/topics.tsv" }, "v1.0/bn/test-b/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-bn/topics/topics.miracl-v1.0-bn-test-b.tsv", "expected_md5": "1cb9838ec9d90fdef68ff0fc5d489cd2", "size_hint": 75970, "cache_path": "v1.0/bn/test-b/topics.tsv" }, "v1.0/bn/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-bn/topics/topics.miracl-v1.0-bn-train.tsv", "expected_md5": "0a48f9dff4565980c6c70f60403c2541", "size_hint": 302570, "cache_path": "v1.0/bn/train/topics.tsv" }, "v1.0/de/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-de/topics/topics.miracl-v1.0-de-dev.tsv", "expected_md5": "ad5cd6dc4e1fa51d6383fec385cf2854", "size_hint": 53473, "cache_path": "v1.0/de/dev/topics.tsv" }, "v1.0/de/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-de/topics/topics.miracl-v1.0-de-test-b.tsv", "expected_md5": "3ef6e6e6d702714d40ad8b78ece98d7a", "size_hint": 14749, "cache_path": "v1.0/de/test-b/topics.tsv" }, "v1.0/en/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-en/topics/topics.miracl-v1.0-en-dev.tsv", "expected_md5": "cd1442ed7b711ea5ff246ea2be8ce0c7", "size_hint": 135842, "cache_path": "v1.0/en/dev/topics.tsv" }, "v1.0/en/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-en/topics/topics.miracl-v1.0-en-test-a.tsv", "expected_md5": "3d834410789bbf4568e74942344d0831", "size_hint": 212262, "cache_path": "v1.0/en/test-a/topics.tsv" }, "v1.0/en/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-en/topics/topics.miracl-v1.0-en-test-b.tsv", "expected_md5": "5f8ef71d59a420b2e46afb70e4128464", "size_hint": 77351, "cache_path": "v1.0/en/test-b/topics.tsv" }, "v1.0/en/train/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-en/topics/topics.miracl-v1.0-en-train.tsv", "expected_md5": "a8a1ae3254c07c46e37abd6bbfab63ad", "size_hint": 17620, "cache_path": "v1.0/en/train/topics.tsv" }, "v1.0/es/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-es/topics/topics.miracl-v1.0-es-dev.tsv", "expected_md5": "8f057b82c8bb6ba44776cadacee63e7b", "size_hint": 40042, "cache_path": "v1.0/es/dev/topics.tsv" }, "v1.0/es/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-es/topics/topics.miracl-v1.0-es-test-b.tsv", "expected_md5": "197e982a1bbac021b0f40cf7b5e958a7", "size_hint": 167817, "cache_path": "v1.0/es/test-b/topics.tsv" }, "v1.0/es/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-es/topics/topics.miracl-v1.0-es-train.tsv", "expected_md5": "efe6c440986a96ebd3edeecb91c80492", "size_hint": 589618, "cache_path": "v1.0/es/train/topics.tsv" }, "v1.0/fa/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fa/topics/topics.miracl-v1.0-fa-dev.tsv", "expected_md5": "c64fb62af718c135a1843722ea8fdb8d", "size_hint": 36782, "cache_path": "v1.0/fa/dev/topics.tsv" }, "v1.0/fa/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fa/topics/topics.miracl-v1.0-fa-test-b.tsv", "expected_md5": "3114048d32c21d7e86f36a81ec86d8b0", "size_hint": 34135, "cache_path": "v1.0/fa/test-b/topics.tsv" }, "v1.0/fa/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fa/topics/topics.miracl-v1.0-fa-train.tsv", "expected_md5": "8843271f035a12a43983d7795b0e144c", "size_hint": 109792, "cache_path": "v1.0/fa/train/topics.tsv" }, "v1.0/fi/dev/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fi/topics/topics.miracl-v1.0-fi-dev.tsv", "expected_md5": "e328e92f38c13f6e5a5b6903670e01fc", "size_hint": 128428, "cache_path": "v1.0/fi/dev/topics.tsv" }, "v1.0/fi/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fi/topics/topics.miracl-v1.0-fi-test-a.tsv", "expected_md5": "33480316a834ec45d935a65663f148ea", "size_hint": 157701, "cache_path": "v1.0/fi/test-a/topics.tsv" }, "v1.0/fi/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fi/topics/topics.miracl-v1.0-fi-test-b.tsv", "expected_md5": "098a89152d59a59a98824bb4ed63c2e0", "size_hint": 526487, "cache_path": "v1.0/fi/test-b/topics.tsv" }, "v1.0/fi/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fi/topics/topics.miracl-v1.0-fi-train.tsv", "expected_md5": "b1c1f3bceb7c52b2e44b587fb0690454", "size_hint": 39616, "cache_path": "v1.0/fi/train/topics.tsv" }, "v1.0/fr/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fr/topics/topics.miracl-v1.0-fr-dev.tsv", "expected_md5": "d8b97c45c2256480c0b0c56634deb828", "size_hint": 92357, "cache_path": "v1.0/fr/dev/topics.tsv" }, "v1.0/fr/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fr/topics/topics.miracl-v1.0-fr-test-b.tsv", "expected_md5": "3988760b1e1355a92a87094b5ad0f51b", "size_hint": 131489, "cache_path": "v1.0/fr/test-b/topics.tsv" }, "v1.0/fr/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fr/topics/topics.miracl-v1.0-fr-train.tsv", "expected_md5": "962aebe813c808a9e6b4cf7ac6764758", "size_hint": 157653, "cache_path": "v1.0/fr/train/topics.tsv" }, "v1.0/hi/dev/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-hi/topics/topics.miracl-v1.0-hi-dev.tsv", "expected_md5": "69aafc100baf5617d7d7e63b47c4d86a", "size_hint": 525158, "cache_path": "v1.0/hi/dev/topics.tsv" }, "v1.0/hi/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-hi/topics/topics.miracl-v1.0-hi-test-b.tsv", "expected_md5": "1c1aef94f1206e4b98aa519b3e7c9dd8", "size_hint": 54322, "cache_path": "v1.0/hi/test-b/topics.tsv" }, "v1.0/hi/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-hi/topics/topics.miracl-v1.0-hi-train.tsv", "expected_md5": "fde7fc047da3a9947efb711fc45b79a4", "size_hint": 124918, "cache_path": "v1.0/hi/train/topics.tsv" }, "v1.0/id/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-id/topics/topics.miracl-v1.0-id-dev.tsv", "expected_md5": "1ecff4a071699fb1c1d16a342a7083ff", "size_hint": 180570, "cache_path": "v1.0/id/dev/topics.tsv" }, "v1.0/id/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-id/topics/topics.miracl-v1.0-id-test-a.tsv", "expected_md5": "7dbd725b4e83a363f630bb8d0c3e775b", "size_hint": 226188, "cache_path": "v1.0/id/test-a/topics.tsv" }, "v1.0/id/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-id/topics/topics.miracl-v1.0-id-test-b.tsv", "expected_md5": "b51cb52b5312b99cccb16206a56ac29f", "size_hint": 381821, "cache_path": "v1.0/id/test-b/topics.tsv" }, "v1.0/id/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-id/topics/topics.miracl-v1.0-id-train.tsv", "expected_md5": "2051ab2d790d9e10b450d193dc1878e8", "size_hint": 58112, "cache_path": "v1.0/id/train/topics.tsv" }, "v1.0/ja/dev/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ja/topics/topics.miracl-v1.0-ja-dev.tsv", "expected_md5": "93207778d1032e1d0908e4a3e0326e5f", "size_hint": 47863, "cache_path": "v1.0/ja/dev/topics.tsv" }, "v1.0/ja/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ja/topics/topics.miracl-v1.0-ja-test-a.tsv", "expected_md5": "de78e8e351f23488b01ddc52e756c35f", "size_hint": 33471, "cache_path": "v1.0/ja/test-a/topics.tsv" }, "v1.0/ja/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ja/topics/topics.miracl-v1.0-ja-test-b.tsv", "expected_md5": "61a655f2d14ce4d0b661a11f09a5ec57", "size_hint": 129620, "cache_path": "v1.0/ja/test-b/topics.tsv" }, "v1.0/ja/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ja/topics/topics.miracl-v1.0-ja-train.tsv", "expected_md5": "e3207521af4663e5d5947efe73cd3c2a", "size_hint": 84975, "cache_path": "v1.0/ja/train/topics.tsv" }, "v1.0/ko/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ko/topics/topics.miracl-v1.0-ko-dev.tsv", "expected_md5": "61061f60143677b7dd9eac8cd963ac15", "size_hint": 283176, "cache_path": "v1.0/ko/dev/topics.tsv" }, "v1.0/ko/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ko/topics/topics.miracl-v1.0-ko-test-a.tsv", "expected_md5": "b23aa4792976fedf3bd45cbb55d28f6d", "size_hint": 19375, "cache_path": "v1.0/ko/test-a/topics.tsv" }, "v1.0/ko/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ko/topics/topics.miracl-v1.0-ko-test-b.tsv", "expected_md5": "35a60095be8f6d05e106eae2a13253fa", "size_hint": 45335, "cache_path": "v1.0/ko/test-b/topics.tsv" }, "v1.0/ko/train/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ko/topics/topics.miracl-v1.0-ko-train.tsv", "expected_md5": "32a7b5a373761ec50b689a8d2fb2439a", "size_hint": 64472, "cache_path": "v1.0/ko/train/topics.tsv" }, "v1.0/ru/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ru/topics/topics.miracl-v1.0-ru-dev.tsv", "expected_md5": "8a86550b1a041d7e512884c9683a7867", "size_hint": 79771, "cache_path": "v1.0/ru/dev/topics.tsv" }, "v1.0/ru/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ru/topics/topics.miracl-v1.0-ru-test-a.tsv", "expected_md5": "cc3e7ca3cb78b3e298b549adaf689317", "size_hint": 266412, "cache_path": "v1.0/ru/test-a/topics.tsv" }, "v1.0/ru/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ru/topics/topics.miracl-v1.0-ru-test-b.tsv", "expected_md5": "a3c7b2659aba4777eed8f53df588f28b", "size_hint": 52022, "cache_path": "v1.0/ru/test-b/topics.tsv" }, "v1.0/ru/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ru/topics/topics.miracl-v1.0-ru-train.tsv", "expected_md5": "ad6a5a0bf13295857602c110ba6b04c9", "size_hint": 120565, "cache_path": "v1.0/ru/train/topics.tsv" }, "v1.0/sw/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-sw/topics/topics.miracl-v1.0-sw-dev.tsv", "expected_md5": "5ffbaf70e2a5868780ee6fc09fe08944", "size_hint": 169941, "cache_path": "v1.0/sw/dev/topics.tsv" }, "v1.0/sw/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-sw/topics/topics.miracl-v1.0-sw-test-a.tsv", "expected_md5": "7e21f860b85d3660d415f1dd5f29b2bf", "size_hint": 176946, "cache_path": "v1.0/sw/test-a/topics.tsv" }, "v1.0/sw/test-b/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-sw/topics/topics.miracl-v1.0-sw-test-b.tsv", "expected_md5": "002e79f781b4eb89ced2929a3e4693f9", "size_hint": 758155, "cache_path": "v1.0/sw/test-b/topics.tsv" }, "v1.0/sw/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-sw/topics/topics.miracl-v1.0-sw-train.tsv", "expected_md5": "6a0497e1568b7f8518ee4f5205167d89", "size_hint": 42043, "cache_path": "v1.0/sw/train/topics.tsv" }, "v1.0/te/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-te/topics/topics.miracl-v1.0-te-dev.tsv", "expected_md5": "6382282438291981c2b65ae3c8f3dd71", "size_hint": 33832, "cache_path": "v1.0/te/dev/topics.tsv" }, "v1.0/te/test-a/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-te/topics/topics.miracl-v1.0-te-test-a.tsv", "expected_md5": "413a15ae8e5dbe9bd55a1d3c722f83e6", "size_hint": 24834, "cache_path": "v1.0/te/test-a/topics.tsv" }, "v1.0/te/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-te/topics/topics.miracl-v1.0-te-test-b.tsv", "expected_md5": "bdc6dc6bd94405d48dc7ff96b32eec0a", "size_hint": 174924, "cache_path": "v1.0/te/test-b/topics.tsv" }, "v1.0/te/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-te/topics/topics.miracl-v1.0-te-train.tsv", "expected_md5": "43de28312b4b7fe39eb5c0a7e72a3883", "size_hint": 160882, "cache_path": "v1.0/te/train/topics.tsv" }, "v1.0/th/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-th/topics/topics.miracl-v1.0-th-dev.tsv", "expected_md5": "142e213631cd7ee91cb5b9e384a14a6d", "size_hint": 663060, "cache_path": "v1.0/th/dev/topics.tsv" }, "v1.0/th/test-a/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-th/topics/topics.miracl-v1.0-th-test-a.tsv", "expected_md5": "f4786ff257fcffca790112a17696c261", "size_hint": 50019, "cache_path": "v1.0/th/test-a/topics.tsv" }, "v1.0/th/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-th/topics/topics.miracl-v1.0-th-test-b.tsv", "expected_md5": "61a692a875d8b39824a591d864cc09ed", "size_hint": 38489, "cache_path": "v1.0/th/test-b/topics.tsv" }, "v1.0/th/train/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-th/topics/topics.miracl-v1.0-th-train.tsv", "expected_md5": "2e188722d9f0014be8edcaa2f6ef4581", "size_hint": 66260, "cache_path": "v1.0/th/train/topics.tsv" }, "v1.0/yo/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-yo/topics/topics.miracl-v1.0-yo-dev.tsv", "expected_md5": "7820b5fddac56306dfef4863271d3f5c", "size_hint": 200188, "cache_path": "v1.0/yo/dev/topics.tsv" }, "v1.0/yo/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-yo/topics/topics.miracl-v1.0-yo-test-b.tsv", "expected_md5": "90123239f6b66592385b0e1386a03186", "size_hint": 55675, "cache_path": "v1.0/yo/test-b/topics.tsv" }, "v1.0/zh/dev/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-zh/topics/topics.miracl-v1.0-zh-dev.tsv", "expected_md5": "43d975e76eec94edac3f325139fe04d1", "size_hint": 232882, "cache_path": "v1.0/zh/dev/topics.tsv" }, "v1.0/zh/test-b/topics": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-zh/topics/topics.miracl-v1.0-zh-test-b.tsv", "expected_md5": "4cb3cda9f62d284476bd212ad6b4038f", "size_hint": 12597, "cache_path": "v1.0/zh/test-b/topics.tsv" }, "v1.0/zh/train/topics": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-zh/topics/topics.miracl-v1.0-zh-train.tsv", "expected_md5": "421e6122338cb80d43fcaa7ecacc1611", "size_hint": 16716, "cache_path": "v1.0/zh/train/topics.tsv" }, "v1.0/ar/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ar/qrels/qrels.miracl-v1.0-ar-dev.tsv", "expected_md5": "ac6496c6f8efabd33936b144557548b0", "size_hint": 74893, "cache_path": "v1.0/ar/dev/qrels.tsv" }, "v1.0/ar/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ar/qrels/qrels.miracl-v1.0-ar-train.tsv", "expected_md5": "f85eed133bec7e50336bbfe0cbbb01c0", "size_hint": 53825, "cache_path": "v1.0/ar/train/qrels.tsv" }, "v1.0/bn/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-bn/qrels/qrels.miracl-v1.0-bn-dev.tsv", "expected_md5": "42158a39e400a5a6ebef97b9e91e54b1", "size_hint": 255921, "cache_path": "v1.0/bn/dev/qrels.tsv" }, "v1.0/bn/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-bn/qrels/qrels.miracl-v1.0-bn-train.tsv", "expected_md5": "d59c2bedbae4a4c2c8290539c66a93ea", "size_hint": 663142, "cache_path": "v1.0/bn/train/qrels.tsv" }, "v1.0/de/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-de/qrels/qrels.miracl-v1.0-de-dev.tsv", "expected_md5": "39d96bd97b691783fa07ff523955c8ad", "size_hint": 108525, "cache_path": "v1.0/de/dev/qrels.tsv" }, "v1.0/en/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-en/qrels/qrels.miracl-v1.0-en-dev.tsv", "expected_md5": "ce9a8e2345de41210895f65866c4c2e7", "size_hint": 83173, "cache_path": "v1.0/en/dev/qrels.tsv" }, "v1.0/en/train/qrels": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-en/qrels/qrels.miracl-v1.0-en-train.tsv", "expected_md5": "556f857f7f67c942f683c6a7b4e29b48", "size_hint": 53644, "cache_path": "v1.0/en/train/qrels.tsv" }, "v1.0/es/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-es/qrels/qrels.miracl-v1.0-es-dev.tsv", "expected_md5": "3a25e2467698ff529b7f97e76b8eab6b", "size_hint": 406366, "cache_path": "v1.0/es/dev/qrels.tsv" }, "v1.0/es/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-es/qrels/qrels.miracl-v1.0-es-train.tsv", "expected_md5": "5913f00b8ae436c198d563e8fbaefdb1", "size_hint": 89581, "cache_path": "v1.0/es/train/qrels.tsv" }, "v1.0/fa/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fa/qrels/qrels.miracl-v1.0-fa-dev.tsv", "expected_md5": "a1384b550860cea3eadb9bd8407f36ad", "size_hint": 166752, "cache_path": "v1.0/fa/dev/qrels.tsv" }, "v1.0/fa/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fa/qrels/qrels.miracl-v1.0-fa-train.tsv", "expected_md5": "b1cd41c3c72473f372f2d0c0c2f4395c", "size_hint": 21465, "cache_path": "v1.0/fa/train/qrels.tsv" }, "v1.0/fi/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fi/qrels/qrels.miracl-v1.0-fi-dev.tsv", "expected_md5": "c0bdc01bdbd4753ee6b4f2e9dee08ceb", "size_hint": 30603, "cache_path": "v1.0/fi/dev/qrels.tsv" }, "v1.0/fi/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fi/qrels/qrels.miracl-v1.0-fi-train.tsv", "expected_md5": "348b8e5b45cbd2ef4074ef9cb6ff9076", "size_hint": 22874, "cache_path": "v1.0/fi/train/qrels.tsv" }, "v1.0/fr/dev/qrels": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fr/qrels/qrels.miracl-v1.0-fr-dev.tsv", "expected_md5": "aa458c4c603c919856f2d38adb474c25", "size_hint": 84980, "cache_path": "v1.0/fr/dev/qrels.tsv" }, "v1.0/fr/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-fr/qrels/qrels.miracl-v1.0-fr-train.tsv", "expected_md5": "9e0796915fe5bb9ecb144439ebf2e2c0", "size_hint": 28814, "cache_path": "v1.0/fr/train/qrels.tsv" }, "v1.0/hi/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-hi/qrels/qrels.miracl-v1.0-hi-dev.tsv", "expected_md5": "0b97e26e09bdf24a21325b74016d3047", "size_hint": 334413, "cache_path": "v1.0/hi/dev/qrels.tsv" }, "v1.0/hi/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-hi/qrels/qrels.miracl-v1.0-hi-train.tsv", "expected_md5": "a4f6627539655cdb3f61ca575552354d", "size_hint": 90134, "cache_path": "v1.0/hi/train/qrels.tsv" }, "v1.0/id/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-id/qrels/qrels.miracl-v1.0-id-dev.tsv", "expected_md5": "db025a23bda716dce6e37b42229b436f", "size_hint": 63949, "cache_path": "v1.0/id/dev/qrels.tsv" }, "v1.0/id/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-id/qrels/qrels.miracl-v1.0-id-train.tsv", "expected_md5": "7aeb868e30db507ec05c21a3365c7290", "size_hint": 82445, "cache_path": "v1.0/id/train/qrels.tsv" }, "v1.0/ja/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ja/qrels/qrels.miracl-v1.0-ja-dev.tsv", "expected_md5": "1ea89d53b04d60aecccf01d90d33b00d", "size_hint": 375820, "cache_path": "v1.0/ja/dev/qrels.tsv" }, "v1.0/ja/train/qrels": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ja/qrels/qrels.miracl-v1.0-ja-train.tsv", "expected_md5": "05b2cca3cb634ada628a131bb036943d", "size_hint": 139845, "cache_path": "v1.0/ja/train/qrels.tsv" }, "v1.0/ko/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ko/qrels/qrels.miracl-v1.0-ko-dev.tsv", "expected_md5": "c0935fc8df5e10d578a37b4ad4d1b8ac", "size_hint": 395454, "cache_path": "v1.0/ko/dev/qrels.tsv" }, "v1.0/ko/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ko/qrels/qrels.miracl-v1.0-ko-train.tsv", "expected_md5": "2ef507d04d4c730ccb18908b8cc8e1be", "size_hint": 94299, "cache_path": "v1.0/ko/train/qrels.tsv" }, "v1.0/ru/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ru/qrels/qrels.miracl-v1.0-ru-dev.tsv", "expected_md5": "748511ee0442e09e836a49deb1e2e8b3", "size_hint": 126527, "cache_path": "v1.0/ru/dev/qrels.tsv" }, "v1.0/ru/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-ru/qrels/qrels.miracl-v1.0-ru-train.tsv", "expected_md5": "b6b30a6cacab487bf41d81d93f01faa0", "size_hint": 71664, "cache_path": "v1.0/ru/train/qrels.tsv" }, "v1.0/sw/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-sw/qrels/qrels.miracl-v1.0-sw-dev.tsv", "expected_md5": "2ca78e665778ed6be4f9b01e0b6e80c0", "size_hint": 379671, "cache_path": "v1.0/sw/dev/qrels.tsv" }, "v1.0/sw/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-sw/qrels/qrels.miracl-v1.0-sw-train.tsv", "expected_md5": "e582f9074db31dea26fe43dce87d0ff5", "size_hint": 24820, "cache_path": "v1.0/sw/train/qrels.tsv" }, "v1.0/te/dev/qrels": { "url": 
"https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-te/qrels/qrels.miracl-v1.0-te-dev.tsv", "expected_md5": "2f7c45f44bc8842633a8eb14dbb99f19", "size_hint": 5983, "cache_path": "v1.0/te/dev/qrels.tsv" }, "v1.0/te/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-te/qrels/qrels.miracl-v1.0-te-train.tsv", "expected_md5": "b55f03829da637e3e186780292e0b305", "size_hint": 14289, "cache_path": "v1.0/te/train/qrels.tsv" }, "v1.0/th/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-th/qrels/qrels.miracl-v1.0-th-dev.tsv", "expected_md5": "89c8aa2ebb027d3f3c1fd984aeb58ee9", "size_hint": 94140, "cache_path": "v1.0/th/dev/qrels.tsv" }, "v1.0/th/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-th/qrels/qrels.miracl-v1.0-th-train.tsv", "expected_md5": "0745678058c20c179c6b2dc0206d88ca", "size_hint": 314756, "cache_path": "v1.0/th/train/qrels.tsv" }, "v1.0/yo/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-yo/qrels/qrels.miracl-v1.0-yo-dev.tsv", "expected_md5": "523aeb9fb3c631dee98e267760a46735", "size_hint": 16903, "cache_path": "v1.0/yo/dev/qrels.tsv" }, "v1.0/zh/dev/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-zh/qrels/qrels.miracl-v1.0-zh-dev.tsv", "expected_md5": "5906a4dc2373cf524167487660d7b6f1", "size_hint": 39712, "cache_path": "v1.0/zh/dev/qrels.tsv" }, "v1.0/zh/train/qrels": { "url": "https://huggingface.co/datasets/macavaney/miracl-noauth/resolve/main/miracl-v1.0-zh/qrels/qrels.miracl-v1.0-zh-train.tsv", "expected_md5": "2c3b64e46276e1df97a15c65ab5b90f1", "size_hint": 57079, "cache_path": "v1.0/zh/train/qrels.tsv" }, "v1.0/ar/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ar/docs-0.jsonl.gz", "size_hint": 
94104175, "expected_md5": "d484e02a3cd973b4c36bd7867d7f40fa", "cache_path": "v1.0/ar/docs/docs-0.jsonl.gz" }, "v1.0/ar/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ar/docs-1.jsonl.gz", "size_hint": 83793880, "expected_md5": "2af6f236d7c37c2bde97be4a13a3abaa", "cache_path": "v1.0/ar/docs/docs-1.jsonl.gz" }, "v1.0/ar/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ar/docs-2.jsonl.gz", "size_hint": 70295610, "expected_md5": "f5ee426bd15eb07e335c0b2f51798672", "cache_path": "v1.0/ar/docs/docs-2.jsonl.gz" }, "v1.0/ar/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ar/docs-3.jsonl.gz", "size_hint": 64551259, "expected_md5": "3808e89c2468bc10f49d4d4ae5f8a66c", "cache_path": "v1.0/ar/docs/docs-3.jsonl.gz" }, "v1.0/ar/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ar/docs-4.jsonl.gz", "size_hint": 7227421, "expected_md5": "73a29b3038c984a86da189aa16c3fb8f", "cache_path": "v1.0/ar/docs/docs-4.jsonl.gz" }, "v1.0/bn/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-bn/docs-0.jsonl.gz", "size_hint": 59713182, "expected_md5": "1e30b159bc3634e068bcf8f1fbff68ef", "cache_path": "v1.0/bn/docs/docs-0.jsonl.gz" }, "v1.0/de/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-0.jsonl.gz", "size_hint": 97640576, "expected_md5": "ca7ab6bb0328f2e17a634e2a3445f1ea", "cache_path": "v1.0/de/docs/docs-0.jsonl.gz" }, "v1.0/de/corpus/10": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-10.jsonl.gz", "size_hint": 81937816, "expected_md5": "527fdba98da58526ca3f30b7a1633939", "cache_path": "v1.0/de/docs/docs-10.jsonl.gz" }, "v1.0/de/corpus/11": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-11.jsonl.gz", "size_hint": 82225480, "expected_md5": "50dedc25b39f4a9ebc1ad8e94a2f85e7", "cache_path": "v1.0/de/docs/docs-11.jsonl.gz" }, "v1.0/de/corpus/12": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-12.jsonl.gz", "size_hint": 82302655, "expected_md5": "ed1cbe22bd2bcb419a5999d8140c14cf", "cache_path": "v1.0/de/docs/docs-12.jsonl.gz" }, "v1.0/de/corpus/13": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-13.jsonl.gz", "size_hint": 81286223, "expected_md5": "2929a7c9b8a280958a388a990a14e581", "cache_path": "v1.0/de/docs/docs-13.jsonl.gz" }, "v1.0/de/corpus/14": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-14.jsonl.gz", "size_hint": 81285393, "expected_md5": "e8ab4bbd1b4a34ed7c90a0326a3d04f6", "cache_path": "v1.0/de/docs/docs-14.jsonl.gz" }, "v1.0/de/corpus/15": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-15.jsonl.gz", "size_hint": 81734841, "expected_md5": "19e90cfff174c6639b657e1f90a3609c", "cache_path": "v1.0/de/docs/docs-15.jsonl.gz" }, "v1.0/de/corpus/16": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-16.jsonl.gz", "size_hint": 81453194, "expected_md5": "75017466b064eaaed9ce11cd629a2766", "cache_path": "v1.0/de/docs/docs-16.jsonl.gz" }, "v1.0/de/corpus/17": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-17.jsonl.gz", "size_hint": 81111867, "expected_md5": "10f71f7df716a10d678bbd5bbd640b77", "cache_path": "v1.0/de/docs/docs-17.jsonl.gz" }, "v1.0/de/corpus/18": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-18.jsonl.gz", "size_hint": 79961121, "expected_md5": 
"2e4ccc640c95be49dd4c9118f5e7fe25", "cache_path": "v1.0/de/docs/docs-18.jsonl.gz" }, "v1.0/de/corpus/19": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-19.jsonl.gz", "size_hint": 80334928, "expected_md5": "8ea6c268ff9d6aca89b2373fbe3490b5", "cache_path": "v1.0/de/docs/docs-19.jsonl.gz" }, "v1.0/de/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-1.jsonl.gz", "size_hint": 94992507, "expected_md5": "68e5d3add0fb289bbc354d93471850be", "cache_path": "v1.0/de/docs/docs-1.jsonl.gz" }, "v1.0/de/corpus/20": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-20.jsonl.gz", "size_hint": 80594842, "expected_md5": "2cb88bfd5e644b26aec39a59e7efd79e", "cache_path": "v1.0/de/docs/docs-20.jsonl.gz" }, "v1.0/de/corpus/21": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-21.jsonl.gz", "size_hint": 81303770, "expected_md5": "7d63fb93948260af3409e41f673f8c77", "cache_path": "v1.0/de/docs/docs-21.jsonl.gz" }, "v1.0/de/corpus/22": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-22.jsonl.gz", "size_hint": 82621593, "expected_md5": "c0db07f9c2e01282909030105797dddf", "cache_path": "v1.0/de/docs/docs-22.jsonl.gz" }, "v1.0/de/corpus/23": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-23.jsonl.gz", "size_hint": 81564108, "expected_md5": "2d710c2e4a9b1eff7a0012580ef25540", "cache_path": "v1.0/de/docs/docs-23.jsonl.gz" }, "v1.0/de/corpus/24": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-24.jsonl.gz", "size_hint": 80246927, "expected_md5": "397eba2f7fd3b310111a45f0e37aea42", "cache_path": "v1.0/de/docs/docs-24.jsonl.gz" }, "v1.0/de/corpus/25": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-25.jsonl.gz", "size_hint": 79283563, "expected_md5": "e62f5d0c7640c6b51974466b7a9bde42", "cache_path": "v1.0/de/docs/docs-25.jsonl.gz" }, "v1.0/de/corpus/26": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-26.jsonl.gz", "size_hint": 78943370, "expected_md5": "bdebcadf33dc1bb7c2e88237ace54c8a", "cache_path": "v1.0/de/docs/docs-26.jsonl.gz" }, "v1.0/de/corpus/27": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-27.jsonl.gz", "size_hint": 80159901, "expected_md5": "e56b7656cef7356ba696d77261464cf6", "cache_path": "v1.0/de/docs/docs-27.jsonl.gz" }, "v1.0/de/corpus/28": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-28.jsonl.gz", "size_hint": 78672926, "expected_md5": "e1ffce8721a7c336e2925d43100b6e78", "cache_path": "v1.0/de/docs/docs-28.jsonl.gz" }, "v1.0/de/corpus/29": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-29.jsonl.gz", "size_hint": 80702668, "expected_md5": "570dc8906b883c8566e116ac0d3c8ddb", "cache_path": "v1.0/de/docs/docs-29.jsonl.gz" }, "v1.0/de/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-2.jsonl.gz", "size_hint": 91425088, "expected_md5": "fc427890706a5d31ff2e8d559fba5552", "cache_path": "v1.0/de/docs/docs-2.jsonl.gz" }, "v1.0/de/corpus/30": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-30.jsonl.gz", "size_hint": 80576937, "expected_md5": "667aa167000a65bf6f68d82058dd803d", "cache_path": "v1.0/de/docs/docs-30.jsonl.gz" }, "v1.0/de/corpus/31": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-31.jsonl.gz", "size_hint": 57737892, "expected_md5": 
"d733bea534c2c2bd9e6c125fc14d97d1", "cache_path": "v1.0/de/docs/docs-31.jsonl.gz" }, "v1.0/de/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-3.jsonl.gz", "size_hint": 85007239, "expected_md5": "990dbdef008fe2901cea858524df8bab", "cache_path": "v1.0/de/docs/docs-3.jsonl.gz" }, "v1.0/de/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-4.jsonl.gz", "size_hint": 85433314, "expected_md5": "6c0ab305737fd9b3e0ab31d9b66aebbe", "cache_path": "v1.0/de/docs/docs-4.jsonl.gz" }, "v1.0/de/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-5.jsonl.gz", "size_hint": 85494784, "expected_md5": "ade6161113825b462355ebec32ab57bc", "cache_path": "v1.0/de/docs/docs-5.jsonl.gz" }, "v1.0/de/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-6.jsonl.gz", "size_hint": 85048361, "expected_md5": "4347f0cad367548e6776465c316d4571", "cache_path": "v1.0/de/docs/docs-6.jsonl.gz" }, "v1.0/de/corpus/7": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-7.jsonl.gz", "size_hint": 85544324, "expected_md5": "aad7fb0916949387bb1a158199d0c11d", "cache_path": "v1.0/de/docs/docs-7.jsonl.gz" }, "v1.0/de/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-8.jsonl.gz", "size_hint": 82876799, "expected_md5": "f820f563a1aa2aa0e1841342336553a9", "cache_path": "v1.0/de/docs/docs-8.jsonl.gz" }, "v1.0/de/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-de/docs-9.jsonl.gz", "size_hint": 79775850, "expected_md5": "243c95ee4c7b22a058e08d7eb1027b47", "cache_path": "v1.0/de/docs/docs-9.jsonl.gz" }, "v1.0/en/corpus/0": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-0.jsonl.gz", "size_hint": 98856369, "expected_md5": "ed94fdd2dad913fcffe0308b021082d6", "cache_path": "v1.0/en/docs/docs-0.jsonl.gz" }, "v1.0/en/corpus/10": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-10.jsonl.gz", "size_hint": 85787457, "expected_md5": "20e5926a36d44c6ece5c1942bda19817", "cache_path": "v1.0/en/docs/docs-10.jsonl.gz" }, "v1.0/en/corpus/11": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-11.jsonl.gz", "size_hint": 85858461, "expected_md5": "7552668e7db1591d0fe71df3b044cffd", "cache_path": "v1.0/en/docs/docs-11.jsonl.gz" }, "v1.0/en/corpus/12": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-12.jsonl.gz", "size_hint": 85089158, "expected_md5": "383760be692a540b30e5b1b2239bab16", "cache_path": "v1.0/en/docs/docs-12.jsonl.gz" }, "v1.0/en/corpus/13": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-13.jsonl.gz", "size_hint": 82769410, "expected_md5": "7da0ffd49848f9c7e2912b1ad1152d3b", "cache_path": "v1.0/en/docs/docs-13.jsonl.gz" }, "v1.0/en/corpus/14": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-14.jsonl.gz", "size_hint": 83032778, "expected_md5": "2c4d6f9220716e624daa016756d707e8", "cache_path": "v1.0/en/docs/docs-14.jsonl.gz" }, "v1.0/en/corpus/15": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-15.jsonl.gz", "size_hint": 82548385, "expected_md5": "a90950760f60e5f0e92c4e7f3774f7e7", "cache_path": "v1.0/en/docs/docs-15.jsonl.gz" }, "v1.0/en/corpus/16": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-16.jsonl.gz", "size_hint": 81947138, "expected_md5": 
"20e2030038fbb450be2ac09fedaf77b3", "cache_path": "v1.0/en/docs/docs-16.jsonl.gz" }, "v1.0/en/corpus/17": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-17.jsonl.gz", "size_hint": 81038479, "expected_md5": "12136a0df0a80092eafd694820cc6f81", "cache_path": "v1.0/en/docs/docs-17.jsonl.gz" }, "v1.0/en/corpus/18": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-18.jsonl.gz", "size_hint": 80496066, "expected_md5": "a907bbf4ef1b264d0304bb99cc392b15", "cache_path": "v1.0/en/docs/docs-18.jsonl.gz" }, "v1.0/en/corpus/19": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-19.jsonl.gz", "size_hint": 80243337, "expected_md5": "f52ad0a000336d7a73a4e0d0e2bb996f", "cache_path": "v1.0/en/docs/docs-19.jsonl.gz" }, "v1.0/en/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-1.jsonl.gz", "size_hint": 97338482, "expected_md5": "9adcfccb058f46050648dea35755a39f", "cache_path": "v1.0/en/docs/docs-1.jsonl.gz" }, "v1.0/en/corpus/20": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-20.jsonl.gz", "size_hint": 80003557, "expected_md5": "8b9d4eb4374aeba9567e09470cf5b061", "cache_path": "v1.0/en/docs/docs-20.jsonl.gz" }, "v1.0/en/corpus/21": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-21.jsonl.gz", "size_hint": 79751834, "expected_md5": "32ab1841c2deb612eb3ec83a2f371452", "cache_path": "v1.0/en/docs/docs-21.jsonl.gz" }, "v1.0/en/corpus/22": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-22.jsonl.gz", "size_hint": 76766869, "expected_md5": "5334d398deaa053e6f380612973580ef", "cache_path": "v1.0/en/docs/docs-22.jsonl.gz" }, "v1.0/en/corpus/23": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-23.jsonl.gz", "size_hint": 75810312, "expected_md5": "505f419951b8665cd9fc69604f6b5e62", "cache_path": "v1.0/en/docs/docs-23.jsonl.gz" }, "v1.0/en/corpus/24": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-24.jsonl.gz", "size_hint": 76272005, "expected_md5": "eb579b4ad40d6735d370bf30b7877a25", "cache_path": "v1.0/en/docs/docs-24.jsonl.gz" }, "v1.0/en/corpus/25": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-25.jsonl.gz", "size_hint": 77082664, "expected_md5": "7de75404b5cf6c8b965c8257e62ba70b", "cache_path": "v1.0/en/docs/docs-25.jsonl.gz" }, "v1.0/en/corpus/26": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-26.jsonl.gz", "size_hint": 78228256, "expected_md5": "d42ed88eaa947b610846188185a25f54", "cache_path": "v1.0/en/docs/docs-26.jsonl.gz" }, "v1.0/en/corpus/27": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-27.jsonl.gz", "size_hint": 76445022, "expected_md5": "e61b3098053a98b9bdb753a34476e239", "cache_path": "v1.0/en/docs/docs-27.jsonl.gz" }, "v1.0/en/corpus/28": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-28.jsonl.gz", "size_hint": 76434972, "expected_md5": "de185edb91a43a6dbb434658fbdbb96f", "cache_path": "v1.0/en/docs/docs-28.jsonl.gz" }, "v1.0/en/corpus/29": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-29.jsonl.gz", "size_hint": 74697707, "expected_md5": "8f25bd36199c36e2a1812b2e22b56430", "cache_path": "v1.0/en/docs/docs-29.jsonl.gz" }, "v1.0/en/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-2.jsonl.gz", "size_hint": 64571457, "expected_md5": 
"f961ce2675bc472b4d4126399acb7564", "cache_path": "v1.0/en/docs/docs-2.jsonl.gz" }, "v1.0/en/corpus/30": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-30.jsonl.gz", "size_hint": 71839143, "expected_md5": "9c2e84252fb7ca24b4fd593682bd2aef", "cache_path": "v1.0/en/docs/docs-30.jsonl.gz" }, "v1.0/en/corpus/31": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-31.jsonl.gz", "size_hint": 73437320, "expected_md5": "4fc0fb743bc402bb9d0469856fd3694c", "cache_path": "v1.0/en/docs/docs-31.jsonl.gz" }, "v1.0/en/corpus/32": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-32.jsonl.gz", "size_hint": 73322325, "expected_md5": "64c3d8cae61441df7201d44b08972e41", "cache_path": "v1.0/en/docs/docs-32.jsonl.gz" }, "v1.0/en/corpus/33": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-33.jsonl.gz", "size_hint": 70699147, "expected_md5": "42ebd5e87d5a417689c37cf336749e7f", "cache_path": "v1.0/en/docs/docs-33.jsonl.gz" }, "v1.0/en/corpus/34": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-34.jsonl.gz", "size_hint": 74843362, "expected_md5": "91635ee421cfcd9b8ea5834695e556b7", "cache_path": "v1.0/en/docs/docs-34.jsonl.gz" }, "v1.0/en/corpus/35": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-35.jsonl.gz", "size_hint": 71914591, "expected_md5": "fa7ada6f5f145f5a04bd4113cfe09787", "cache_path": "v1.0/en/docs/docs-35.jsonl.gz" }, "v1.0/en/corpus/36": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-36.jsonl.gz", "size_hint": 73255731, "expected_md5": "90c6abea75caa84381d238fd85e4520e", "cache_path": "v1.0/en/docs/docs-36.jsonl.gz" }, "v1.0/en/corpus/37": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-37.jsonl.gz", "size_hint": 74956992, "expected_md5": "6fdf53729c158371ea30cf56f37d7db5", "cache_path": "v1.0/en/docs/docs-37.jsonl.gz" }, "v1.0/en/corpus/38": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-38.jsonl.gz", "size_hint": 70506538, "expected_md5": "8dcb112f443c49e1d35bdac0db63a111", "cache_path": "v1.0/en/docs/docs-38.jsonl.gz" }, "v1.0/en/corpus/39": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-39.jsonl.gz", "size_hint": 73824158, "expected_md5": "071adb3f777cb077ff170c6856bbdcd7", "cache_path": "v1.0/en/docs/docs-39.jsonl.gz" }, "v1.0/en/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-3.jsonl.gz", "size_hint": 79988237, "expected_md5": "853a5180f4c1012ebee1f2c5a3e26506", "cache_path": "v1.0/en/docs/docs-3.jsonl.gz" }, "v1.0/en/corpus/40": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-40.jsonl.gz", "size_hint": 72684614, "expected_md5": "879d54b7d7826a2d7133955a689890f5", "cache_path": "v1.0/en/docs/docs-40.jsonl.gz" }, "v1.0/en/corpus/41": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-41.jsonl.gz", "size_hint": 74674474, "expected_md5": "f379d5441be8e19a690f116a7a9ad501", "cache_path": "v1.0/en/docs/docs-41.jsonl.gz" }, "v1.0/en/corpus/42": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-42.jsonl.gz", "size_hint": 74166535, "expected_md5": "1fb0a973751261adc3c2165fc99f0ce1", "cache_path": "v1.0/en/docs/docs-42.jsonl.gz" }, "v1.0/en/corpus/43": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-43.jsonl.gz", "size_hint": 72571698, "expected_md5": 
"e1621cf06f3e5d6b3eff4d8607641258", "cache_path": "v1.0/en/docs/docs-43.jsonl.gz" }, "v1.0/en/corpus/44": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-44.jsonl.gz", "size_hint": 72937865, "expected_md5": "47b0191ce682e60708a979b2232607f9", "cache_path": "v1.0/en/docs/docs-44.jsonl.gz" }, "v1.0/en/corpus/45": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-45.jsonl.gz", "size_hint": 74948873, "expected_md5": "3fa31818300aba29db933557f31b3494", "cache_path": "v1.0/en/docs/docs-45.jsonl.gz" }, "v1.0/en/corpus/46": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-46.jsonl.gz", "size_hint": 75950579, "expected_md5": "462641fe08c052ab1fbf7add67a48859", "cache_path": "v1.0/en/docs/docs-46.jsonl.gz" }, "v1.0/en/corpus/47": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-47.jsonl.gz", "size_hint": 72978719, "expected_md5": "5cb8207d31dab7cd82ba64317e65c8ac", "cache_path": "v1.0/en/docs/docs-47.jsonl.gz" }, "v1.0/en/corpus/48": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-48.jsonl.gz", "size_hint": 73930541, "expected_md5": "6a8433dcf294c669078cdf1e5a88aa38", "cache_path": "v1.0/en/docs/docs-48.jsonl.gz" }, "v1.0/en/corpus/49": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-49.jsonl.gz", "size_hint": 74004419, "expected_md5": "75975d9495b9cc4aa231f873281c5f02", "cache_path": "v1.0/en/docs/docs-49.jsonl.gz" }, "v1.0/en/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-4.jsonl.gz", "size_hint": 91846360, "expected_md5": "80271ce096ccb01565857d4dd8f2b4a5", "cache_path": "v1.0/en/docs/docs-4.jsonl.gz" }, "v1.0/en/corpus/50": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-50.jsonl.gz", "size_hint": 70736448, "expected_md5": "60dbe4823f30923e1ebe3ef5b8f0879d", "cache_path": "v1.0/en/docs/docs-50.jsonl.gz" }, "v1.0/en/corpus/51": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-51.jsonl.gz", "size_hint": 73415840, "expected_md5": "ec9c101022dfc897c058e96c220758f6", "cache_path": "v1.0/en/docs/docs-51.jsonl.gz" }, "v1.0/en/corpus/52": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-52.jsonl.gz", "size_hint": 73776910, "expected_md5": "925b65b5ef08abb41ce09d8645bd6dd2", "cache_path": "v1.0/en/docs/docs-52.jsonl.gz" }, "v1.0/en/corpus/53": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-53.jsonl.gz", "size_hint": 72091328, "expected_md5": "62385f5b6cf87e7aa039cdc542a9e55c", "cache_path": "v1.0/en/docs/docs-53.jsonl.gz" }, "v1.0/en/corpus/54": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-54.jsonl.gz", "size_hint": 72782654, "expected_md5": "58f025c4ef8d3868e5eabef9d39395dd", "cache_path": "v1.0/en/docs/docs-54.jsonl.gz" }, "v1.0/en/corpus/55": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-55.jsonl.gz", "size_hint": 70569978, "expected_md5": "5f3f8787af6baf5a85ea3cf06698af38", "cache_path": "v1.0/en/docs/docs-55.jsonl.gz" }, "v1.0/en/corpus/56": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-56.jsonl.gz", "size_hint": 71740868, "expected_md5": "eee9cccdcd05cc89ef8e5e2fa9fff7ac", "cache_path": "v1.0/en/docs/docs-56.jsonl.gz" }, "v1.0/en/corpus/57": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-57.jsonl.gz", "size_hint": 72217620, "expected_md5": 
"4213cb413ca6079920f0b0241bad7b56", "cache_path": "v1.0/en/docs/docs-57.jsonl.gz" }, "v1.0/en/corpus/58": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-58.jsonl.gz", "size_hint": 71862927, "expected_md5": "38de32e0b8a9723dbcaa83d23e0e8d6d", "cache_path": "v1.0/en/docs/docs-58.jsonl.gz" }, "v1.0/en/corpus/59": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-59.jsonl.gz", "size_hint": 71495670, "expected_md5": "0a6a80e46a434776ec2ff67bc6a28e6e", "cache_path": "v1.0/en/docs/docs-59.jsonl.gz" }, "v1.0/en/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-5.jsonl.gz", "size_hint": 91542927, "expected_md5": "2ae2b6919cbea03f03469633690acfd1", "cache_path": "v1.0/en/docs/docs-5.jsonl.gz" }, "v1.0/en/corpus/60": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-60.jsonl.gz", "size_hint": 69897224, "expected_md5": "a583b681b03b9637a2b80fb6e4e35afa", "cache_path": "v1.0/en/docs/docs-60.jsonl.gz" }, "v1.0/en/corpus/61": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-61.jsonl.gz", "size_hint": 70170718, "expected_md5": "3df26a61e7ecb4c2244376041d7b4e1c", "cache_path": "v1.0/en/docs/docs-61.jsonl.gz" }, "v1.0/en/corpus/62": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-62.jsonl.gz", "size_hint": 69279270, "expected_md5": "739d42eb5157373333a3c978eec150b1", "cache_path": "v1.0/en/docs/docs-62.jsonl.gz" }, "v1.0/en/corpus/63": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-63.jsonl.gz", "size_hint": 68754531, "expected_md5": "0b25fcfc2a77bd551cb32d763a83b49d", "cache_path": "v1.0/en/docs/docs-63.jsonl.gz" }, "v1.0/en/corpus/64": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-64.jsonl.gz", "size_hint": 65487314, "expected_md5": "2391aec604153b8e4b78cd6ae83decd4", "cache_path": "v1.0/en/docs/docs-64.jsonl.gz" }, "v1.0/en/corpus/65": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-65.jsonl.gz", "size_hint": 52897612, "expected_md5": "9122a610d9b816975e93b19ae0121f46", "cache_path": "v1.0/en/docs/docs-65.jsonl.gz" }, "v1.0/en/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-6.jsonl.gz", "size_hint": 88919126, "expected_md5": "7543b1726742ce466cd2a2a57f765e62", "cache_path": "v1.0/en/docs/docs-6.jsonl.gz" }, "v1.0/en/corpus/7": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-7.jsonl.gz", "size_hint": 88149943, "expected_md5": "c9ac416afef48a6f7d9dcc66ddeff2d5", "cache_path": "v1.0/en/docs/docs-7.jsonl.gz" }, "v1.0/en/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-8.jsonl.gz", "size_hint": 87557556, "expected_md5": "96f9fd6a228e9f46bfdb4eda0ffd6b90", "cache_path": "v1.0/en/docs/docs-8.jsonl.gz" }, "v1.0/en/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-en/docs-9.jsonl.gz", "size_hint": 86565246, "expected_md5": "6d3d37aac292937e93dbe0eb0efb1c00", "cache_path": "v1.0/en/docs/docs-9.jsonl.gz" }, "v1.0/es/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-0.jsonl.gz", "size_hint": 92874838, "expected_md5": "f28bff7c3058d2cb8c9e99b919de6c4f", "cache_path": "v1.0/es/docs/docs-0.jsonl.gz" }, "v1.0/es/corpus/10": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-10.jsonl.gz", "size_hint": 62622594, "expected_md5": 
"c71af44961b7e0fd33c49fe1f61039bf", "cache_path": "v1.0/es/docs/docs-10.jsonl.gz" }, "v1.0/es/corpus/11": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-11.jsonl.gz", "size_hint": 65707558, "expected_md5": "0bd1d3ab02269c7ee2207ff39fe544e5", "cache_path": "v1.0/es/docs/docs-11.jsonl.gz" }, "v1.0/es/corpus/12": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-12.jsonl.gz", "size_hint": 72601971, "expected_md5": "e94562d6f2df0a1c84075c58cb035883", "cache_path": "v1.0/es/docs/docs-12.jsonl.gz" }, "v1.0/es/corpus/13": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-13.jsonl.gz", "size_hint": 75474457, "expected_md5": "19509077dc27fc650f134223b385a5e7", "cache_path": "v1.0/es/docs/docs-13.jsonl.gz" }, "v1.0/es/corpus/14": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-14.jsonl.gz", "size_hint": 76497338, "expected_md5": "deb94c634d4c1d85e78bb2ff35028383", "cache_path": "v1.0/es/docs/docs-14.jsonl.gz" }, "v1.0/es/corpus/15": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-15.jsonl.gz", "size_hint": 73009124, "expected_md5": "02a780bf6a3a31bfdfc759ec403403bc", "cache_path": "v1.0/es/docs/docs-15.jsonl.gz" }, "v1.0/es/corpus/16": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-16.jsonl.gz", "size_hint": 74206192, "expected_md5": "748934c50ae05d7273a2c470b40d6e89", "cache_path": "v1.0/es/docs/docs-16.jsonl.gz" }, "v1.0/es/corpus/17": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-17.jsonl.gz", "size_hint": 67140050, "expected_md5": "62ff853bfc22d96f4ecae8e8e1c86caa", "cache_path": "v1.0/es/docs/docs-17.jsonl.gz" }, "v1.0/es/corpus/18": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-18.jsonl.gz", "size_hint": 77883748, "expected_md5": "81e79b7ee32173601e71fd9173bf62b3", "cache_path": "v1.0/es/docs/docs-18.jsonl.gz" }, "v1.0/es/corpus/19": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-19.jsonl.gz", "size_hint": 79904116, "expected_md5": "fa6bb29881142b6df1a1049aafa864de", "cache_path": "v1.0/es/docs/docs-19.jsonl.gz" }, "v1.0/es/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-1.jsonl.gz", "size_hint": 89646018, "expected_md5": "1c08c92860d8fa535b545eb530b27dbd", "cache_path": "v1.0/es/docs/docs-1.jsonl.gz" }, "v1.0/es/corpus/20": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-20.jsonl.gz", "size_hint": 57162245, "expected_md5": "b561242ee7bc55e24cd21cd1a48a256c", "cache_path": "v1.0/es/docs/docs-20.jsonl.gz" }, "v1.0/es/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-2.jsonl.gz", "size_hint": 85432948, "expected_md5": "fb75e0db47df90f0135df0acba637196", "cache_path": "v1.0/es/docs/docs-2.jsonl.gz" }, "v1.0/es/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-3.jsonl.gz", "size_hint": 83318741, "expected_md5": "c8ae25908e11c995e3d2a822c6dd8ffc", "cache_path": "v1.0/es/docs/docs-3.jsonl.gz" }, "v1.0/es/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-4.jsonl.gz", "size_hint": 80843535, "expected_md5": "671420453f20ef32a35cbf2c9f9c1310", "cache_path": "v1.0/es/docs/docs-4.jsonl.gz" }, "v1.0/es/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-5.jsonl.gz", "size_hint": 79438738, "expected_md5": 
"b907c427cf796810f28ee9375f91bcdf", "cache_path": "v1.0/es/docs/docs-5.jsonl.gz" }, "v1.0/es/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-6.jsonl.gz", "size_hint": 79152845, "expected_md5": "392e4b93c786551799fab1c2f5f161d8", "cache_path": "v1.0/es/docs/docs-6.jsonl.gz" }, "v1.0/es/corpus/7": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-7.jsonl.gz", "size_hint": 78544890, "expected_md5": "f8cf2f4c34b28ca166143f5b94349500", "cache_path": "v1.0/es/docs/docs-7.jsonl.gz" }, "v1.0/es/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-8.jsonl.gz", "size_hint": 77588034, "expected_md5": "d7f1edf8f756009db26deb49bf22f7ce", "cache_path": "v1.0/es/docs/docs-8.jsonl.gz" }, "v1.0/es/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-es/docs-9.jsonl.gz", "size_hint": 66337959, "expected_md5": "456ec112182b917d91d9322cf00786d2", "cache_path": "v1.0/es/docs/docs-9.jsonl.gz" }, "v1.0/fa/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fa/docs-0.jsonl.gz", "size_hint": 80831146, "expected_md5": "6b1135045aaa46c539a2960cda7c3b0a", "cache_path": "v1.0/fa/docs/docs-0.jsonl.gz" }, "v1.0/fa/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fa/docs-1.jsonl.gz", "size_hint": 56314013, "expected_md5": "a86e85011229e984dc44728baf46eebb", "cache_path": "v1.0/fa/docs/docs-1.jsonl.gz" }, "v1.0/fa/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fa/docs-2.jsonl.gz", "size_hint": 53167889, "expected_md5": "88ac988b8b61287bd55f9b977532ac5e", "cache_path": "v1.0/fa/docs/docs-2.jsonl.gz" }, "v1.0/fa/corpus/3": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fa/docs-3.jsonl.gz", "size_hint": 57027208, "expected_md5": "2e0030f301e5af7723dff4dce14f5343", "cache_path": "v1.0/fa/docs/docs-3.jsonl.gz" }, "v1.0/fa/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fa/docs-4.jsonl.gz", "size_hint": 22422574, "expected_md5": "dc78362be19963c12fd511c07d1386b3", "cache_path": "v1.0/fa/docs/docs-4.jsonl.gz" }, "v1.0/fi/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fi/docs-0.jsonl.gz", "size_hint": 79712094, "expected_md5": "1c7d419a877fd406db8321e56600e6c5", "cache_path": "v1.0/fi/docs/docs-0.jsonl.gz" }, "v1.0/fi/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fi/docs-1.jsonl.gz", "size_hint": 71207705, "expected_md5": "f45db81107c579315d0e5a9d2f33a57f", "cache_path": "v1.0/fi/docs/docs-1.jsonl.gz" }, "v1.0/fi/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fi/docs-2.jsonl.gz", "size_hint": 68005963, "expected_md5": "f6d202472a36d443f5e214a6de92b515", "cache_path": "v1.0/fi/docs/docs-2.jsonl.gz" }, "v1.0/fi/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fi/docs-3.jsonl.gz", "size_hint": 51120372, "expected_md5": "1ea7920d58d75a309462585f3fa479cf", "cache_path": "v1.0/fi/docs/docs-3.jsonl.gz" }, "v1.0/fr/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-0.jsonl.gz", "size_hint": 81033537, "expected_md5": "fa62c438d74f8ac9501db7af6aaca817", "cache_path": "v1.0/fr/docs/docs-0.jsonl.gz" }, "v1.0/fr/corpus/10": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-10.jsonl.gz", "size_hint": 67698213, "expected_md5": "8816919a5cfb523ac75579453dde6d47", 
"cache_path": "v1.0/fr/docs/docs-10.jsonl.gz" }, "v1.0/fr/corpus/11": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-11.jsonl.gz", "size_hint": 67898616, "expected_md5": "642128fb3b08f35d7c8d34f6994d98cc", "cache_path": "v1.0/fr/docs/docs-11.jsonl.gz" }, "v1.0/fr/corpus/12": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-12.jsonl.gz", "size_hint": 66768800, "expected_md5": "daa9926bdf888aed05dbba8aafa419dd", "cache_path": "v1.0/fr/docs/docs-12.jsonl.gz" }, "v1.0/fr/corpus/13": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-13.jsonl.gz", "size_hint": 65451686, "expected_md5": "12afc85cd7ddda45c3f5d3f199b596a7", "cache_path": "v1.0/fr/docs/docs-13.jsonl.gz" }, "v1.0/fr/corpus/14": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-14.jsonl.gz", "size_hint": 65202296, "expected_md5": "46a9acd1b09a81701b89e8b1859bf30c", "cache_path": "v1.0/fr/docs/docs-14.jsonl.gz" }, "v1.0/fr/corpus/15": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-15.jsonl.gz", "size_hint": 62889747, "expected_md5": "3968784422a38c29653c1636938442de", "cache_path": "v1.0/fr/docs/docs-15.jsonl.gz" }, "v1.0/fr/corpus/16": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-16.jsonl.gz", "size_hint": 62186443, "expected_md5": "a4b18549cb5bed8365442e5a434c18c0", "cache_path": "v1.0/fr/docs/docs-16.jsonl.gz" }, "v1.0/fr/corpus/17": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-17.jsonl.gz", "size_hint": 61659176, "expected_md5": "245749a12a976b0a11bac0a8fcc61b92", "cache_path": "v1.0/fr/docs/docs-17.jsonl.gz" }, "v1.0/fr/corpus/18": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-18.jsonl.gz", "size_hint": 61297567, "expected_md5": "2f81457bb7058c4708e15d73858dfc86", "cache_path": "v1.0/fr/docs/docs-18.jsonl.gz" }, "v1.0/fr/corpus/19": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-19.jsonl.gz", "size_hint": 62810856, "expected_md5": "96309f23dc95423c831a2ccf30b97ec3", "cache_path": "v1.0/fr/docs/docs-19.jsonl.gz" }, "v1.0/fr/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-1.jsonl.gz", "size_hint": 78469930, "expected_md5": "34262e282c58c31e965d89274df96e40", "cache_path": "v1.0/fr/docs/docs-1.jsonl.gz" }, "v1.0/fr/corpus/20": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-20.jsonl.gz", "size_hint": 62973327, "expected_md5": "db4a22111c1b3dbbbbac2d71531dd631", "cache_path": "v1.0/fr/docs/docs-20.jsonl.gz" }, "v1.0/fr/corpus/21": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-21.jsonl.gz", "size_hint": 61427017, "expected_md5": "f72f7bb7c14ef8a450715f8a952b2816", "cache_path": "v1.0/fr/docs/docs-21.jsonl.gz" }, "v1.0/fr/corpus/22": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-22.jsonl.gz", "size_hint": 58091930, "expected_md5": "666b5e7ddcc0918febb3331a14b3d612", "cache_path": "v1.0/fr/docs/docs-22.jsonl.gz" }, "v1.0/fr/corpus/23": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-23.jsonl.gz", "size_hint": 61344070, "expected_md5": "f9c3316e6407e17206a1eedacc6b776b", "cache_path": "v1.0/fr/docs/docs-23.jsonl.gz" }, "v1.0/fr/corpus/24": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-24.jsonl.gz", "size_hint": 61250359, "expected_md5": 
"d32c6a0cf9b59bba24774e4fd4f0691e", "cache_path": "v1.0/fr/docs/docs-24.jsonl.gz" }, "v1.0/fr/corpus/25": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-25.jsonl.gz", "size_hint": 56044622, "expected_md5": "a110c2681991390092e2e4920e28abf7", "cache_path": "v1.0/fr/docs/docs-25.jsonl.gz" }, "v1.0/fr/corpus/26": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-26.jsonl.gz", "size_hint": 64189087, "expected_md5": "2eb3482ab52c21ebf16c22184a4816c4", "cache_path": "v1.0/fr/docs/docs-26.jsonl.gz" }, "v1.0/fr/corpus/27": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-27.jsonl.gz", "size_hint": 65662563, "expected_md5": "d2c4f0443b2ccd875b45b5519fa6188c", "cache_path": "v1.0/fr/docs/docs-27.jsonl.gz" }, "v1.0/fr/corpus/28": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-28.jsonl.gz", "size_hint": 65310822, "expected_md5": "5f66fb25b1d295614b8a92c4f9636d05", "cache_path": "v1.0/fr/docs/docs-28.jsonl.gz" }, "v1.0/fr/corpus/29": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-29.jsonl.gz", "size_hint": 18176293, "expected_md5": "1f4c400a40b7020c797f105457e451d2", "cache_path": "v1.0/fr/docs/docs-29.jsonl.gz" }, "v1.0/fr/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-2.jsonl.gz", "size_hint": 70017319, "expected_md5": "a051dc99b9ffb81c1eaf394627064927", "cache_path": "v1.0/fr/docs/docs-2.jsonl.gz" }, "v1.0/fr/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-3.jsonl.gz", "size_hint": 66942793, "expected_md5": "f8a18f6ae5b2ebc58a39791469c38bdd", "cache_path": "v1.0/fr/docs/docs-3.jsonl.gz" }, "v1.0/fr/corpus/4": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-4.jsonl.gz", "size_hint": 59460892, "expected_md5": "5a240afc021ecb5dc27285abe4b6d923", "cache_path": "v1.0/fr/docs/docs-4.jsonl.gz" }, "v1.0/fr/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-5.jsonl.gz", "size_hint": 62629125, "expected_md5": "a03ba537c9d68ecef50aa5a4777407cc", "cache_path": "v1.0/fr/docs/docs-5.jsonl.gz" }, "v1.0/fr/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-6.jsonl.gz", "size_hint": 69205967, "expected_md5": "499367e79f29e565cf9bfcbe3ab2adea", "cache_path": "v1.0/fr/docs/docs-6.jsonl.gz" }, "v1.0/fr/corpus/7": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-7.jsonl.gz", "size_hint": 68194158, "expected_md5": "652114ff90f8cf484c8701e5f8ea5d02", "cache_path": "v1.0/fr/docs/docs-7.jsonl.gz" }, "v1.0/fr/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-8.jsonl.gz", "size_hint": 68757697, "expected_md5": "0d7e5b3231ab67d6026f8237bcf0be0f", "cache_path": "v1.0/fr/docs/docs-8.jsonl.gz" }, "v1.0/fr/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-fr/docs-9.jsonl.gz", "size_hint": 68056019, "expected_md5": "f916ce6802d20b4c3f9905363d3bb31f", "cache_path": "v1.0/fr/docs/docs-9.jsonl.gz" }, "v1.0/hi/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-hi/docs-0.jsonl.gz", "size_hint": 95650487, "expected_md5": "ffefe282e6be09f06a69d2945f402bda", "cache_path": "v1.0/hi/docs/docs-0.jsonl.gz" }, "v1.0/hi/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-hi/docs-1.jsonl.gz", "size_hint": 1161610, "expected_md5": "3fe3f9f265b2f52a25b4a49b80c82cf5", 
"cache_path": "v1.0/hi/docs/docs-1.jsonl.gz" }, "v1.0/id/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-id/docs-0.jsonl.gz", "size_hint": 68482339, "expected_md5": "e2ff61190833252d531dea1f90d90426", "cache_path": "v1.0/id/docs/docs-0.jsonl.gz" }, "v1.0/id/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-id/docs-1.jsonl.gz", "size_hint": 39714366, "expected_md5": "ffe27c80bfcb8061954d64282cb0f71c", "cache_path": "v1.0/id/docs/docs-1.jsonl.gz" }, "v1.0/id/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-id/docs-2.jsonl.gz", "size_hint": 61386913, "expected_md5": "c85ce23396c8c0e61dc11806c32495e2", "cache_path": "v1.0/id/docs/docs-2.jsonl.gz" }, "v1.0/ja/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-0.jsonl.gz", "size_hint": 87302446, "expected_md5": "f064d7aaea82b1619d1709c36483c461", "cache_path": "v1.0/ja/docs/docs-0.jsonl.gz" }, "v1.0/ja/corpus/10": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-10.jsonl.gz", "size_hint": 74375206, "expected_md5": "651d84515e3f23a7dfebbb9dd1a97e05", "cache_path": "v1.0/ja/docs/docs-10.jsonl.gz" }, "v1.0/ja/corpus/11": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-11.jsonl.gz", "size_hint": 73261365, "expected_md5": "88f8bb6f3a702f0228523a02bc913e55", "cache_path": "v1.0/ja/docs/docs-11.jsonl.gz" }, "v1.0/ja/corpus/12": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-12.jsonl.gz", "size_hint": 71472211, "expected_md5": "2e5156a5f0580f3b2aae763a55d323f3", "cache_path": "v1.0/ja/docs/docs-12.jsonl.gz" }, "v1.0/ja/corpus/13": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-13.jsonl.gz", "size_hint": 64646621, "expected_md5": "3a080cb1f0e82843903c300ea3330eef", "cache_path": "v1.0/ja/docs/docs-13.jsonl.gz" }, "v1.0/ja/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-1.jsonl.gz", "size_hint": 84310580, "expected_md5": "903c2bc115f8ef719b3d0e21c2b13daf", "cache_path": "v1.0/ja/docs/docs-1.jsonl.gz" }, "v1.0/ja/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-2.jsonl.gz", "size_hint": 82263425, "expected_md5": "57245e8a46d1cf289bc5377fd0d54918", "cache_path": "v1.0/ja/docs/docs-2.jsonl.gz" }, "v1.0/ja/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-3.jsonl.gz", "size_hint": 79819291, "expected_md5": "9771ca5370e49dbcae101650293e0cbc", "cache_path": "v1.0/ja/docs/docs-3.jsonl.gz" }, "v1.0/ja/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-4.jsonl.gz", "size_hint": 76369546, "expected_md5": "eb4abc18ea71876d8a974813701e52d2", "cache_path": "v1.0/ja/docs/docs-4.jsonl.gz" }, "v1.0/ja/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-5.jsonl.gz", "size_hint": 73543156, "expected_md5": "aeceab57d0e29f028cdad7724eefae2b", "cache_path": "v1.0/ja/docs/docs-5.jsonl.gz" }, "v1.0/ja/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-6.jsonl.gz", "size_hint": 75527173, "expected_md5": "f2e9f77af251bff8b8dbac3f0972abd9", "cache_path": "v1.0/ja/docs/docs-6.jsonl.gz" }, "v1.0/ja/corpus/7": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-7.jsonl.gz", "size_hint": 75265643, "expected_md5": "81e4e762a82d13fefd8d453846f286ac", 
"cache_path": "v1.0/ja/docs/docs-7.jsonl.gz" }, "v1.0/ja/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-8.jsonl.gz", "size_hint": 74070028, "expected_md5": "7f008e134c6acbf5ea2aeee2f2b16157", "cache_path": "v1.0/ja/docs/docs-8.jsonl.gz" }, "v1.0/ja/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ja/docs-9.jsonl.gz", "size_hint": 72893709, "expected_md5": "c354f0d766ff28950b2aea0227141b5c", "cache_path": "v1.0/ja/docs/docs-9.jsonl.gz" }, "v1.0/ko/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ko/docs-0.jsonl.gz", "size_hint": 87965596, "expected_md5": "dcae1fe2c7c966f4bd92c09aa9fba88f", "cache_path": "v1.0/ko/docs/docs-0.jsonl.gz" }, "v1.0/ko/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ko/docs-1.jsonl.gz", "size_hint": 75422723, "expected_md5": "8068bb2ca5796bbca7af267527518d86", "cache_path": "v1.0/ko/docs/docs-1.jsonl.gz" }, "v1.0/ko/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ko/docs-2.jsonl.gz", "size_hint": 62582229, "expected_md5": "ea72e30dd4ed336b37422f559f60d4ed", "cache_path": "v1.0/ko/docs/docs-2.jsonl.gz" }, "v1.0/ru/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-0.jsonl.gz", "size_hint": 100436081, "expected_md5": "4373a43259b241792bcb6e979943db29", "cache_path": "v1.0/ru/docs/docs-0.jsonl.gz" }, "v1.0/ru/corpus/10": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-10.jsonl.gz", "size_hint": 75656001, "expected_md5": "57b0d428b8d0004387eaa507e2336d79", "cache_path": "v1.0/ru/docs/docs-10.jsonl.gz" }, "v1.0/ru/corpus/11": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-11.jsonl.gz", "size_hint": 78271034, "expected_md5": "da7c457c057cb525ce57980a7dde0989", "cache_path": "v1.0/ru/docs/docs-11.jsonl.gz" }, "v1.0/ru/corpus/12": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-12.jsonl.gz", "size_hint": 76539873, "expected_md5": "ae23ce2efd5216008d5bd4d353b844ac", "cache_path": "v1.0/ru/docs/docs-12.jsonl.gz" }, "v1.0/ru/corpus/13": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-13.jsonl.gz", "size_hint": 78171375, "expected_md5": "6edf74f3a33c6de035881ad70bb98cb8", "cache_path": "v1.0/ru/docs/docs-13.jsonl.gz" }, "v1.0/ru/corpus/14": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-14.jsonl.gz", "size_hint": 78271662, "expected_md5": "ec732d5a4a36a0cee67f631f9b7ca976", "cache_path": "v1.0/ru/docs/docs-14.jsonl.gz" }, "v1.0/ru/corpus/15": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-15.jsonl.gz", "size_hint": 79432861, "expected_md5": "117e3c9b6c2538144717a22319c8c701", "cache_path": "v1.0/ru/docs/docs-15.jsonl.gz" }, "v1.0/ru/corpus/16": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-16.jsonl.gz", "size_hint": 81415343, "expected_md5": "cae1d225074503572b377254d74f90c3", "cache_path": "v1.0/ru/docs/docs-16.jsonl.gz" }, "v1.0/ru/corpus/17": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-17.jsonl.gz", "size_hint": 81955644, "expected_md5": "e7468fe6203f581175c2527bdc6eb5ed", "cache_path": "v1.0/ru/docs/docs-17.jsonl.gz" }, "v1.0/ru/corpus/18": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-18.jsonl.gz", "size_hint": 81041807, "expected_md5": 
"c6caaa99bf8c2746a88d5f2d0dae6a49", "cache_path": "v1.0/ru/docs/docs-18.jsonl.gz" }, "v1.0/ru/corpus/19": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-19.jsonl.gz", "size_hint": 6749385, "expected_md5": "4ddfc2b723b470597caa1a8543d5ee9f", "cache_path": "v1.0/ru/docs/docs-19.jsonl.gz" }, "v1.0/ru/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-1.jsonl.gz", "size_hint": 98036311, "expected_md5": "5283c42d6542e5c8dc2c1be5789f0b7a", "cache_path": "v1.0/ru/docs/docs-1.jsonl.gz" }, "v1.0/ru/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-2.jsonl.gz", "size_hint": 90050391, "expected_md5": "e2a7250e04a82f5f257b48d3950410c7", "cache_path": "v1.0/ru/docs/docs-2.jsonl.gz" }, "v1.0/ru/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-3.jsonl.gz", "size_hint": 89443294, "expected_md5": "e0c6f72fbbb5ebb418387b9c9a22696f", "cache_path": "v1.0/ru/docs/docs-3.jsonl.gz" }, "v1.0/ru/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-4.jsonl.gz", "size_hint": 82032411, "expected_md5": "0b351f8491f1e8a68db00bd56de41862", "cache_path": "v1.0/ru/docs/docs-4.jsonl.gz" }, "v1.0/ru/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-5.jsonl.gz", "size_hint": 88457515, "expected_md5": "fe82dd5ec8fede641829e8acb71da04a", "cache_path": "v1.0/ru/docs/docs-5.jsonl.gz" }, "v1.0/ru/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-6.jsonl.gz", "size_hint": 81102964, "expected_md5": "081133522ff711ed40ae47c8fd3c56e2", "cache_path": "v1.0/ru/docs/docs-6.jsonl.gz" }, "v1.0/ru/corpus/7": { "url": 
"https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-7.jsonl.gz", "size_hint": 80229752, "expected_md5": "9c1297361dce8919649a09f7c72d49bb", "cache_path": "v1.0/ru/docs/docs-7.jsonl.gz" }, "v1.0/ru/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-8.jsonl.gz", "size_hint": 74478227, "expected_md5": "e488d2d37394edf07cee425dc758c11a", "cache_path": "v1.0/ru/docs/docs-8.jsonl.gz" }, "v1.0/ru/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-ru/docs-9.jsonl.gz", "size_hint": 73515957, "expected_md5": "51b4da27835ca5febda7848d6754ecac", "cache_path": "v1.0/ru/docs/docs-9.jsonl.gz" }, "v1.0/sw/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-sw/docs-0.jsonl.gz", "size_hint": 10199394, "expected_md5": "afdb146539d0f488f14c2833e14df799", "cache_path": "v1.0/sw/docs/docs-0.jsonl.gz" }, "v1.0/te/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-te/docs-0.jsonl.gz", "size_hint": 68858140, "expected_md5": "7b1067004a12a018eb4ca9e7e7655600", "cache_path": "v1.0/te/docs/docs-0.jsonl.gz" }, "v1.0/te/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-te/docs-1.jsonl.gz", "size_hint": 3454223, "expected_md5": "703a0ac0d66c6fa6b684df9d7d6afd20", "cache_path": "v1.0/te/docs/docs-1.jsonl.gz" }, "v1.0/th/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-th/docs-0.jsonl.gz", "size_hint": 101610412, "expected_md5": "060bf4cb7fa0127a571023709f117554", "cache_path": "v1.0/th/docs/docs-0.jsonl.gz" }, "v1.0/th/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-th/docs-1.jsonl.gz", "size_hint": 8027002, "expected_md5": "2e05644bee9ab5da8f7f7177f0c7bafc", 
"cache_path": "v1.0/th/docs/docs-1.jsonl.gz" }, "v1.0/yo/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-yo/docs-0.jsonl.gz", "size_hint": 10946, "expected_md5": "07bab606c31660be729353419ff6537d", "cache_path": "v1.0/yo/docs/docs-1.jsonl.gz" }, "v1.0/zh/corpus/0": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-0.jsonl.gz", "size_hint": 97061486, "expected_md5": "68248e7866b0571e750140a4baf31ad8", "cache_path": "v1.0/zh/docs/docs-0.jsonl.gz" }, "v1.0/zh/corpus/1": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-1.jsonl.gz", "size_hint": 85538355, "expected_md5": "c3bc8bd34a53d9589c29f580b94c2d17", "cache_path": "v1.0/zh/docs/docs-1.jsonl.gz" }, "v1.0/zh/corpus/2": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-2.jsonl.gz", "size_hint": 80287705, "expected_md5": "eefbf18d0e8edcee9db00a748ce08f86", "cache_path": "v1.0/zh/docs/docs-2.jsonl.gz" }, "v1.0/zh/corpus/3": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-3.jsonl.gz", "size_hint": 76098139, "expected_md5": "cc7924a7bacc7abd0974fd07168064d1", "cache_path": "v1.0/zh/docs/docs-3.jsonl.gz" }, "v1.0/zh/corpus/4": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-4.jsonl.gz", "size_hint": 60594024, "expected_md5": "18879a45e3b27f0b0b588e68a697e282", "cache_path": "v1.0/zh/docs/docs-4.jsonl.gz" }, "v1.0/zh/corpus/5": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-5.jsonl.gz", "size_hint": 50492481, "expected_md5": "be7e9f9b61c23306e6feedb2e26aab20", "cache_path": "v1.0/zh/docs/docs-5.jsonl.gz" }, "v1.0/zh/corpus/6": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-6.jsonl.gz", 
"size_hint": 76002940, "expected_md5": "6a6359359436b4f89c5be50d856e625d", "cache_path": "v1.0/zh/docs/docs-6.jsonl.gz" }, "v1.0/zh/corpus/7": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-7.jsonl.gz", "size_hint": 71709349, "expected_md5": "06aa37a25a84c6830ae5fb57ff877b05", "cache_path": "v1.0/zh/docs/docs-7.jsonl.gz" }, "v1.0/zh/corpus/8": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-8.jsonl.gz", "size_hint": 72069621, "expected_md5": "3ec702e19ef98ed6c9f639c936e293cd", "cache_path": "v1.0/zh/docs/docs-8.jsonl.gz" }, "v1.0/zh/corpus/9": { "url": "https://huggingface.co/datasets/miracl/miracl-corpus/resolve/main/miracl-corpus-v1.0-zh/docs-9.jsonl.gz", "size_hint": 63401726, "expected_md5": "841c4f70b603aee993451b3b75ab5236", "cache_path": "v1.0/zh/docs/docs-9.jsonl.gz" } }, "mmarco": { "zh/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/chinese_collection.tsv", "expected_md5": "d5672e88206038d8a3d261987b3bd788", "size_hint": 2227628877, "cache_path": "chinese_collection.tsv" }, "zh/queries/dev": { "url": "https://macavaney.us/files/mmarco/83820cac1d3e27a7c911d5116ebce558", "expected_md5": "83820cac1d3e27a7c911d5116ebce558", "size_hint": 4404468, "cache_path": "chinese_queries.dev.tsv" }, "zh/queries/dev/v1.1": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/chinese_queries.dev.tsv", "expected_md5": "9ea0d3e92aaf87d65c07c297893d0ff6", "size_hint": 4410653, "cache_path": "chinese_queries.dev.v1.1.tsv" }, "zh/scoreddocs/dev/v1.1": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_chinese-trec.txt", "expected_md5": "36655599b6d0d0ae95dd49baa2e15acc", "size_hint": 41505364, "cache_path": "run.bm25_chinese-trec.txt" }, "zh/queries/train": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/chinese_queries.train.tsv", "expected_md5": "21649d058877379bc8184923ab8ec408", "size_hint": 35231242, "cache_path": "chinese_queries.train.tsv" }, "fr/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/french_collection.tsv", "expected_md5": "941a8e717efb1ab74b9017976c08f73a", "size_hint": 3469351128, "cache_path": "french_collection.tsv" }, "fr/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/french_queries.dev.tsv", "expected_md5": "a86c46314a0abfb090c542e672bb66e2", "size_hint": 5523530, "cache_path": "french_queries.dev.tsv" }, "fr/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_french-trec.txt", "expected_md5": "44df2483ab31dd269f99e0f2e925df29", "size_hint": 272726176, "cache_path": "run.bm25_french-trec.txt" }, "fr/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/french_queries.train.tsv", "expected_md5": "4c9f45b9c7497d6efb44e593d2f5af4d", "size_hint": 43700586, "cache_path": "french_queries.train.tsv" }, "de/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/german_collection.tsv", "expected_md5": "b0d40bb296c3ec903926243e8397560e", "size_hint": 3417960916, "cache_path": "german_collection.tsv" }, "de/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/german_queries.dev.tsv", "expected_md5": "8d6383b34cd332f5f21c70c3e6a97579", "size_hint": 5039578, "cache_path": "german_queries.dev.tsv" }, "de/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_german-trec.txt", "expected_md5": "2e9fd71f8bb9770ef86971cacf7e9119", "size_hint": 
264681166, "cache_path": "run.bm25_german-trec.txt" }, "de/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/german_queries.train.tsv", "expected_md5": "711b9e9c2163fe07468d6303bbf038f8", "size_hint": 39894960, "cache_path": "german_queries.train.tsv" }, "id/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/indonesian_collection.tsv", "expected_md5": "5e9fd243bfcf160a1177c898796ae53e", "size_hint": 3118483579, "cache_path": "indonesian_collection.tsv" }, "id/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/indonesian_queries.dev.tsv", "expected_md5": "3d5f2261edb985d4fe052ed9e379a42a", "size_hint": 4693330, "cache_path": "indonesian_queries.dev.tsv" }, "id/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_indonesian-trec.txt", "expected_md5": "4c8b58801af5d691f489b3bb4765ec79", "size_hint": 275468922, "cache_path": "run.bm25_indonesian-trec.txt" }, "id/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/indonesian_queries.train.tsv", "expected_md5": "e74bc49a64adf32d105eeace23dc1a58", "size_hint": 37170090, "cache_path": "indonesian_queries.train.tsv" }, "it/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/italian_collection.tsv", "expected_md5": "c87e107fdd99f78a8789c984c96f3e46", "size_hint": 3318354977, "cache_path": "italian_collection.tsv" }, "it/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/italian_queries.dev.tsv", "expected_md5": "d9df4b10d892288c81611e7ed74e1549", "size_hint": 5707295, "cache_path": "italian_queries.dev.tsv" }, "it/scoreddocs/dev": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_italian-trec.txt", "expected_md5": "b1635aa57fb329b441cab2bda35c6883", "size_hint": 280099315, "cache_path": "run.bm25_italian-trec.txt" }, "it/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/italian_queries.train.tsv", "expected_md5": "2aa7e48decfbfbaa327aa01f0d16bc3f", "size_hint": 45148915, "cache_path": "italian_queries.train.tsv" }, "ru/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/russian_collection.tsv", "expected_md5": "27f64ca50b1862d285c53a1bcdc26793", "size_hint": 5500129241, "cache_path": "russian_collection.tsv" }, "ru/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/russian_queries.dev.tsv", "expected_md5": "4d803553d69f967764360570c3e66c84", "size_hint": 8539431, "cache_path": "russian_queries.dev.tsv" }, "ru/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_russian-trec.txt", "expected_md5": "1f8daf67b9624146dbe5a392000f78f3", "size_hint": 279699867, "cache_path": "run.bm25_russian-trec.txt" }, "ru/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/russian_queries.train.tsv", "expected_md5": "e4c09d563124569a632780c3ed1179b2", "size_hint": 67849583, "cache_path": "russian_queries.train.tsv" }, "es/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/spanish_collection.tsv", "expected_md5": "c82d3e5998f4cefb5a730a337680fac0", "size_hint": 3382637867, "cache_path": "spanish_collection.tsv" }, "es/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/spanish_queries.dev.tsv", "expected_md5": "1359d9516bf5dd9bf732012a25e7f536", 
"size_hint": 5294892, "cache_path": "spanish_queries.dev.tsv" }, "es/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_spanish-trec.txt", "expected_md5": "fe545532f65b952d538ac6bda169c196", "size_hint": 272702362, "cache_path": "run.bm25_spanish-trec.txt" }, "es/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/spanish_queries.train.tsv", "expected_md5": "676e7011d020383422556d2c4b39b67d", "size_hint": 41890459, "cache_path": "spanish_queries.train.tsv" }, "pt/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/collections/portuguese_collection.tsv", "expected_md5": "77260081f0332befa6aa3e6c922b8fb9", "size_hint": 3182982454, "cache_path": "portuguese_collection.tsv" }, "pt/queries/dev": { "url": "https://macavaney.us/files/mmarco/262ce189e3d57059b2795c16db44bb81", "expected_md5": "262ce189e3d57059b2795c16db44bb81", "size_hint": 4962136, "cache_path": "portuguese_queries.dev.tsv" }, "pt/queries/train": { "url": "https://macavaney.us/files/mmarco/df2ed4ef0bdb93405ba276a92530fc03", "expected_md5": "df2ed4ef0bdb93405ba276a92530fc03", "size_hint": 39231205, "cache_path": "portuguese_queries.train.tsv" }, "pt/queries/dev/v1.1": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/dev/portuguese_queries.dev.tsv", "expected_md5": "4210db124ff1e3b7c803b9cb666c5e44", "size_hint": 4958298, "cache_path": "portuguese_queries.dev.v1.1.tsv" }, "pt/scoreddocs/dev/v1.1": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/runs/run.bm25_portuguese-trec.txt", "expected_md5": "8f8e6ecd4761bd2355f126ab289f57ca", "size_hint": 280708499, "cache_path": "run.bm25_portuguese-trec.txt" }, "pt/queries/train/v1.1": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/helsinki/queries/train/portuguese_queries.train.tsv", "expected_md5": "c253c476ff1ad1d51bae169cea180acd", "size_hint": 39210147, "cache_path": "portuguese_queries.train.v1.1.tsv" }, "train/triples": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/triples.train.ids.small.tsv", "expected_md5": "cc7865df9f2345132dea1c0746a4699c", "size_hint": 905211990, "cache_path": "triples.train.ids.small.tsv" }, "dev/qrels": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.tsv", "expected_md5": "9157ccaeaa8227f91722ba5770787b16", "size_hint": 1201626, "cache_path": "qrels.dev.tsv" }, "dev/qrels-small": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/qrels.dev.small.tsv", "expected_md5": "38a80559a561707ac2ec0f150ecd1e8a", "size_hint": 143300, "cache_path": "qrels.dev.small.tsv" }, "v2/ar/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/arabic_collection.tsv", "expected_md5": "b73406ce3a3d31edea240603d031be7a", "size_hint": 4664307196, "cache_path": "v2/arabic_collection.tsv" }, "v2/zh/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/chinese_collection.tsv", "expected_md5": "d176049b56f65bd49248003b9ea8b2b0", "size_hint": 2720255044, "cache_path": "v2/chinese_collection.tsv" }, "v2/dt/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/dutch_collection.tsv", "expected_md5": "4a29599f160d0a696c7c3d3010da1912", "size_hint": 3362632528, "cache_path": "v2/dutch_collection.tsv" }, "v2/fr/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/french_collection.tsv", "expected_md5": "44fd86303b47d2c8c2f9f547cd67686f", "size_hint": 3656218984, "cache_path": "v2/french_collection.tsv" }, "v2/de/docs": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/german_collection.tsv", "expected_md5": "dfedd22fef7e7e44966994e06cd7ec57", "size_hint": 3488187566, "cache_path": "v2/german_collection.tsv" }, "v2/hi/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/hindi_collection.tsv", "expected_md5": "4551a71b468cc109b1f985f6b1c3afe0", "size_hint": 7649320531, "cache_path": "v2/hindi_collection.tsv" }, "v2/id/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/indonesian_collection.tsv", "expected_md5": "d1ba5ff9788f9b11b497e3f75749409c", "size_hint": 3297783371, "cache_path": "v2/indonesian_collection.tsv" }, "v2/it/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/italian_collection.tsv", "expected_md5": "9c3aa4c4342074e8d37d75c9ffe5f22e", "size_hint": 3444092853, "cache_path": "v2/italian_collection.tsv" }, "v2/ja/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/japanese_collection.tsv", "expected_md5": "85614bd4dac3aa221c0b657d5cc71695", "size_hint": 3924982422, "cache_path": "v2/japanese_collection.tsv" }, "v2/pt/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/portuguese_collection.tsv", "expected_md5": "dde80fa2cc5782cae4c40d1127e51958", "size_hint": 3431011785, "cache_path": "v2/portuguese_collection.tsv" }, "v2/ru/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/russian_collection.tsv", "expected_md5": "9a0cef71748039a6112b0775592eb84d", "size_hint": 5769514997, "cache_path": "v2/russian_collection.tsv" }, "v2/es/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/spanish_collection.tsv", "expected_md5": "a9eac6b39239121795171da8c86db932", "size_hint": 
3571559558, "cache_path": "v2/spanish_collection.tsv" }, "v2/vi/docs": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/collections/vietnamese_collection.tsv", "expected_md5": "dd68020237857a508e7abe2250dad28b", "size_hint": 4140054533, "cache_path": "v2/vietnamese_collection.tsv" }, "v2/ar/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_arabic-msmarco.txt", "expected_md5": "fc130f6ba64e7d0029c7697525ab728c", "size_hint": 130628203, "cache_path": "v2/run.bm25_arabic-msmarco.txt" }, "v2/zh/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_chinese-msmarco.txt", "expected_md5": "cad0177b9211526795630801ec219a36", "size_hint": 133131838, "cache_path": "v2/run.bm25_chinese-msmarco.txt" }, "v2/dt/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_dutch-msmarco.txt", "expected_md5": "4263908d9dfcdcd9ddb681cb529ca794", "size_hint": 126028515, "cache_path": "v2/run.bm25_dutch-msmarco.txt" }, "v2/fr/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_french-msmarco.txt", "expected_md5": "12d5e2d412905dbfa0c188a7c01e2a6a", "size_hint": 130311130, "cache_path": "v2/run.bm25_french-msmarco.txt" }, "v2/de/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_german-msmarco.txt", "expected_md5": "2a7ac8fc322c2ee9f0869f3de055d832", "size_hint": 125641364, "cache_path": "v2/run.bm25_german-msmarco.txt" }, "v2/hi/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_hindi-msmarco.txt", "expected_md5": "befe2edc072bba1fe02dae27dd91b586", "size_hint": 132794501, "cache_path": "v2/run.bm25_hindi-msmarco.txt" }, "v2/id/scoreddocs/dev": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_indonesian-msmarco.txt", "expected_md5": "50da2b41f7286152aef97279f4a604e6", "size_hint": 129538847, "cache_path": "v2/run.bm25_indonesian-msmarco.txt" }, "v2/it/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_italian-msmarco.txt", "expected_md5": "966ed936d26b0bf54feb4ccd83a8757c", "size_hint": 132615737, "cache_path": "v2/run.bm25_italian-msmarco.txt" }, "v2/ja/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_japanese-msmarco.txt", "expected_md5": "a3908dd2cec7c1d66008eab3b455b4d6", "size_hint": 130027238, "cache_path": "v2/run.bm25_japanese-msmarco.txt" }, "v2/pt/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_portuguese-msmarco.txt", "expected_md5": "2cb2166103a874f6a7c364764ee5a9fe", "size_hint": 133067761, "cache_path": "v2/run.bm25_portuguese-msmarco.txt" }, "v2/ru/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_russian-msmarco.txt", "expected_md5": "b0f3de7ab4bb72bea524775816327df8", "size_hint": 132235194, "cache_path": "v2/run.bm25_russian-msmarco.txt" }, "v2/es/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_spanish-msmarco.txt", "expected_md5": "35f59b4fd0f6c81099ed36d82ead36e8", "size_hint": 129268683, "cache_path": "v2/run.bm25_spanish-msmarco.txt" }, "v2/vi/scoreddocs/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/runs/run.bm25_vietnamese-msmarco.txt", "expected_md5": "1b5691e13e7c3053f825fcf47f663ea5", "size_hint": 133077756, "cache_path": "v2/run.bm25_vietnamese-msmarco.txt" }, "v2/ar/queries/dev": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/arabic_queries.dev.tsv", "expected_md5": "d93efe298008c35389afacb3d9fedb06", "size_hint": 6545729, "cache_path": "v2/arabic.queries.dev.tsv" }, "v2/zh/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/chinese_queries.dev.tsv", "expected_md5": "30c76201ecddb05d9b125a5f0ef5a6bb", "size_hint": 4002662, "cache_path": "v2/chinese.queries.dev.tsv" }, "v2/dt/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/dutch_queries.dev.tsv", "expected_md5": "5bcc4ba604106fd6bb7e17030fcdc033", "size_hint": 4828536, "cache_path": "v2/dutch.queries.dev.tsv" }, "v2/fr/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/french_queries.dev.tsv", "expected_md5": "c2a393af030f845041b85b44ca60680a", "size_hint": 5408487, "cache_path": "v2/french.queries.dev.tsv" }, "v2/de/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/german_queries.dev.tsv", "expected_md5": "b420bdc83096caa07bd06667658d08c7", "size_hint": 4975836, "cache_path": "v2/german.queries.dev.tsv" }, "v2/hi/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/hindi_queries.dev.tsv", "expected_md5": "880ec0423ebc6345450d86c030d29f1b", "size_hint": 10389233, "cache_path": "v2/hindi.queries.dev.tsv" }, "v2/id/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/indonesian_queries.dev.tsv", "expected_md5": "2f9d37e7baaf7a3834af2d84ddafa376", "size_hint": 4660699, "cache_path": "v2/indonesian.queries.dev.tsv" }, "v2/it/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/italian_queries.dev.tsv", "expected_md5": 
"9172fc14d18b1181d7f6c3120a66a8f9", "size_hint": 5044211, "cache_path": "v2/italian.queries.dev.tsv" }, "v2/ja/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/japanese_queries.dev.tsv", "expected_md5": "23c86ba93c63891a95382d8e8198199f", "size_hint": 5823193, "cache_path": "v2/japanese.queries.dev.tsv" }, "v2/pt/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/portuguese_queries.dev.tsv", "expected_md5": "73fb7009307c6aecd661184bce75cb5d", "size_hint": 4966328, "cache_path": "v2/portuguese.queries.dev.tsv" }, "v2/ru/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/russian_queries.dev.tsv", "expected_md5": "b1fe46eea62d4c5c3776c1bc7c38034e", "size_hint": 7853062, "cache_path": "v2/russian.queries.dev.tsv" }, "v2/es/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/spanish_queries.dev.tsv", "expected_md5": "b2f3c62c6d71700b9af610e7a29fef61", "size_hint": 5241697, "cache_path": "v2/spanish.queries.dev.tsv" }, "v2/vi/queries/dev": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/dev/vietnamese_queries.dev.tsv", "expected_md5": "99df771fa60888632dca02431998cec5", "size_hint": 5775517, "cache_path": "v2/vietnamese.queries.dev.tsv" }, "v2/ar/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/arabic_queries.train.tsv", "expected_md5": "08ab9eed247819db2f1f013f7e06f0d6", "size_hint": 51869220, "cache_path": "v2/arabic.queries.train.tsv" }, "v2/zh/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/chinese_queries.train.tsv", "expected_md5": "166018bdb4d1a279c5083897d7f6752d", "size_hint": 31567548, "cache_path": "v2/chinese.queries.train.tsv" }, 
"v2/dt/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/dutch_queries.train.tsv", "expected_md5": "6c304cf6dbdaff18a876cc6012168d30", "size_hint": 38224557, "cache_path": "v2/dutch.queries.train.tsv" }, "v2/fr/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/french_queries.train.tsv", "expected_md5": "cbedd13e92ad9d70d049e0262e6956dd", "size_hint": 42857975, "cache_path": "v2/french.queries.train.tsv" }, "v2/de/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/german_queries.train.tsv", "expected_md5": "41461df4fd61a98d76d846daa2b797c2", "size_hint": 39404290, "cache_path": "v2/german.queries.train.tsv" }, "v2/hi/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/hindi_queries.train.tsv", "expected_md5": "b41f3f58b7c11ade80966276ceed50a8", "size_hint": 82511017, "cache_path": "v2/hindi.queries.train.tsv" }, "v2/id/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/indonesian_queries.train.tsv", "expected_md5": "5684c5bab64544f3ed62b050bdcf477d", "size_hint": 36857620, "cache_path": "v2/indonesian.queries.train.tsv" }, "v2/it/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/italian_queries.train.tsv", "expected_md5": "d40f32b4a4fb30c938e4b09adc1d1d81", "size_hint": 39923771, "cache_path": "v2/italian.queries.train.tsv" }, "v2/ja/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/japanese_queries.train.tsv", "expected_md5": "eeea5ea876f96eeeb66fb7f8d29055c9", "size_hint": 46027536, "cache_path": "v2/japanese.queries.train.tsv" }, "v2/pt/queries/train": { "url": 
"https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/portuguese_queries.train.tsv", "expected_md5": "816a2b07fe7067d438f785ee9bd8ef88", "size_hint": 39281063, "cache_path": "v2/portuguese.queries.train.tsv" }, "v2/ru/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/russian_queries.train.tsv", "expected_md5": "9f9a76edc95fb91477683c610d287327", "size_hint": 62396828, "cache_path": "v2/russian.queries.train.tsv" }, "v2/es/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/spanish_queries.train.tsv", "expected_md5": "7f54a66db9c928b1245dd0709e6b1caf", "size_hint": 41464870, "cache_path": "v2/spanish.queries.train.tsv" }, "v2/vi/queries/train": { "url": "https://huggingface.co/datasets/unicamp-dl/mmarco/resolve/main/data/google/queries/train/vietnamese_queries.train.tsv", "expected_md5": "af28d1dde86c5bdd27751e3a14a55252", "size_hint": 45651702, "cache_path": "v2/vietnamese.queries.train.tsv" } }, "mr-tydi": { "ar": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-arabic.tar.gz", "expected_md5": "a0dd1e06c27486b09762c033bde42b70", "size_hint": 321016840 }, "bn": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-bengali.tar.gz", "expected_md5": "06ed183ce7c407f851e5f07370fdbbbb", "size_hint": 59707233 }, "en": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-english.tar.gz", "expected_md5": "031277b7a7912aedb3a5ae58b93ec2c0", "size_hint": 4964684589 }, "fi": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-finnish.tar.gz", "expected_md5": "92579475f609bc986aa3eb8260a7328a", "size_hint": 268373209 }, "id": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-indonesian.tar.gz", "expected_md5": "708610c85ce2953ab281fd66c34d4f84", "size_hint": 
168175031 }, "ja": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-japanese.tar.gz", "expected_md5": "feff865aada3a55cafb8756bd2bf89af", "size_hint": 1054574801 }, "ko": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-korean.tar.gz", "expected_md5": "ccf88f800e87cb62b735cb283ab6f50c", "size_hint": 222544514 }, "ru": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-russian.tar.gz", "expected_md5": "fab64459133bc93a0bec2f0559bfb423", "size_hint": 1549150495 }, "sw": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-swahili.tar.gz", "expected_md5": "1042af38a358bd3a60e0f548f9986f8a", "size_hint": 10452957 }, "te": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-telugu.tar.gz", "expected_md5": "a2174529c3154fe9fa50179cb1584a0d", "size_hint": 73416550 }, "th": { "url": "https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-thai.tar.gz", "expected_md5": "351d17d7e8614447f2d350bb736ea718", "size_hint": 112677400 } }, "msmarco-document": { "docs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.trec.gz", "size_hint": 8501799926, "expected_md5": "d4863e4f342982b51b9a8fc668b2d0c0", "cache_path": "collection.tsv.gz", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "train/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "size_hint": 6457962, "expected_md5": "4086d31a9cf2d7b69c4932609058111d", "cache_path": "train/queries.tsv.gz" }, "train/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz", "size_hint": 2385717, "expected_md5": "9d1609e240113b0504fd2e61cb36d924", "cache_path": "train/qrels.gz" }, "train/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-top100.gz", "size_hint": 403564127, 
"expected_md5": "be32fa12eb71e93014c84775d7465976", "cache_path": "train/ms.run.gz" }, "dev/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "size_hint": 91837, "expected_md5": "ac20593d71b9c32ab2633230f9cdf10d", "cache_path": "dev/queries.tsv.gz" }, "dev/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz", "size_hint": 38553, "expected_md5": "5eeafaeb4960979a62e7fed93273254e", "cache_path": "dev/qrels.gz" }, "dev/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz", "size_hint": 5701839, "expected_md5": "ac10255edf321821b0ccd0f123037780", "cache_path": "dev/ms.run.gz" }, "eval/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "size_hint": 102131, "expected_md5": "50fe4285d64444c9ffc933b66a79f775", "cache_path": "eval/queries.tsv.gz" }, "eval/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docleaderboard-top100.tsv.gz", "size_hint": 6362021, "expected_md5": "a039a00356c09606962f3c07c68d02ef", "cache_path": "eval/ms.run.gz" }, "trec-dl-2019/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "size_hint": 4276, "expected_md5": "eda71eccbe4d251af83150abe065368c", "cache_path": "trec-dl-2019/queries.tsv.gz" }, "trec-dl-2019/qrels": { "url": "https://trec.nist.gov/data/deep/2019qrels-docs.txt", "irds_mirror": true, "size_hint": 339438, "expected_md5": "d7ef53b995ef7e01676ea85d7ec01dda", "cache_path": "trec-dl-2019/qrels" }, "trec-dl-2019/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctest2019-top100.gz", "size_hint": 220457, "expected_md5": "91071b89dd52124057a87d53cd22028d", "cache_path": "trec-dl-2019/ms.run.gz" }, "trec-dl-2020/queries": { "url": 
"https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "size_hint": 4131, "expected_md5": "00a406fb0d14ed3752d70d1e4eb98600", "cache_path": "trec-dl-2020/queries.tsv.gz" }, "trec-dl-2020/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctest2020-top100.gz", "size_hint": 208679, "expected_md5": "96f39dae3443736bd6393bd09a5a0a20", "cache_path": "trec-dl-2020/ms.run.gz" }, "trec-dl-2020/qrels": { "url": "https://trec.nist.gov/data/deep/2020qrels-docs.txt", "irds_mirror": true, "size_hint": 182852, "expected_md5": "e10f3545583b124a4ed5e7992293e15a", "cache_path": "trec-dl-2020/qrels" }, "orcas/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/orcas-doctrain-queries.tsv.gz", "size_hint": 104209356, "expected_md5": "519c5f522294406e3b0574d7d53cf233", "cache_path": "orcas/queries.tsv.gz" }, "orcas/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/orcas-doctrain-qrels.tsv.gz", "size_hint": 109824304, "expected_md5": "3f94db106374be649782022c3018acd0", "cache_path": "orcas/qrels.gz" }, "orcas/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/orcas-doctrain-top100.gz", "size_hint": 10724320629, "expected_md5": "118d0884638fd405e111157a124ef0b2", "cache_path": "orcas/ms.run.gz", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "trec-dl-hard/qrels": { "url": "https://raw.githubusercontent.com/grill-lab/DL-Hard/main/dataset/dl_hard-doc.qrels", "size_hint": 173803, "expected_md5": "06dfe71d497e081a7c4c1294979edb7d", "cache_path": "trec-dl-hard/qrels" }, "anchor-text": { "url": "https://huggingface.co/datasets/webis/ms-marco-anchor-text/resolve/main/ms-marco-v1/anchor-text/common-crawl-union-2016-to-2021-to-ms-marco-v1.jsonl.gz", "size_hint": 407851729, "expected_md5": "4f7af19c455976f7c2606b97ffb7a89f", "cache_path": "anchor-text-separate-v1.jsonl.gz" } }, "msmarco-document-v2": { "docs": { "url": 
"https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2_doc.tar", "size_hint": 34648862720, "expected_md5": "eea90100409a254fdb157b8e4e349deb", "cache_path": "msmarco_v2_doc.tar", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "train_queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "size_hint": 13511656, "expected_md5": "7821d8bef3971e12780a80a89a3e5cbd", "cache_path": "train/queries.tsv" }, "train_scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_top100.txt.gz", "size_hint": 424107669, "expected_md5": "b4d5915172d5f54bd23c31e966c114de", "cache_path": "train/top100.trec.gz" }, "train_qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_qrels.tsv", "size_hint": 12450533, "expected_md5": "2f788d031c2ca29c4c482167fa5966de", "cache_path": "train/qrels" }, "dev1_queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "size_hint": 191992, "expected_md5": "b05dc19f1d2b8ad729f189328a685aa1", "cache_path": "dev1/queries.tsv" }, "dev1_scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_top100.txt.gz", "size_hint": 5830666, "expected_md5": "4dd27d511748bede545cd7ae3fc92bf4", "cache_path": "dev1/top100.trec" }, "dev1_qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv", "size_hint": 177593, "expected_md5": "aad92d731892ccb0cf9c4c2e37e0f0f1", "cache_path": "dev1/qrels" }, "dev2_queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "size_hint": 209911, "expected_md5": "f000319f1893a7acdd60fdcae0703b95", "cache_path": "dev2/queries.tsv" }, "dev2_scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_top100.txt.gz", "size_hint": 6412563, "expected_md5": "e03b5404e9027569c1aa794b1408d8a5", "cache_path": "dev2/top100.trec" }, 
"dev2_qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv", "size_hint": 195474, "expected_md5": "f2eead4b192683ae5fbd66f4d3f08b96", "cache_path": "dev2/qrels" }, "trec-dl-2019/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "size_hint": 4276, "expected_md5": "eda71eccbe4d251af83150abe065368c", "cache_path": "trec-dl-2019/queries.tsv.gz" }, "trec_dl_2019_qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz", "size_hint": 107475, "expected_md5": "aa37367d6db77c63d1ee392de6d446e2", "cache_path": "trec-dl-2019/qrels.gz" }, "trec-dl-2020/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "size_hint": 4131, "expected_md5": "00a406fb0d14ed3752d70d1e4eb98600", "cache_path": "trec-dl-2020/queries.tsv.gz" }, "trec_dl_2020_qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz", "size_hint": 62387, "expected_md5": "06a0a6f29621ee477a093d9dba8e5be9", "cache_path": "trec-dl-2020/qrels.gz" }, "trec-dl-2021/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "size_hint": 24585, "expected_md5": "46d863434dda18300f5af33ee29c4b28", "cache_path": "trec-dl-2021/queries.tsv" }, "trec-dl-2021/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_document_top100.txt.gz", "size_hint": 618228, "expected_md5": "0bc85e3f2a6f798b91e18f0cd4a6bc6b", "cache_path": "trec-dl-2021/top100.trec.gz" }, "trec-dl-2021/qrels": { "url": "https://trec.nist.gov/data/deep/2021.qrels.docs.final.txt", "size_hint": 478328, "irds_mirror": true, "expected_md5": "3b266fdaf27f3775e04028765a4839d3", "cache_path": "trec-dl-2021/qrels" }, "anchor-text": { "url": 
"https://huggingface.co/datasets/webis/ms-marco-anchor-text/resolve/main/ms-marco-v2/anchor-text/common-crawl-union-2016-to-2021-to-ms-marco-v2.jsonl.gz", "size_hint": 751699569, "expected_md5": "8b96dbaf4efcae08e0ee307e03f3434d", "cache_path": "anchor-text-separate-v2.jsonl.gz" }, "trec-dl-2022/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2022_queries.tsv", "size_hint": 21508, "expected_md5": "f1bfd53d80e81e58207ce557fd2211a0", "cache_path": "trec-dl-2022/queries.tsv" }, "trec-dl-2022/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2022_document_top100.txt.gz", "size_hint": 642721, "expected_md5": "93f70329ce1b9ce913a5f87008736ff2", "cache_path": "trec-dl-2022/top100.txt.gz" }, "trec-dl-2022/qrels": { "url": "https://trec.nist.gov/data/deep/2022.qrels.docs.inferred.txt", "irds_mirror": true, "size_hint": 13808681, "expected_md5": "cca2e4db9d842e6262500532809bd571", "cache_path": "trec-dl-2022/qrels.inferred.txt" }, "trec-dl-2023/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2023_queries.tsv", "size_hint": 38128, "expected_md5": "7df9e17b47cc9aa5d1c9fd5b313e273c", "cache_path": "trec-dl-2023/queries.tsv" }, "trec-dl-2023/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2023_document_top100.txt.gz", "size_hint": 902168, "expected_md5": "0f5d548e53afb9e319c837ad67f9046a", "cache_path": "trec-dl-2023/top100.txt.gz" }, "trec-dl-2023/qrels": { "url": "https://trec.nist.gov/data/deep/2023.qrels.docs.withDupes.txt", "irds_mirror": true, "size_hint": 675015, "expected_md5": "1e9c540b3cb03bcc975a583586c04090", "cache_path": "trec-dl-2023/qrels.withDupes.txt" } }, "msmarco-passage": { "collectionandqueries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", "size_hint": 1057717952, "expected_md5": "31644046b18952c1386cd4564ba2ae69", "cache_path": "collectionandqueries.tar.gz", "download_args": {"headers": 
{"X-Ms-Version": "2019-12-12"}} }, "queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", "size_hint": 18882551, "expected_md5": "c177b2795d5f2dcc524cf00fcd973be1", "cache_path": "queries.tar.gz" }, "medmarco_ids": { "url": "https://raw.githubusercontent.com/Georgetown-IR-Lab/covid-neural-ir/master/med-msmarco-train.txt", "size_hint": 548428, "expected_md5": "dc5199de7d4a872c361f89f08b1163ef", "cache_path": "medmarco.qids" }, "train/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv", "size_hint": 10589532, "expected_md5": "733fb9fe12d93e497f7289409316eccf", "cache_path": "train/qrels" }, "train/docpairs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qidpidtriples.train.full.tsv.gz", "size_hint": 2633557579, "expected_md5": "215a5204288820672f5e9451d9e202c5", "cache_path": "train/docpairs.tsv.gz", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "train/docpairs/v2": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz", "size_hint": 1841693309, "expected_md5": "219083e80a0a751c08b968c2f31a4e0b", "cache_path": "train/qidpidtriples.train.full.2.tsv.gz", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "train/docpairs/small": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/triples.train.small.tar.gz", "size_hint": 7930881353, "expected_md5": "c13bf99ff23ca691105ad12eab837f84", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "train/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/top1000.train.tar.gz", "size_hint": 11519984492, "expected_md5": "d99fdbd5b2ea84af8aa23194a3263052", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "dev/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv", "size_hint": 1201626, "expected_md5": "9157ccaeaa8227f91722ba5770787b16", "cache_path": "dev/qrels" 
}, "dev/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/top1000.dev.tar.gz", "size_hint": 687414398, "expected_md5": "8c140662bdf123a98fbfe3bb174c5831", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "eval/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/top1000.eval.tar.gz", "size_hint": 673440221, "expected_md5": "73778cd99f6e0632d12d0b5731b20a02", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "trec-dl-2019/qrels": { "url": "https://trec.nist.gov/data/deep/2019qrels-pass.txt", "irds_mirror": true, "size_hint": 187092, "expected_md5": "2f4be390198da108f6845c822e5ada14", "cache_path": "trec-dl-2019/qrels" }, "trec-dl-2019/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "size_hint": 4276, "expected_md5": "eda71eccbe4d251af83150abe065368c" }, "trec-dl-2019/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz", "size_hint": 26634062, "expected_md5": "ec9e012746aa9763c7ff10b3336a3ce1" }, "trec-dl-2020/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "size_hint": 4131, "expected_md5": "00a406fb0d14ed3752d70d1e4eb98600", "cache_path": "trec-dl-2020/queries.tsv.gz" }, "trec-dl-2020/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-passagetest2020-top1000.tsv.gz", "size_hint": 26230218, "expected_md5": "aa6fbc51d66bd1dc745964c0e140a727" }, "trec-dl-2020/qrels": { "url": "https://trec.nist.gov/data/deep/2020qrels-pass.txt", "irds_mirror": true, "size_hint": 218617, "expected_md5": "0355ccee7509ac0463e8278186cdd8d1", "cache_path": "trec-dl-2020/qrels" }, "trec-dl-hard/qrels": { "url": "https://raw.githubusercontent.com/grill-lab/DL-Hard/main/dataset/dl_hard-passage.qrels", "size_hint": 83362, "expected_md5": "8583c2cbad56eeacb449586fe1d2a471", "cache_path": 
"trec-dl-hard/qrels" } }, "msmarco-passage-v2": { "passages": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2_passage.tar", "size_hint": 21768192000, "expected_md5": "05946bac48a8ffee62e160213eab3fda", "cache_path": "msmarco_v2_passage.tar", "download_args": {"headers": {"X-Ms-Version": "2019-12-12"}} }, "train/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "size_hint": 11608838, "expected_md5": "1835f44e6792c51aa98eed722a8dcc11", "cache_path": "train/queries.tsv" }, "train/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_top100.txt.gz", "size_hint": 340634991, "expected_md5": "7cd731ed984fccb2396f11a284cea800", "cache_path": "train/top100.txt.gz" }, "train/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_qrels.tsv", "size_hint": 11620946, "expected_md5": "a2e37e9a9c7ca13d6e38be0512a52017", "cache_path": "train/qrels.tsv" }, "dev1/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "size_hint": 164507, "expected_md5": "0fa4c6d64a653142ade9fc61d7484239", "cache_path": "dev1/queries.tsv" }, "dev1/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_top100.txt.gz", "size_hint": 4882727, "expected_md5": "fee817a3ee273be8623379e5d3108c0b", "cache_path": "dev1/top100.txt.gz" }, "dev1/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv", "size_hint": 165024, "expected_md5": "10f9263260d206d8fb8f13864aea123a", "cache_path": "dev1/qrels.tsv" }, "dev2/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "size_hint": 179603, "expected_md5": "565b84dfa7ccd2f4251fa2debea5947a", "cache_path": "dev2/queries.tsv" }, "dev2/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_top100.txt.gz", "size_hint": 
5355464, "expected_md5": "da532bf26169a3a2074fae774471cc9f", "cache_path": "dev2/top100.txt.gz" }, "dev2/qrels": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv", "size_hint": 181612, "expected_md5": "8ed8577fa459d34b59cf69b4daa2baeb", "cache_path": "dev2/qrels.tsv" }, "trec-dl-2021/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "size_hint": 24585, "expected_md5": "46d863434dda18300f5af33ee29c4b28", "cache_path": "trec-dl-2021/queries.tsv" }, "trec-dl-2021/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_passage_top100.txt.gz", "size_hint": 604533, "expected_md5": "e2be2d307da26d1a3f76eb95507672a3", "cache_path": "trec-dl-2021/top100.trec.gz" }, "trec-dl-2021/qrels": { "url": "https://trec.nist.gov/data/deep/2021.qrels.pass.final.txt", "irds_mirror": true, "size_hint": 433887, "expected_md5": "c5b76ec95b589732edc9040302e22a2b", "cache_path": "trec-dl-2021/qrels" }, "trec-dl-2022/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2022_queries.tsv", "size_hint": 21508, "expected_md5": "f1bfd53d80e81e58207ce557fd2211a0", "cache_path": "trec-dl-2022/queries.tsv" }, "trec-dl-2022/scoreddocs": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2022_passage_top100.txt.gz", "size_hint": 630095, "expected_md5": "36004dfad64826167aeecddff1d490a6", "cache_path": "trec-dl-2022/top100.txt.gz" }, "trec-dl-2022/qrels": { "url": "https://trec.nist.gov/data/deep/2022.qrels.pass.withDupes.txt", "irds_mirror": true, "size_hint": 15800539, "expected_md5": "b36484d6cfd039664a570a4bf04f0eeb", "cache_path": "trec-dl-2022/qrels.withDupes.txt" }, "trec-dl-2023/queries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/2023_queries.tsv", "size_hint": 38128, "expected_md5": "7df9e17b47cc9aa5d1c9fd5b313e273c", "cache_path": "trec-dl-2023/queries.tsv" }, "trec-dl-2023/scoreddocs": { "url": 
"https://msmarco.z22.web.core.windows.net/msmarcoranking/2023_passage_top100.txt.gz", "size_hint": 888898, "expected_md5": "c339ed75e1556cacb387899f34cadad1", "cache_path": "trec-dl-2023/top100.txt.gz" }, "trec-dl-2023/qrels": { "url": "https://trec.nist.gov/data/deep/2023.qrels.pass.withDupes.txt", "irds_mirror": true, "size_hint": 912450, "expected_md5": "3a742d51ae65da2ece9c09b304b9e358", "cache_path": "trec-dl-2023/qrels.withDupes.txt" }, "dedup_positions": { "url": "https://huggingface.co/datasets/macavaney/msmarco-passage-v2-duplicate-ids/resolve/main/dedup_positions.tar.gz", "size_hint": 388936118, "expected_md5": "08bfed7742450ccaae64f8bf12033886", "cache_path": "dedup_positions.tar.gz" } }, "msmarco-qna": { "train": { "url": "https://msmarco.z22.web.core.windows.net/msmarco/train_v2.1.json.gz", "size_hint": 1112116929, "expected_md5": "576230a745a06943c3a49e76acea1d9d", "cache_path": "train_v2.1.json.gz" }, "dev": { "url": "https://msmarco.z22.web.core.windows.net/msmarco/dev_v2.1.json.gz", "size_hint": 138303699, "expected_md5": "5e14839f31c933560fbb3bae4ce67829", "cache_path": "dev_v2.1.json.gz" }, "eval": { "url": "https://msmarco.z22.web.core.windows.net/msmarco/eval_v2.1_public.json.gz", "size_hint": 133851237, "expected_md5": "5fcca9336c7486498c3e1cf81fa89f74", "cache_path": "eval_v2.1_public.json.gz" } }, "nano-beir": { "arguana/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoArguAna/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 3635, "expected_md5": "ee563dc09a91032b494f469b6c807a9c", "cache_path": "arguana/corpus.parquet" }, "arguana/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoArguAna/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "5fab96653b0455879606e82071ef4b21", "cache_path": "arguana/qrels.parquet" }, "arguana/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoArguAna/resolve/main/queries/train-00000-of-00001.parquet", 
"size_hint": 50, "expected_md5": "5ad0a26fad750dc74eae00ab753731b8", "cache_path": "arguana/queries.parquet" }, "climate-fever/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoClimateFEVER/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 3408, "expected_md5": "97a57296bbb7e48ffc8b763c4b2a188b", "cache_path": "climate-fever/corpus.parquet" }, "climate-fever/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoClimateFEVER/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 148, "expected_md5": "3042ccd6054266951ac1338c57a2bb85", "cache_path": "climate-fever/qrels.parquet" }, "climate-fever/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoClimateFEVER/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "0f4d43544ae8755e0557f89be3ae7e74", "cache_path": "climate-fever/queries.parquet" }, "dbpedia-entity/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 6045, "expected_md5": "cee2bc11f2b84fe636b5f219246221e0", "cache_path": "dbpedia-entity/corpus.parquet" }, "dbpedia-entity/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 1158, "expected_md5": "8a9fa7e95c276d810ddbb53b8054801d", "cache_path": "dbpedia-entity/qrels.parquet" }, "dbpedia-entity/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "ec7d771ba38310699a5b323aa693febe", "cache_path": "dbpedia-entity/queries.parquet" }, "fever/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoFEVER/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 4996, "expected_md5": "0b052fca4b4c975088bc2f01d7f2b749", "cache_path": "fever/corpus.parquet" }, "fever/qrels" : { "url": 
"https://huggingface.co/datasets/zeta-alpha-ai/NanoFEVER/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 57, "expected_md5": "a25763bbae313d39668c37dbadb70cd3", "cache_path": "fever/qrels.parquet" }, "fever/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoFEVER/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "43122d82b3df866afad1f37a28e6ba60", "cache_path": "fever/queries.parquet" }, "fiqa/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 4598, "expected_md5": "873e5dcf7920fc8c0ff210cf06907607", "cache_path": "fiqa/corpus.parquet" }, "fiqa/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 123, "expected_md5": "f22d89ef6658379498bc92e92cf92d26", "cache_path": "fiqa/qrels.parquet" }, "fiqa/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "8b25da8b2d2e40a483d7a782a048a950", "cache_path": "fiqa/queries.parquet" }, "hotpotqa/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoHotpotQA/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 5090, "expected_md5": "da3d14375223d0e5dad172acdbb1c332", "cache_path": "hotpotqa/corpus.parquet" }, "hotpotqa/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoHotpotQA/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 100, "expected_md5": "3ed8424185156af1fe4c55aac84bce92", "cache_path": "hotpotqa/qrels.parquet" }, "hotpotqa/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoHotpotQA/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "c1f13d4ed9f88a89285e39d9e13f5c38", "cache_path": "hotpotqa/queries.parquet" }, "msmarco/docs" : { "url": 
"https://huggingface.co/datasets/zeta-alpha-ai/NanoMSMARCO/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 5043, "expected_md5": "15ab2a1d3674e562d088e64dc5781373", "cache_path": "msmarco/corpus.parquet" }, "msmarco/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoMSMARCO/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "c72caacf4bc04efa8af6b2ac2944ea92", "cache_path": "msmarco/qrels.parquet" }, "msmarco/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoMSMARCO/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "9ab46c94f88bfed1e338dc0887568f03", "cache_path": "msmarco/queries.parquet" }, "nfcorpus/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoNFCorpus/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 2953, "expected_md5": "deb02538a3b030843a8ccca212dc6a3c", "cache_path": "nfcorpus/corpus.parquet" }, "nfcorpus/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoNFCorpus/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 2518, "expected_md5": "402762544651e0fc6df10cd76e2cbfae", "cache_path": "nfcorpus/qrels.parquet" }, "nfcorpus/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoNFCorpus/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "6578fc992124bb997e32bd1192c1b475", "cache_path": "nfcorpus/queries.parquet" }, "nq/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoNQ/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 5035, "expected_md5": "aaf73189d2b8d0de039dc2619d7a29ef", "cache_path": "nq/corpus.parquet" }, "nq/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoNQ/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 57, "expected_md5": "fc2b183187c4f99e0939b55dc11b1908", "cache_path": "nq/qrels.parquet" }, "nq/queries" : { "url": 
"https://huggingface.co/datasets/zeta-alpha-ai/NanoNQ/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "87de29b17a669fc61d1f192bd5099887", "cache_path": "nq/queries.parquet" }, "quora/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoQuoraRetrieval/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 5046, "expected_md5": "a8cd6d41cbeb6bda2a3ed90526de4521", "cache_path": "quora/corpus.parquet" }, "quora/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoQuoraRetrieval/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 70, "expected_md5": "650ca21cef9a73b83e3845c93e3a230a", "cache_path": "quora/qrels.parquet" }, "quora/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoQuoraRetrieval/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "590ed67697e12f21fafe436c07e6f772", "cache_path": "quora/queries.parquet" }, "scidocs/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoSCIDOCS/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 2210, "expected_md5": "60f9f2a7206f0e8a0fc185df2b7ab00b", "cache_path": "scidocs/corpus.parquet" }, "scidocs/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoSCIDOCS/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 244, "expected_md5": "e21c1514761b06ac6bba5a8f958d8803", "cache_path": "scidocs/qrels.parquet" }, "scidocs/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoSCIDOCS/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "940e2220bc9667138774a36f6c771c97", "cache_path": "scidocs/queries.parquet" }, "scifact/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoSciFact/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 2919, "expected_md5": "9f82bfc70f9b7d7f0275fb6b8ee38876", "cache_path": "scifact/corpus.parquet" }, "scifact/qrels" : { "url": 
"https://huggingface.co/datasets/zeta-alpha-ai/NanoSciFact/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 56, "expected_md5": "1155d79cf3854fa43c72c8458f34f744", "cache_path": "scifact/qrels.parquet" }, "scifact/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoSciFact/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 50, "expected_md5": "0218d8ad9c0bd8671002f16edb497ff3", "cache_path": "scifact/queries.parquet" }, "webis-touche2020/docs" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoTouche2020/resolve/main/corpus/train-00000-of-00001.parquet", "size_hint": 5745, "expected_md5": "fdea27196d1234ab4aa2c49c9a849840", "cache_path": "webis-touche2020/corpus.parquet" }, "webis-touche2020/qrels" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoTouche2020/resolve/main/qrels/train-00000-of-00001.parquet", "size_hint": 932, "expected_md5": "4b3de9c60d59be85ae4483e1210caff9", "cache_path": "webis-touche2020/qrels.parquet" }, "webis-touche2020/queries" : { "url": "https://huggingface.co/datasets/zeta-alpha-ai/NanoTouche2020/resolve/main/queries/train-00000-of-00001.parquet", "size_hint": 49, "expected_md5": "0ba668cdae413f411644871f544a9d1b", "cache_path": "webis-touche2020/queries.parquet" } }, "neumarco": { "main": { "url": "https://livejohnshopkins-my.sharepoint.com/:u:/g/personal/dlawrie1_jh_edu/EQcICtPaSqFNoCZHtoeZszoB7FC362BvaPvieUSk2j30tA?download=1", "size_hint": 3723728998, "expected_md5": "733181c211959a7c09c695bfcddaea54", "cache_path": "neuMSMARCO.tar.gz" } }, "nfcorpus": { "main": { "url": "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz", "size_hint": 31039523, "expected_md5": "49c061fbadc52ba4d35d0e42e2d742fd", "cache_path": "nfcorpus.tar.gz" } }, "natural-questions": { "nq-dev-00": { "url": "https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-00.jsonl.gz", "size_hint": 219593373, "expected_md5": "21df324e9d0725c7cbc5ec06a34b630b", "stream": true, 
"skip_local": true }, "nq-dev-01": { "url": "https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-01.jsonl.gz", "size_hint": 200209706, "expected_md5": "3de5cd9d66b705f3ef0462c9bcde1c4b", "stream": true, "skip_local": true }, "nq-dev-02": { "url": "https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-02.jsonl.gz", "size_hint": 210446574, "expected_md5": "d7b6f2e7f296006ad2f3de291d5960ce", "stream": true, "skip_local": true }, "nq-dev-03": { "url": "https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-03.jsonl.gz", "size_hint": 216859801, "expected_md5": "0d93b1e520328c50e0f4582dc89023c7", "stream": true, "skip_local": true }, "nq-dev-04": { "url": "https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-04.jsonl.gz", "size_hint": 220929521, "expected_md5": "cc27dc8fc0a2d2753e2ff60a7e6fb976", "stream": true, "skip_local": true }, "nq-train-00": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-00.jsonl.gz", "size_hint": 858728609, "expected_md5": "22c9c2954ea80ff33f9667a9b398c86c", "stream": true, "skip_local": true }, "nq-train-01": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-01.jsonl.gz", "size_hint": 891498165, "expected_md5": "2b76373340261ef019434235a2671a49", "stream": true, "skip_local": true }, "nq-train-02": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-02.jsonl.gz", "size_hint": 885374316, "expected_md5": "785f6ca88ba7f039240218d20644f121", "stream": true, "skip_local": true }, "nq-train-03": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-03.jsonl.gz", "size_hint": 885313666, "expected_md5": "4386753b84222234e12455cf99f116b9", "stream": true, "skip_local": true }, "nq-train-04": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-04.jsonl.gz", "size_hint": 890873425, "expected_md5": "e02d403b1bacee5404f1db48fa3250d9", "stream": true, "skip_local": true }, 
"nq-train-05": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-05.jsonl.gz", "size_hint": 873023109, "expected_md5": "39c46c3b5ab81ba92766dcb796037fd4", "stream": true, "skip_local": true }, "nq-train-06": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-06.jsonl.gz", "size_hint": 866509301, "expected_md5": "e2c8f7434a40cc3740be8e0597c0db78", "stream": true, "skip_local": true }, "nq-train-07": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-07.jsonl.gz", "size_hint": 838940867, "expected_md5": "3fc55e3e1ef3d834fed665840522e841", "stream": true, "skip_local": true }, "nq-train-08": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-08.jsonl.gz", "size_hint": 902610214, "expected_md5": "660990d45610d4d4f37fcd21965b183e", "stream": true, "skip_local": true }, "nq-train-09": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-09.jsonl.gz", "size_hint": 883494801, "expected_md5": "f4335cb2b08d3166ef5fa007560d0f95", "stream": true, "skip_local": true }, "nq-train-10": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-10.jsonl.gz", "size_hint": 876311133, "expected_md5": "66c607ec3dae612422dd44a0a4ead32b", "stream": true, "skip_local": true }, "nq-train-11": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-11.jsonl.gz", "size_hint": 878127326, "expected_md5": "a9b36187b8ccee70f461e8ae06109cb4", "stream": true, "skip_local": true }, "nq-train-12": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-12.jsonl.gz", "size_hint": 889257016, "expected_md5": "feeec2103ce408899c721fc937dab83b", "stream": true, "skip_local": true }, "nq-train-13": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-13.jsonl.gz", "size_hint": 891769129, "expected_md5": "97f3068af15e6a0c1d107bfa6dc50f70", "stream": true, "skip_local": true }, 
"nq-train-14": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-14.jsonl.gz", "size_hint": 892523839, "expected_md5": "c4ea6358f617fbb8f9e65760e88b2028", "stream": true, "skip_local": true }, "nq-train-15": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-15.jsonl.gz", "size_hint": 910660095, "expected_md5": "e3bf5b86c977b41bce825d853fba8e3e", "stream": true, "skip_local": true }, "nq-train-16": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-16.jsonl.gz", "size_hint": 878177689, "expected_md5": "92abe032b4608bf35f08bcf8295eb1d3", "stream": true, "skip_local": true }, "nq-train-17": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-17.jsonl.gz", "size_hint": 872805189, "expected_md5": "2c3acddbce0f5221f24e9db4e1d7662a", "stream": true, "skip_local": true }, "nq-train-18": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-18.jsonl.gz", "size_hint": 875275428, "expected_md5": "a301430662fb1f25a73359f521d1da47", "stream": true, "skip_local": true }, "nq-train-19": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-19.jsonl.gz", "size_hint": 862034169, "expected_md5": "9e0e96cca9e3594f885d8e6063cbd0c7", "stream": true, "skip_local": true }, "nq-train-20": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-20.jsonl.gz", "size_hint": 887586358, "expected_md5": "8747331d168160e013a0c81e0323491d", "stream": true, "skip_local": true }, "nq-train-21": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-21.jsonl.gz", "size_hint": 890472815, "expected_md5": "6de9b253b4a069fdfdc2bc80ddc5c2d1", "stream": true, "skip_local": true }, "nq-train-22": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-22.jsonl.gz", "size_hint": 888396337, "expected_md5": "7c24574f4bc21ad764b9b8b208b97d64", "stream": true, "skip_local": true }, 
"nq-train-23": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-23.jsonl.gz", "size_hint": 900331594, "expected_md5": "eadf9189bd9557943f468323e00de2ff", "stream": true, "skip_local": true }, "nq-train-24": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-24.jsonl.gz", "size_hint": 871216444, "expected_md5": "f063dec5a57b2701e9318d3c93ad6486", "stream": true, "skip_local": true }, "nq-train-25": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-25.jsonl.gz", "size_hint": 871166814, "expected_md5": "7db0ce87b997e3d47d4b511924384ec8", "stream": true, "skip_local": true }, "nq-train-26": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-26.jsonl.gz", "size_hint": 903385811, "expected_md5": "c50093c839d33956d38548e2b44e0c50", "stream": true, "skip_local": true }, "nq-train-27": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-27.jsonl.gz", "size_hint": 842966594, "expected_md5": "355e504e2a4b5bb6eb05d6976c93517f", "stream": true, "skip_local": true }, "nq-train-28": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-28.jsonl.gz", "size_hint": 876393409, "expected_md5": "f4da07e59b33d1ce1dc006363f961e9d", "stream": true, "skip_local": true }, "nq-train-29": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-29.jsonl.gz", "size_hint": 872982425, "expected_md5": "519c4a0349495253ec9f4211c85f8fcb", "stream": true, "skip_local": true }, "nq-train-30": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-30.jsonl.gz", "size_hint": 899739217, "expected_md5": "96cde3a58f5e9bad4c94510aab7a5db5", "stream": true, "skip_local": true }, "nq-train-31": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-31.jsonl.gz", "size_hint": 875703668, "expected_md5": "26955c79974141f71cb4729908f9c45b", "stream": true, "skip_local": true }, 
"nq-train-32": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-32.jsonl.gz", "size_hint": 895840703, "expected_md5": "91e33f14401adafb3d5cd2c13239531b", "stream": true, "skip_local": true }, "nq-train-33": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-33.jsonl.gz", "size_hint": 874713497, "expected_md5": "d28bf97bf7d433e64d7c016a477e97a1", "stream": true, "skip_local": true }, "nq-train-34": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-34.jsonl.gz", "size_hint": 872620262, "expected_md5": "c98c6aa8578b04524a8e7f035aaa4bbe", "stream": true, "skip_local": true }, "nq-train-35": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-35.jsonl.gz", "size_hint": 854439473, "expected_md5": "3e2b5b6280158f9ba312158a57360512", "stream": true, "skip_local": true }, "nq-train-36": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-36.jsonl.gz", "size_hint": 866233094, "expected_md5": "b6114ca6d3804c045b6abd4b6e6a9f52", "stream": true, "skip_local": true }, "nq-train-37": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-37.jsonl.gz", "size_hint": 894411832, "expected_md5": "89d1c93f2425c232882118fc3af92b9d", "stream": true, "skip_local": true }, "nq-train-38": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-38.jsonl.gz", "size_hint": 879967719, "expected_md5": "785d092d65bb3d815eb5a8027d764c13", "stream": true, "skip_local": true }, "nq-train-39": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-39.jsonl.gz", "size_hint": 887056754, "expected_md5": "c58b54591127ef977f9b900c05ec44ec", "stream": true, "skip_local": true }, "nq-train-40": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-40.jsonl.gz", "size_hint": 873720601, "expected_md5": "cd14b09b6fc9ae8a7bda558b020459aa", "stream": true, "skip_local": true }, 
"nq-train-41": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-41.jsonl.gz", "size_hint": 880452966, "expected_md5": "bb7415c0fd7cdc52ef8381c44194f2da", "stream": true, "skip_local": true }, "nq-train-42": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-42.jsonl.gz", "size_hint": 856217171, "expected_md5": "08f384e34c0358ab4e9d72ac18eeb4f2", "stream": true, "skip_local": true }, "nq-train-43": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-43.jsonl.gz", "size_hint": 908184635, "expected_md5": "c148652671404f7b2796327db542ef90", "stream": true, "skip_local": true }, "nq-train-44": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-44.jsonl.gz", "size_hint": 891701874, "expected_md5": "245074693e3863f099ad92a2e7e07d88", "stream": true, "skip_local": true }, "nq-train-45": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-45.jsonl.gz", "size_hint": 870559738, "expected_md5": "8e6d4a0895f5c87ede7cbbd0fe8ec0bb", "stream": true, "skip_local": true }, "nq-train-46": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-46.jsonl.gz", "size_hint": 883791796, "expected_md5": "6b965fb2b2fdc3772da641aac10a7c7e", "stream": true, "skip_local": true }, "nq-train-47": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-47.jsonl.gz", "size_hint": 882109720, "expected_md5": "79ac295cf818f1dcdf676772b3863232", "stream": true, "skip_local": true }, "nq-train-48": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-48.jsonl.gz", "size_hint": 882241605, "expected_md5": "545bb6edf1290d61fd42b13125b5bf2a", "stream": true, "skip_local": true }, "nq-train-49": { "url": "https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-49.jsonl.gz", "size_hint": 863247626, "expected_md5": "d6aaa8706626d4b6210e4f9bc6ffe265", "stream": true, "skip_local": true } }, 
"neuclir": { "1/fa/docs": { "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/fas-00000-of-00001.jsonl.gz?download=true", "size_hint": 2359094118, "expected_md5": "c88f79f6b6da974db22cef3dd73fcee1", "cache_path": "1/fas/docs.jsonl.gz" }, "1/zh/docs": { "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/zho-00000-of-00001.jsonl.gz?download=true", "size_hint": 3188072408, "expected_md5": "99eb400f3a474603d1db5d41f606889b", "cache_path": "1/zho/docs.jsonl.gz" }, "1/ru/docs": { "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true", "size_hint": 4504119267, "expected_md5": "3aabc798a3b5dd92d7c47db9521870b1", "cache_path": "1/rus/docs.jsonl.gz" }, "trec-2022/queries": { "url": "https://trec.nist.gov/data/neuclir/2022/topics.0720.utf8.jsonl", "size_hint": 662272, "expected_md5": "264bf244f798670f063f32ff57ba6135", "cache_path": "trec-2022/topics.jsonl", "irds_mirror": true }, "trec-2022/qrels": { "url": "https://trec.nist.gov/data/neuclir/2022/2022-qrels.all", "size_hint": 4785668, "expected_md5": "8dc1aecf13fbe358eea74ade7496b085", "cache_path": "trec-2022/qrels", "irds_mirror": true }, "trec-2023/queries": { "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-topics.0605.jsonl", "size_hint": 683779, "expected_md5": "3dbb41b02bfbd719d8b55632d9b15b83", "cache_path": "trec-2023/topics.jsonl", "irds_mirror": true }, "trec-2023/qrels": { "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-qrels.final.tar.gz", "size_hint": 6023886, "expected_md5": "cea4ff3d9eba612c7119e6490217d4e1", "cache_path": "trec-2023/qrels.tar.gz", "irds_mirror": true } }, "nyt": { "source": { "instructions": "The New York Times Annotated Corpus. 
It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2008T19>.\nMore details about the procedure can be found here: <https://ir-datasets.com/nyt.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "67a1bcf200c448424bf0fba34cef17b0", "cache_path": "nyt.tgz" }, "trec-core-2017/queries": { "url": "https://trec.nist.gov/data/core/core_nist.txt", "irds_mirror": true, "size_hint": 24444, "expected_md5": "821f8eaaf11ae3ce9657d1442749480a", "cache_path": "trec-core-2017/queries.txt" }, "trec-core-2017/qrels": { "url": "https://trec.nist.gov/data/core/qrels.txt", "irds_mirror": true, "size_hint": 462387, "expected_md5": "8cf8dcafba6557e5ee62a28a44b0314d", "cache_path": "trec-core-2017/qrels" } }, "pmc": { "v1/source0": { "url": "https://ceb.nlm.nih.gov/~simpsonmatt/pmc-text-00.tar.gz", "size_hint": 1958971361, "expected_md5": "3448bc2967913dd46c771d55b3b3ccae", "cache_path": "v1/pmc-text-00.tar.gz" }, "v1/source1": { "url": "https://ceb.nlm.nih.gov/~simpsonmatt/pmc-text-01.tar.gz", "size_hint": 3038214436, "expected_md5": "6f4bee841dcf307ce1f4cd137f65e822", "cache_path": "v1/pmc-text-01.tar.gz" }, "v1/source2": { "url": "https://ceb.nlm.nih.gov/~simpsonmatt/pmc-text-02.tar.gz", "size_hint": 3133121435, "expected_md5": "03375c91a74220c59071c7aa5f78deee", "cache_path": "v1/pmc-text-02.tar.gz" }, "v1/source3": { "url": "https://ceb.nlm.nih.gov/~simpsonmatt/pmc-text-03.tar.gz", "size_hint": 2161685233, "expected_md5": "270648367547f746cf1e2e4323a46aa9", "cache_path": "v1/pmc-text-03.tar.gz" }, "v1/dup1": { "url": "http://www.trec-cds.org/duplicates-1.txt", "size_hint": 3836, "expected_md5": "7a93656ca21c1749bf0b71a03be01cf3", "cache_path": "v1/duplicates-1.txt" }, "v1/dup2": { "url": "http://www.trec-cds.org/duplicates-2.txt", "size_hint": 2640, "expected_md5": "8d5f1004ea4d00cfd5a96b40c6419be1", "cache_path": "v1/duplicates-2.txt" }, "v2/source0": { "url": "https://ceb.nlm.nih.gov/~robertske/pmc-00.tar.gz", "size_hint": 
3875701879, "expected_md5": "d66f2e243cb697138753e622f0a1867a", "cache_path": "v2/pmc-text-00.tar.gz" }, "v2/source1": { "url": "https://ceb.nlm.nih.gov/~robertske/pmc-01.tar.gz", "size_hint": 3590186941, "expected_md5": "e59422e5d21ef7e1be307a0de61626f7", "cache_path": "v2/pmc-text-01.tar.gz" }, "v2/source2": { "url": "https://ceb.nlm.nih.gov/~robertske/pmc-02.tar.gz", "size_hint": 4991488701, "expected_md5": "145d194643818ea09a4947ba6b1c91c7", "cache_path": "v2/pmc-text-02.tar.gz" }, "v2/source3": { "url": "https://ceb.nlm.nih.gov/~robertske/pmc-03.tar.gz", "size_hint": 6443033629, "expected_md5": "231de843bb5334c3c885d75f2ca3240b", "cache_path": "v2/pmc-text-03.tar.gz" }, "trec-cds-2014/queries": { "url": "http://www.trec-cds.org/topics2014.xml", "size_hint": 22514, "expected_md5": "4924e2a3bb539feac6cbb967f4875926", "cache_path": "trec-cds-2014/queries.xml" }, "trec-cds-2014/qrels": { "url": "https://trec.nist.gov/data/clinical/qrels-treceval-2014.txt", "irds_mirror": true, "size_hint": 556628, "expected_md5": "07c8f85a7b7bcfd4211301ecaa3b4769", "cache_path": "trec-cds-2014/qrels" }, "trec-cds-2015/queries": { "url": "https://trec.nist.gov/data/clinical/topics-2015-A.xml", "irds_mirror": true, "size_hint": 22491, "expected_md5": "462d9804257e7ad0128f16324c9ec06d", "cache_path": "trec-cds-2015/queries.xml" }, "trec-cds-2015/qrels": { "url": "https://trec.nist.gov/data/clinical/qrels-treceval-2015.txt", "irds_mirror": true, "size_hint": 554312, "expected_md5": "7bbe901cfa36df56dd13cce0275c1a2b", "cache_path": "trec-cds-2015/qrels" }, "trec-cds-2016/queries": { "url": "https://trec.nist.gov/data/clinical/topics2016.xml", "irds_mirror": true, "size_hint": 79966, "expected_md5": "22ccb3412931efe1ea084330737e41bc", "cache_path": "trec-cds-2016/queries.xml" }, "trec-cds-2016/qrels": { "url": "https://trec.nist.gov/data/clinical/qrels-treceval-2016.txt", "irds_mirror": true, "size_hint": 553709, "expected_md5": "1a450d38137082e214c1201a3023a6d1", "cache_path": 
"trec-cds-2016/qrels" } }, "touche": { "2020/task-1/queries": { "url": "https://zenodo.org/record/6862281/files/topics-task-1.zip", "size_hint": 8768, "expected_md5": "9605104435165a6b01b737464596eba4" }, "2020/task-1/qrels": { "url": "https://zenodo.org/record/6862281/files/touche2020-task1-relevance-args-me-corpus-version-2020-04-01-corrected.qrels", "size_hint": 62058, "expected_md5": "6a645e2ebd4f1d6c44da4d9509624598" }, "2020/task-1/qrels-argsme-1.0-uncorrected": { "url": "https://zenodo.org/record/6862281/files/touche2020-task1-relevance-args-me-corpus-version-1.qrels", "size_hint": 145201, "expected_md5": "10f043e086818f9159ac37a9ebe5ce5d" }, "2020/task-1/qrels-argsme-2020-04-01-uncorrected": { "url": "https://zenodo.org/record/6862281/files/touche2020-task1-relevance-args-me-corpus-version-2020-04-01.qrels", "size_hint": 66283, "expected_md5": "6a27d7123423540664ccfe0391e4e417" }, "2020/task-2/queries": { "url": "https://zenodo.org/record/6797876/files/topics-task-2.zip", "size_hint": 17279, "expected_md5": "8de387d753ee8289a9f02346b63e12e4" }, "2020/task-2/qrels": { "url": "https://zenodo.org/record/6797876/files/touche2020-task2-relevance-withbaseline.qrels", "size_hint": 58522, "expected_md5": "b230436beb3a9eecbeb19c84ee6c855c" }, "2021/task-1/queries": { "url": "https://zenodo.org/record/6798216/files/topics-task-1-only-titles-2021.zip", "size_hint": 1350, "expected_md5": "61bad9cf6bc713a81297cd95cf9e156f" }, "2021/task-1/qrels-relevance": { "url": "https://zenodo.org/record/6798216/files/touche-task1-51-100-relevance.qrels", "size_hint": 99736, "expected_md5": "76b4e8348bde353167ce52ffa598a6b1" }, "2021/task-1/qrels-quality": { "url": "https://zenodo.org/record/6798216/files/touche-task1-51-100-quality.qrels", "size_hint": 100087, "expected_md5": "c899bcab9b00fdd28f77d08a1d26298a" }, "2021/task-2/queries": { "url": "https://zenodo.org/record/6798217/files/topics-task-2-2021.zip", "size_hint": 15532, "expected_md5": "0c06079b327ecd2b4c5971bd17bd0aa3" }, 
"2021/task-2/qrels-relevance": { "url": "https://zenodo.org/record/6798217/files/touche-task2-51-100-relevance.qrels", "size_hint": 68548, "expected_md5": "970b48e0c057afaee17a7832100ab67c" }, "2021/task-2/qrels-quality": { "url": "https://zenodo.org/record/6798217/files/touche-task2-51-100-quality.qrels", "size_hint": 68548, "expected_md5": "062bea3b8307ae481876fe31253092b5" }, "2022/task-1/queries": { "url": "https://zenodo.org/record/6873574/files/topics.xml", "size_hint": 31089, "expected_md5": "83a9cb2290f867199d6de9c73eeacf43" }, "2022/task-1/qrels-relevance": { "url": "https://zenodo.org/record/6873574/files/touche-task1-2022-relevance-dedup.qrels", "size_hint": 482162, "expected_md5": "658e5c13d8e5a80371d73ee1b04499bc" }, "2022/task-1/qrels-quality": { "url": "https://zenodo.org/record/6873574/files/touche-task1-2022-quality-dedup.qrels", "size_hint": 482162, "expected_md5": "8d12dac4ca8dfc0693ab34c926e87c1b" }, "2022/task-1/qrels-coherence": { "url": "https://zenodo.org/record/6873574/files/touche-task1-2022-coherence-dedup.qrels", "size_hint": 482162, "expected_md5": "5389bb1df02d3ba54a458eb969cbc5e9" }, "2022/task-2/queries": { "url": "https://zenodo.org/record/6873567/files/topics-task2-2022.zip", "size_hint": 17203, "expected_md5": "fafbb6352be108419535aaed83fc5762" }, "2022/task-2/qrels-relevance": { "url": "https://zenodo.org/record/6873567/files/touche-task2-2022-relevance.qrels", "size_hint": 78379, "expected_md5": "dcdd1031ce2e0830ae76d7b21fca2579" }, "2022/task-2/qrels-quality": { "url": "https://zenodo.org/record/6873567/files/touche-task2-2022-quality.qrels", "size_hint": 78379, "expected_md5": "adf28c877efcd94954263b392714cb60" }, "2022/task-2/qrels-stance": { "url": "https://zenodo.org/record/6873567/files/touche-task2-2022-stance.qrels", "size_hint": 84891, "expected_md5": "ac9afec4eb590877df9c94dd8d931a37" }, "2022/task-2/passages": { "url": "https://zenodo.org/record/6873567/files/touche-task2-passages-version-002.jsonl.gz", "size_hint": 
285743369, "expected_md5": "ed4d6104b78986849c59bbc470464cec" }, "2022/task-2/passages-expanded-doc-t5-query": { "url": "https://zenodo.org/record/6873567/files/touche-task2-passages-version-002-expanded-with-doc-t5-query.jsonl.gz", "size_hint": 300736532, "expected_md5": "062452996389a320b83ce274df82cf4b" }, "2022/task-3/queries": { "url": "https://zenodo.org/record/6873575/files/topics.xml", "size_hint": 31089, "expected_md5": "83a9cb2290f867199d6de9c73eeacf43" }, "2022/task-3/qrels": { "url": "https://zenodo.org/record/6873575/files/touche-task3-001-050-relevance.qrels", "size_hint": 558028, "expected_md5": "83ec2d715d0205b68b9b63f9c30da784" } }, "touche-image": { "2022-06-13/images-main": { "url": "https://zenodo.org/record/6873575/files/touche22-image-search-main.zip", "size_hint": 4498749006, "expected_md5": "e59b1c724d976af27596b5c8ad310fd5" }, "2022-06-13/images-nodes": { "url": "https://zenodo.org/record/6873575/files/touche22-image-search-nodes.zip", "size_hint": 5424503960, "expected_md5": "97b7117d02668fa3e93095d277efd56b" }, "2022-06-13/images-png": { "url": "https://zenodo.org/record/6873575/files/touche22-image-search-png-images.zip", "size_hint": 17851724760, "expected_md5": "e2965b221248ba23a288135f757efae1" } }, "trec-arabic": { "docs": { "instructions": "The dataset is based on the Arabic Newswire corpus. 
It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2001T55>\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-arabic.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "b17f34a51dca8d19fae66c338c9ed73a", "cache_path": "corpus.tgz" }, "ar2001/queries": { "url": "https://trec.nist.gov/data/topics_noneng/arabic_topics.txt", "irds_mirror": true, "size_hint": 10320, "expected_md5": "a3d78c379056a080fe40a59a341496b8", "cache_path": "ar2001/queries" }, "ar2001/qrels": { "url": "https://trec.nist.gov/data/qrels_noneng/xlingual_t10qrels.txt", "irds_mirror": true, "size_hint": 650331, "expected_md5": "5951e2f0bf72df9f93fc32b93e3a7fde", "cache_path": "ar2001/qrels" }, "ar2002/queries": { "url": "https://trec.nist.gov/data/topics_noneng/CL.topics.arabic.trec11.txt", "irds_mirror": true, "size_hint": 15873, "expected_md5": "f75a6164d794bab66509f1e818612363", "cache_path": "ar2002/queries" }, "ar2002/qrels": { "url": "https://trec.nist.gov/data/qrels_noneng/qrels.trec11.xlingual.txt", "irds_mirror": true, "size_hint": 1114528, "expected_md5": "40f25e1e98101e27d081685cbdc390ef", "cache_path": "ar2002/qrels" } }, "trec-cast": { "marco_dupes": { "url": "http://boston.lti.cs.cmu.edu/Services/treccast19/duplicate_list_v1.0.txt", "size_hint": 131116975, "expected_md5": "071d789ce7d8ebab4392dec62c5d17b7", "cache_path": "marco_duplicate_list_v1.0.txt" }, "wapo_dupes": { "url": "http://boston.lti.cs.cmu.edu/Services/treccast19/wapo_duplicate_list_v1.0.txt", "size_hint": 497283320, "expected_md5": "3f221bdd3a44d1850d698041a8cdd2bc", "cache_path": "wapo_duplicate_list_v1.0.txt" }, "v2/dupes/marco_v1": { "url": "https://github.com/daltonj/treccastweb/raw/master/2021/duplicate_files/marco_duplicates.txt", "size_hint": 131116975, "expected_md5": "549f721aec777b18f6538ddeabf6a8f3", "cache_path": "v2/dupes/marco_v1.txt" }, "v2/dupes/wapo": { "url": 
"https://github.com/daltonj/treccastweb/raw/master/2021/duplicate_files/wapo-near-duplicates", "size_hint": 98717888, "expected_md5": "23bacc7e03af656dc590fd6a5476bc83", "cache_path": "v2/dupes/wapo_near_duplicates.txt" }, "v2/offsets/msmarco": { "url": "https://huggingface.co/datasets/irds/trec_cast_offsets/resolve/main/MARCO_v1.chunks.jsonl.gz", "size_hint": 190998420, "expected_md5": "b76c8d1e3b260764d573ce618a15525f", "cache_path": "v2/offsets/marco_v1.jsonl.gz" }, "v2/offsets/wapo": { "url": "https://huggingface.co/datasets/irds/trec_cast_offsets/resolve/main/WaPo-v2.chunks.jsonl.gz", "size_hint": 43654392, "expected_md5": "900c56039b4a3edd642983c4a1e13796", "cache_path": "v2/offsets/wapo.jsonl.gz" }, "v2/offsets/kilt": { "url": "https://huggingface.co/datasets/irds/trec_cast_offsets/resolve/main/KILT-nodupes.chunks.jsonl.gz", "size_hint": 237224538, "expected_md5": "7bd9c844ea7d8ecc7a1236ef5c7d7722", "cache_path": "v2/offsets/kilt.jsonl.gz" }, "v3/dupes": { "url": "https://github.com/daltonj/treccastweb/raw/master/2022/duplicate_files/all_duplicates.txt", "size_hint": 21506576, "expected_md5": "2d6cbb4a2e8733423434cfe561635c21", "cache_path": "v3/all_duplicates.txt" }, "v3/offsets/wapo": { "url": "https://huggingface.co/datasets/irds/trec_cast_offsets/resolve/main/WaPo.chunks.jsonl.gz", "expected_md5": "1dbed58d09aeaa6046c234d056ea703e", "size_hint": 42785199, "cache_path": "v3/offsets/wapo.jsonl.gz" }, "v3/offsets/msmarco": { "url": "https://huggingface.co/datasets/irds/trec_cast_offsets/resolve/main/MARCO_v2.chunks.jsonl.gz", "expected_md5": "2f0bbe152b4645bff744892c5b53471f", "size_hint": 780228823, "cache_path": "v3/offsets/msmarco.jsonl.gz" }, "v3/offsets/kilt": { "url": "https://huggingface.co/datasets/irds/trec_cast_offsets/resolve/main/KILT.chunks.jsonl.gz", "expected_md5": "290cc1a172b7fc29b5ce211a87ced098", "size_hint": 237135443, "cache_path": "v3/offsets/kilt.jsonl.gz" }, "2019/train/queries": { "url": 
"https://raw.githubusercontent.com/daltonj/treccastweb/master/2019/data/training/train_topics_v1.0.json", "size_hint": 32800, "expected_md5": "2017389f5bbea04478574c6e84d65482", "cache_path": "2019/train_topics_v1.0.json" }, "2019/train/qrels": { "url": "https://raw.githubusercontent.com/daltonj/treccastweb/master/2019/data/training/train_topics_mod.qrel", "size_hint": 80964, "expected_md5": "84af27620dfc009f1f76a58e3d9d6c40", "cache_path": "2019/train_topics_mod.qrel" }, "2019/train/scoreddocs": { "url": "https://huggingface.co/datasets/macavaney/trec-cast-files/resolve/main/train_topics.teIn", "size_hint": 14766212, "expected_md5": "83bdb720e0c469390004598091021901", "cache_path": "2019/train_topics.teIn" }, "2019/eval/queries": { "url": "https://raw.githubusercontent.com/daltonj/treccastweb/master/2019/data/evaluation/evaluation_topics_v1.0.json", "size_hint": 57204, "expected_md5": "362283885194feefcab8441d2bb24f7c", "cache_path": "2019/evaluation_topics_v1.0.json" }, "2019/eval/qrels": { "url": "https://trec.nist.gov/data/cast/2019qrels.txt", "irds_mirror": true, "size_hint": 1138032, "expected_md5": "aab238105020c4cd55fae60dedfa9f1e", "cache_path": "2019/2019qrels.txt" }, "2019/eval/scoreddocs": { "url": "https://huggingface.co/datasets/macavaney/trec-cast-files/resolve/main/test_topics.teIn", "size_hint": 27691171, "expected_md5": "4c3958c09edab1b46474a337590c1ecd", "cache_path": "2019/test_topics.teIn" }, "2020/queries": { "url": "https://raw.githubusercontent.com/daltonj/treccastweb/master/2020/2020_manual_evaluation_topics_v1.0.json", "size_hint": 78998, "expected_md5": "98ae2be2c82e294895a83e76b4133e19", "cache_path": "2020/2020_manual_evaluation_topics_v1.0.json" }, "2020/qrels": { "url": "https://trec.nist.gov/data/cast/2020qrels.txt", "irds_mirror": true, "size_hint": 1563427, "expected_md5": "de6a8406217945bdbf1da304214ef60c", "cache_path": "2020/2020qrels.txt" }, "2021/queries": { "url": 
"https://raw.githubusercontent.com/daltonj/treccastweb/master/2021/2021_manual_evaluation_topics_v1.0.json", "expected_md5": "eafb4d4c5b914baab02a71c474e4ecc4", "size_hint": 363259, "cache_path": "2021/2021_manual_evaluation_topics_v1.0.json" }, "2021/qrels": { "url": "https://raw.githubusercontent.com/daltonj/treccastweb/master/2021/trec-cast-qrels-docs.2021.qrel", "irds_mirror": true, "expected_md5": "3393d7d2fbf8771511e3cf2448ad3dad", "size_hint": 507214, "cache_path": "2021/trec-cast-qrels-docs.2021.qrel" }, "2022/queries": { "url": "https://raw.githubusercontent.com/daltonj/treccastweb/master/2022/2022_evaluation_topics_tree_v1.0.json", "expected_md5": "a2996adf88b3933816f65ad5536c7860", "size_hint": 252600, "cache_path": "2022/2022_evaluation_topics_tree_v1.0.json" }, "2022/qrels": { "url": "https://trec.nist.gov/data/cast/2022-qrels.txt", "expected_md5": "1d1ffe3d3a3a7fc64278ac7cc72b8bde", "irds_mirror": true, "size_hint": 1435510, "cache_path": "2022/2022qrels.txt" } }, "trec-mandarin": { "docs": { "instructions": "The dataset is based on the TREC Mandarin corpus. 
It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2000T52>\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-mandarin.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "a847fa029a1356b8f396aa642c449e38", "cache_path": "corpus.tgz" }, "trec5/queries": { "url": "https://trec.nist.gov/data/topics_noneng/topics.CH1-CH28.chinese.english.gz", "irds_mirror": true, "size_hint": 9136, "expected_md5": "9ce885d36e8642d4114f40e7008e5b8a", "cache_path": "trec5/queries.gz" }, "trec5/qrels": { "url": "https://trec.nist.gov/data/qrels_noneng/qrels.1-28.chinese.gz", "irds_mirror": true, "size_hint": 76063, "expected_md5": "73693083d75ef323fca2a218604b41ac", "cache_path": "trec5/qrels.gz" }, "trec6/queries": { "url": "https://trec.nist.gov/data/topics_noneng/topics.CH29-CH54.chinese.english.gz", "irds_mirror": true, "size_hint": 8920, "expected_md5": "c3a58ec59e55c162fdc3e3a9c5e9b8a7", "cache_path": "trec6/queries.gz" }, "trec6/qrels": { "url": "https://trec.nist.gov/data/qrels_noneng/qrels.trec6.29-54.chinese.gz", "irds_mirror": true, "size_hint": 44468, "expected_md5": "675ab2f14fad9017d646d052c0b35c46", "cache_path": "trec6/qrels.gz" } }, "trec-spanish": { "docs": { "instructions": "The dataset is based on the TREC Spanish corpus. 
It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2000T51>\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-spanish.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "4b8583c03149cf0c06c090fd230b33c6", "cache_path": "corpus.tgz" }, "trec3/queries": { "url": "https://trec.nist.gov/data/topics_noneng/topics.SP1-SP25.spanish.english.gz", "irds_mirror": true, "size_hint": 9029, "expected_md5": "22eea4a5c131db9cc4a431235f6a0573", "cache_path": "trec3/queries.gz" }, "trec3/qrels": { "url": "https://trec.nist.gov/data/qrels_noneng/qrels.1-25.spanish.gz", "irds_mirror": true, "size_hint": 64178, "expected_md5": "e1703487f43fb7ea30b87a0f14ccb5ce", "cache_path": "trec3/qrels.gz" }, "trec4/queries": { "url": "https://trec.nist.gov/data/topics_noneng/topics.SP26-SP50.spanish.english.gz", "irds_mirror": true, "size_hint": 2091, "expected_md5": "dfd9685cce559e33ab397c1878a6a1f8", "cache_path": "trec4/querie.gz" }, "trec4/qrels": { "url": "https://trec.nist.gov/data/qrels_noneng/qrels.26-50.spanish.gz", "irds_mirror": true, "size_hint": 46394, "expected_md5": "f2540f9fb83433ca8ef9503671136498", "cache_path": "trec4/qrels.gz" } }, "trec-tot": { "2023": { "url": "https://surfdrive.surf.nl/files/index.php/s/FaEK4xc6Xp2JcAJ/download", "expected_md5": "f84fe82cb80e3ee1072576c8d6c4a417", "cache_path": "trec-tot.zip" }, "2024": { "url": "https://zenodo.org/records/13370657/files/corpus.jsonl.zip?download=1", "expected_md5": "4ea86770817e46a06fea5c94f596409c", "cache_path": "trec-tot-2024-corpus.zip" }, "2024-test": { "url": "https://zenodo.org/records/13370657/files/test-2024.zip?download=1", "expected_md5": "3d0a4d83957ee6a1398afefbc96162fa", "cache_path": "trec-tot-2024-queries.zip" } }, "trec-tot-2025": { "trec-tot-2025-offsets.jsonl.gz": { "url": "https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz", "expected_md5": "00678e3155d962bb244e034e6401b79b", "cache_path": 
"trec-tot-2025-offsets.jsonl.gz" }, "trec-tot-2025-corpus.jsonl.gz": { "url": "https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz", "expected_md5": "a2c82398aa86df6a68c8706b9b462bf2", "cache_path": "trec-tot-2025-corpus.jsonl.gz" }, "train-2025-qrel.txt": { "url": "https://zenodo.org/records/15356599/files/train-2025-qrel.txt", "expected_md5": "10a3c727fc5806ec4510f7a071b57cd7", "cache_path": "train-2025-qrel.txt" }, "train-2025-queries.jsonl": { "url": "https://zenodo.org/records/15356599/files/train-2025-queries.jsonl", "expected_md5": "288b7707b4e897f7447aac2cc2f613be", "cache_path": "train-2025-queries.jsonl" }, "dev1-2025-qrel.txt": { "url": "https://zenodo.org/records/15356599/files/dev1-2025-qrel.txt", "expected_md5": "0c913ce8b5b287c73a6dfac662971e82", "cache_path": "dev1-2025-qrel.txt" }, "dev1-2025-queries.jsonl": { "url": "https://zenodo.org/records/15356599/files/dev1-2025-queries.jsonl", "expected_md5": "b87c2f51d058de844e258a69b02e70fc", "cache_path": "dev1-2025-queries.jsonl" }, "dev2-2025-qrel.txt": { "url": "https://zenodo.org/records/15356599/files/dev2-2025-qrel.txt", "expected_md5": "4548eb41e639905384aa017c69129bfc", "cache_path": "dev2-2025-qrel.txt" }, "dev2-2025-queries.jsonl": { "url": "https://zenodo.org/records/15356599/files/dev2-2025-queries.jsonl", "expected_md5": "b174a128a255e92d0d54b76465d596b5", "cache_path": "dev2-2025-queries.jsonl" }, "dev3-2025-qrel.txt": { "url": "https://zenodo.org/records/15356599/files/dev3-2025-qrel.txt", "expected_md5": "48ab0d24a5946861546e54064238477f", "cache_path": "dev3-2025-qrel.txt" }, "dev3-2025-queries.jsonl": { "url": "https://zenodo.org/records/15356599/files/dev3-2025-queries.jsonl", "expected_md5": "259c11645694a3c5230b66c7852d4d80", "cache_path": "dev3-2025-queries.jsonl" }, "test-2025-queries.jsonl": { "url": "https://zenodo.org/records/15869078/files/test-2025-queries.jsonl", "expected_md5": "374cdc9142240f8bc9e4b071c35713f8", "cache_path": "test-2025-queries.jsonl" } 
}, "tripclick": { "benchmark": { "instructions": "To use this dataset, you need to request the source files from the Trip Database here: <https://tripdatabase.github.io/tripclick/#getting-the-data>.\nMore details about the procedure can be found here: <https://ir-datasets.com/tripclick.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "6e5d3deeba138750e9a148b538f30a8f", "cache_path": "benchmark.tar.gz" }, "dlfiles": { "instructions": "To use this dataset, you need to request the source files from the Trip Database here: <https://tripdatabase.github.io/tripclick/#getting-the-data>.\nMore details about the procedure can be found here: <https://ir-datasets.com/tripclick.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "1f256c19466b414e365324d8ef21f09c", "cache_path": "dlfiles.tar.gz" }, "dlfiles_runs_test": { "instructions": "To use this dataset, you need to request the source files from the Trip Database here: <https://tripdatabase.github.io/tripclick/#getting-the-data>.\nMore details about the procedure can be found here: <https://ir-datasets.com/tripclick.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "2b5e98c683a91e19630636b6f83e3b15", "cache_path": "dlfiles_runs_test.tar.gz" }, "logs": { "instructions": "To use this dataset, you need to request the source files from the Trip Database here: <https://tripdatabase.github.io/tripclick/#getting-the-data>.\nMore details about the procedure can be found here: <https://ir-datasets.com/tripclick.html#DataAccess>.\nTo proceed, symlink the source file here: {path}", "expected_md5": "1d3a548685c2fbef9b2076b0b04ba44f", "cache_path": "logs.tar.gz" }, "hofstaetter-triples": { "url": "https://huggingface.co/datasets/sebastian-hofstaetter/tripclick-training/resolve/main/improved_tripclick_train_triple-ids.tsv", "size_hint": 233053452, "expected_md5": "8d70808ec06570e02bc4014ed033b5d0", "cache_path": 
"improved_tripclick_train_triple-ids.tsv" } }, "trec-robust04": { "docs": { "instructions": "The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. See details here: <https://trec.nist.gov/data/cd45/index.html>.\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-robust04.html#DataAccess>.\nOnce completed, place the uncompressed source here: {path}\nThis should contain directories like NEWS_data/FBIS, NEWS_data/FR94, etc.", "irds_mirror": true, "cache_path": "trec45" }, "queries": { "url": "https://trec.nist.gov/data/robust/04.testset.gz", "irds_mirror": true, "size_hint": 34293, "expected_md5": "5eac3d774a2f87da61c08a94f945beff", "cache_path": "queries.gz" }, "qrels": { "url": "https://trec.nist.gov/data/robust/qrels.robust2004.txt", "irds_mirror": true, "size_hint": 6543541, "expected_md5": "123c2a0ba2ec31178cb1050995dcfdfa", "cache_path": "qrels" } }, "tweets2013-ia": { "docs/feb": { "url": "https://archive.org/download/archiveteam-twitter-stream-2013-02/archiveteam-twitter-stream-2013-02.tar", "expected_md5": "e82916d37116c781afff750e2127156f", "stream": true, "skip_local": true, "skip_test": true }, "docs/mar": { "url": "https://archive.org/download/archiveteam-twitter-stream-2013-03/archiveteam-twitter-stream-2013-03.tar", "expected_md5": "486817a372e03298162daa0e80dc6399", "stream": true, "skip_local": true, "skip_test": true }, "trec-mb-2013/queries": { "url": "https://trec.nist.gov/data/microblog/2013/topics.MB111-170.txt", "irds_mirror": true, "size_hint": 11471, "expected_md5": "0b78d99dfa2d655dca7e9f138a93c21a", "cache_path": "trec-mb-2013/queries.txt" }, "trec-mb-2013/qrels": { "url": "https://trec.nist.gov/data/microblog/2013/qrels.txt", "irds_mirror": true, "size_hint": 1995812, "expected_md5": "4776a5dfd80b3f675184315ec989c02f", "cache_path": "trec-mb-2013/qrels" }, 
"trec-mb-2014/queries": { "url": "https://trec.nist.gov/data/microblog/2014/topics.desc.MB171-225.txt", "irds_mirror": true, "size_hint": 17785, "expected_md5": "e9d520f976176e710fd68bb3a065a3e7", "cache_path": "trec-mb-2014/queries.txt" }, "trec-mb-2014/qrels": { "url": "https://trec.nist.gov/data/microblog/2014/qrels2014.txt", "irds_mirror": true, "size_hint": 1623580, "expected_md5": "68d9a1920b244f6ccdc687ee1d473214", "cache_path": "trec-mb-2014/qrels" } }, "vaswani": { "main": { "url": "http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz", "size_hint": 2125168, "expected_md5": "23e5607081191b153738e81fbd834680", "cache_path": "npl.tar.gz" } }, "wapo": { "v2": { "instructions": "The Washington Post collection can be requested here: <https://trec.nist.gov/data/wapost/>\nMore details about the procedure can be found here: <https://ir-datasets.com/wapo.html#DataAccess>.\nOnce completed, place/link the source file: {path}", "expected_md5": "ce6e93f6ce9959b72c2de4f8d12089ab", "cache_path": "WashingtonPost.v2.tar.gz" }, "v4": { "instructions": "The Washington Post collection can be requested here: <https://trec.nist.gov/data/wapost/>\nMore details about the procedure can be found here: <https://ir-datasets.com/wapo.html#DataAccess>.\nOnce completed, place/link the source file: {path}", "expected_md5": "b45b8d34393b4df72737c11aa7fb2b3d", "cache_path": "WashingtonPost.v4.tar.gz" }, "trec-core-2018/queries": { "url": "https://trec.nist.gov/data/core/topics2018.txt", "irds_mirror": true, "size_hint": 24079, "expected_md5": "1b11276f0e1badd68347884664816654", "cache_path": "trec-core-2018/queries.txt" }, "trec-core-2018/qrels": { "url": "https://trec.nist.gov/data/core/qrels2018.txt", "irds_mirror": true, "size_hint": 1121301, "expected_md5": "7a982cd110f8bb30da4141f0f639f2e1", "cache_path": "trec-core-2018/qrels" }, "trec-news-2018/queries": { "url": "https://trec.nist.gov/data/news/2018/newsir18-topics.txt", "irds_mirror": true, "size_hint": 12489, 
"expected_md5": "73740793543b439d1ff1b8ee9359973a", "cache_path": "trec-news-2018/queries.txt" }, "trec-news-2018/qrels": { "url": "https://trec.nist.gov/data/news/2018/bqrels.exp-gains.txt", "irds_mirror": true, "size_hint": 364062, "expected_md5": "396963175006cb3201ea7c16e874033a", "cache_path": "trec-news-2018/qrels" }, "trec-news-2019/queries": { "url": "https://trec.nist.gov/data/news/2019/newsir19-background-linking-topics.xml", "irds_mirror": true, "size_hint": 14847, "expected_md5": "388b5c96f8962da17eb1024b856d21c1", "cache_path": "trec-news-2019/queries.txt" }, "trec-news-2019/qrels": { "url": "https://trec.nist.gov/data/news/2019/newsir19-qrels-background.txt", "irds_mirror": true, "size_hint": 669632, "expected_md5": "7b839a1a94e349d3facf28012542cc1d", "cache_path": "trec-news-2019/qrels" }, "trec-news-2020/queries": { "url": "https://trec.nist.gov/data/news/2020/newsir20-topics.txt", "irds_mirror": true, "size_hint": 13217, "expected_md5": "2674538a07fb7ac29200cbc4c4a05404", "cache_path": "trec-news-2020/queries.txt" }, "trec-news-2020/qrels": { "url": "https://trec.nist.gov/data/news/2020/qrels.background", "irds_mirror": true, "size_hint": 729348, "expected_md5": "7c31f731775bdd4148d349df1a9e43fc", "cache_path": "trec-news-2020/qrels" } }, "wikiclir": { "source": { "url": "https://www.cs.jhu.edu/~kevinduh/a/wikiclir2018/wiki-clir.tar.gz", "size_hint": 7036445773, "expected_md5": "705abb611eb8cbab9ced2b8767a3bdb6" } }, "wikir": { "en1k": { "url": "https://zenodo.org/record/3565761/files/wikIR1k.zip", "size_hint": 164995559, "expected_md5": "554299bca984640cb283d6ba55753608" }, "en59k": { "url": "https://zenodo.org/record/3557342/files/wikIR59k.zip", "size_hint": 1154400672, "expected_md5": "c9f7e646e022eea84e6f00e3870ca79b" }, "en78k": { "url": "https://www.zenodo.org/record/3707606/files/enwikIR.zip", "size_hint": 4234761118, "expected_md5": "e1a1f7678523032e0be5fedaed6c0740" }, "ens78k": { "url": 
"https://www.zenodo.org/record/3707238/files/enwikIRS.zip", "size_hint": 4245785781, "expected_md5": "8fd2e530ec9dfd17f3b305ec23122b55" }, "fr14k": { "url": "https://zenodo.org/record/3569718/files/FRwikIR14k.zip", "size_hint": 331209361, "expected_md5": "0bf8a8965b1a550ad3604a9ddd5c6bbe" }, "es13k": { "url": "https://zenodo.org/record/3569724/files/ESwikIR13k.zip", "size_hint": 299523201, "expected_md5": "4847eeffbf261d3877da86f5ccae4e43" }, "it16k": { "url": "https://zenodo.org/record/3569732/files/ITwikIR16k.zip", "size_hint": 248875419, "expected_md5": "e9c5b81c9df6fdc0e2986fa8ffb8ff12" } }, "sara": { "docs": { "url": "https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1", "expected_md5": "e806b1d5ce35c94cec2899e190db7dd7" }, "queries": { "url": "https://raw.githubusercontent.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments/main/repeated_queries.tsv", "expected_md5": "fc0247928a0b93bb344068fa238a5e3f", "cache_path": "queries.csv" }, "qrels": { "url": "https://raw.githubusercontent.com/JackMcKechnie/SARA/refs/heads/main/combined_qrels.txt", "expected_md5": "39a24d38b4d0e352e7818abd09d6815a", "cache_path": "qrels.txt" } } } ================================================ FILE: ir_datasets/etc/metadata.json ================================================ { "antique": {"docs": {"count": 403666, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}}, "antique/test": {"docs": {"_ref": "antique"}, "queries": {"count": 200}, "qrels": {"count": 6589, "fields": {"relevance": {"counts_by_value": {"4": 1334, "1": 1642, "2": 2417, "3": 1196}}}}}, "antique/test/non-offensive": {"docs": {"_ref": "antique"}, "queries": {"count": 176}, "qrels": {"count": 5752, "fields": {"relevance": {"counts_by_value": {"4": 1195, "1": 1407, "2": 2101, "3": 1049}}}}}, "antique/train": {"docs": {"_ref": "antique"}, "queries": {"count": 2426}, "qrels": {"count": 27422, "fields": {"relevance": {"counts_by_value": {"4": 11733, "3": 8080, 
"2": 6337, "1": 1272}}}}}, "antique/train/split200-train": {"docs": {"_ref": "antique"}, "queries": {"count": 2226}, "qrels": {"count": 25229, "fields": {"relevance": {"counts_by_value": {"4": 10782, "3": 7447, "2": 5829, "1": 1171}}}}}, "antique/train/split200-valid": {"docs": {"_ref": "antique"}, "queries": {"count": 200}, "qrels": {"count": 2193, "fields": {"relevance": {"counts_by_value": {"4": 951, "2": 508, "3": 633, "1": 101}}}}}, "aol-ia": {"docs": {"count": 1525586, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}, "queries": {"count": 9966939}, "qrels": {"count": 19442629, "fields": {"relevance": {"counts_by_value": {"1": 19442629}}}}, "qlogs": {"count": 36389567}}, "aquaint": {"docs": {"count": 1033461, "fields": {"doc_id": {"max_len": 16, "common_prefix": ""}}}}, "aquaint/trec-robust-2005": {"docs": {"_ref": "aquaint"}, "queries": {"count": 50}, "qrels": {"count": 37798, "fields": {"relevance": {"counts_by_value": {"2": 2790, "1": 3771, "0": 31237}}}}}, "argsme": {}, "argsme/1.0": {"docs": {"count": 387692, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}}, "argsme/1.0-cleaned": {"docs": {"count": 382545, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}}, "argsme/1.0/touche-2020-task-1/uncorrected": {"docs": {"_ref": "argsme/1.0"}, "queries": {"count": 49}, "qrels": {"count": 2964, "fields": {"relevance": {"counts_by_value": {"4": 1006, "5": 398, "3": 628, "2": 195, "-2": 551, "1": 186}}}}}, "argsme/2020-04-01": {"docs": {"count": 387740, "fields": {"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/debateorg": {"docs": {"count": 338620, "fields": {"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/debatepedia": {"docs": {"count": 21197, "fields": {"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/debatewise": {"docs": {"count": 14353, "fields": {"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/idebate": {"docs": {"count": 13522, "fields": 
{"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/parliamentary": {"docs": {"count": 48, "fields": {"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/processed": {"docs": {"count": 365408, "fields": {"doc_id": {"max_len": 19, "common_prefix": "S"}}}}, "argsme/2020-04-01/processed/touche-2022-task-1": {"docs": {"_ref": "argsme/2020-04-01/processed"}, "queries": {"count": 50}, "qrels": {"count": 6841, "fields": {"relevance": {"counts_by_value": {"2": 2020, "0": 1802, "1": 3019}}}}}, "argsme/2020-04-01/touche-2020-task-1": {"docs": {"_ref": "argsme/2020-04-01"}, "queries": {"count": 49}, "qrels": {"count": 2298, "fields": {"relevance": {"counts_by_value": {"0": 615, "1": 296, "-2": 751, "2": 636}}}}}, "argsme/2020-04-01/touche-2020-task-1/uncorrected": {"docs": {"_ref": "argsme/2020-04-01"}, "queries": {"_ref": "argsme/2020-04-01/touche-2020-task-1"}, "qrels": {"count": 2298, "fields": {"relevance": {"counts_by_value": {"4": 665, "3": 485, "-2": 380, "5": 425, "2": 199, "1": 144}}}}}, "argsme/2020-04-01/touche-2021-task-1": {"docs": {"_ref": "argsme/2020-04-01"}, "queries": {"count": 50}, "qrels": {"count": 3711, "fields": {"relevance": {"counts_by_value": {"2": 1082, "0": 1542, "1": 736, "-2": 351}}}}}, "beir": {}, "beir/arguana": {"docs": {"count": 8674, "fields": {"doc_id": {"max_len": 47, "common_prefix": ""}}}, "queries": {"count": 1406}, "qrels": {"count": 1406, "fields": {"relevance": {"counts_by_value": {"1": 1406}}}}}, "beir/climate-fever": {"docs": {"count": 5416593, "fields": {"doc_id": {"max_len": 221, "common_prefix": ""}}}, "queries": {"count": 1535}, "qrels": {"count": 4681, "fields": {"relevance": {"counts_by_value": {"1": 4681}}}}}, "beir/cqadupstack/android": {"docs": {"count": 22998, "fields": {"doc_id": {"max_len": 5, "common_prefix": ""}}}, "queries": {"count": 699}, "qrels": {"count": 1696, "fields": {"relevance": {"counts_by_value": {"1": 1696}}}}}, "beir/cqadupstack/english": {"docs": {"count": 40221, 
"fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 1570}, "qrels": {"count": 3765, "fields": {"relevance": {"counts_by_value": {"1": 3765}}}}}, "beir/cqadupstack/gaming": {"docs": {"count": 45301, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 1595}, "qrels": {"count": 2263, "fields": {"relevance": {"counts_by_value": {"1": 2263}}}}}, "beir/cqadupstack/gis": {"docs": {"count": 37637, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 885}, "qrels": {"count": 1114, "fields": {"relevance": {"counts_by_value": {"1": 1114}}}}}, "beir/cqadupstack/mathematica": {"docs": {"count": 16705, "fields": {"doc_id": {"max_len": 5, "common_prefix": ""}}}, "queries": {"count": 804}, "qrels": {"count": 1358, "fields": {"relevance": {"counts_by_value": {"1": 1358}}}}}, "beir/cqadupstack/physics": {"docs": {"count": 38316, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 1039}, "qrels": {"count": 1933, "fields": {"relevance": {"counts_by_value": {"1": 1933}}}}}, "beir/cqadupstack/programmers": {"docs": {"count": 32176, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 876}, "qrels": {"count": 1675, "fields": {"relevance": {"counts_by_value": {"1": 1675}}}}}, "beir/cqadupstack/stats": {"docs": {"count": 42269, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 652}, "qrels": {"count": 913, "fields": {"relevance": {"counts_by_value": {"1": 913}}}}}, "beir/cqadupstack/tex": {"docs": {"count": 68184, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 2906}, "qrels": {"count": 5154, "fields": {"relevance": {"counts_by_value": {"1": 5154}}}}}, "beir/cqadupstack/unix": {"docs": {"count": 47382, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 1072}, "qrels": {"count": 1693, "fields": {"relevance": {"counts_by_value": {"1": 1693}}}}}, 
"beir/cqadupstack/webmasters": {"docs": {"count": 17405, "fields": {"doc_id": {"max_len": 5, "common_prefix": ""}}}, "queries": {"count": 506}, "qrels": {"count": 1395, "fields": {"relevance": {"counts_by_value": {"1": 1395}}}}}, "beir/cqadupstack/wordpress": {"docs": {"count": 48605, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 541}, "qrels": {"count": 744, "fields": {"relevance": {"counts_by_value": {"1": 744}}}}}, "beir/dbpedia-entity": {"docs": {"count": 4635922, "fields": {"doc_id": {"max_len": 200, "common_prefix": ""}}}, "queries": {"count": 467}}, "beir/dbpedia-entity/dev": {"docs": {"_ref": "beir/dbpedia-entity"}, "queries": {"count": 67}, "qrels": {"count": 5673, "fields": {"relevance": {"counts_by_value": {"0": 4268, "1": 1024, "2": 381}}}}}, "beir/dbpedia-entity/test": {"docs": {"_ref": "beir/dbpedia-entity"}, "queries": {"count": 400}, "qrels": {"count": 43515, "fields": {"relevance": {"counts_by_value": {"0": 28229, "1": 8785, "2": 6501}}}}}, "beir/fever": {"docs": {"count": 5416568, "fields": {"doc_id": {"max_len": 221, "common_prefix": ""}}}, "queries": {"count": 123142}}, "beir/fever/dev": {"docs": {"_ref": "beir/fever"}, "queries": {"count": 6666}, "qrels": {"count": 8079, "fields": {"relevance": {"counts_by_value": {"1": 8079}}}}}, "beir/fever/test": {"docs": {"_ref": "beir/fever"}, "queries": {"count": 6666}, "qrels": {"count": 7937, "fields": {"relevance": {"counts_by_value": {"1": 7937}}}}}, "beir/fever/train": {"docs": {"_ref": "beir/fever"}, "queries": {"count": 109810}, "qrels": {"count": 140085, "fields": {"relevance": {"counts_by_value": {"1": 140085}}}}}, "beir/fiqa": {"docs": {"count": 57638, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 6648}}, "beir/fiqa/dev": {"docs": {"_ref": "beir/fiqa"}, "queries": {"count": 500}, "qrels": {"count": 1238, "fields": {"relevance": {"counts_by_value": {"1": 1238}}}}}, "beir/fiqa/test": {"docs": {"_ref": "beir/fiqa"}, "queries": 
{"count": 648}, "qrels": {"count": 1706, "fields": {"relevance": {"counts_by_value": {"1": 1706}}}}}, "beir/fiqa/train": {"docs": {"_ref": "beir/fiqa"}, "queries": {"count": 5500}, "qrels": {"count": 14166, "fields": {"relevance": {"counts_by_value": {"1": 14166}}}}}, "beir/hotpotqa": {"docs": {"count": 5233329, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 97852}}, "beir/hotpotqa/dev": {"docs": {"_ref": "beir/hotpotqa"}, "queries": {"count": 5447}, "qrels": {"count": 10894, "fields": {"relevance": {"counts_by_value": {"1": 10894}}}}}, "beir/hotpotqa/test": {"docs": {"_ref": "beir/hotpotqa"}, "queries": {"count": 7405}, "qrels": {"count": 14810, "fields": {"relevance": {"counts_by_value": {"1": 14810}}}}}, "beir/hotpotqa/train": {"docs": {"_ref": "beir/hotpotqa"}, "queries": {"count": 85000}, "qrels": {"count": 170000, "fields": {"relevance": {"counts_by_value": {"1": 170000}}}}}, "beir/msmarco": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 509962}}, "beir/msmarco/dev": {"docs": {"_ref": "beir/msmarco"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}}, "beir/msmarco/test": {"docs": {"_ref": "beir/msmarco"}, "queries": {"count": 43}, "qrels": {"count": 9260, "fields": {"relevance": {"counts_by_value": {"0": 5158, "1": 1601, "2": 1804, "3": 697}}}}}, "beir/msmarco/train": {"docs": {"_ref": "beir/msmarco"}, "queries": {"count": 502939}, "qrels": {"count": 532751, "fields": {"relevance": {"counts_by_value": {"1": 532751}}}}}, "beir/nfcorpus": {"docs": {"count": 3633, "fields": {"doc_id": {"max_len": 8, "common_prefix": "MED-"}}}, "queries": {"count": 3237}}, "beir/nfcorpus/dev": {"docs": {"_ref": "beir/nfcorpus"}, "queries": {"count": 324}, "qrels": {"count": 11385, "fields": {"relevance": {"counts_by_value": {"2": 521, "1": 10864}}}}}, "beir/nfcorpus/test": {"docs": {"_ref": "beir/nfcorpus"}, "queries": 
{"count": 323}, "qrels": {"count": 12334, "fields": {"relevance": {"counts_by_value": {"2": 576, "1": 11758}}}}}, "beir/nfcorpus/train": {"docs": {"_ref": "beir/nfcorpus"}, "queries": {"count": 2590}, "qrels": {"count": 110575, "fields": {"relevance": {"counts_by_value": {"1": 110575}}}}}, "beir/nq": {"docs": {"count": 2681468, "fields": {"doc_id": {"max_len": 10, "common_prefix": "doc"}}}, "queries": {"count": 3452}, "qrels": {"count": 4201, "fields": {"relevance": {"counts_by_value": {"1": 4201}}}}}, "beir/quora": {"docs": {"count": 522931, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 15000}}, "beir/quora/dev": {"docs": {"_ref": "beir/quora"}, "queries": {"count": 5000}, "qrels": {"count": 7626, "fields": {"relevance": {"counts_by_value": {"1": 7626}}}}}, "beir/quora/test": {"docs": {"_ref": "beir/quora"}, "queries": {"count": 10000}, "qrels": {"count": 15675, "fields": {"relevance": {"counts_by_value": {"1": 15675}}}}}, "beir/scidocs": {"docs": {"count": 25657, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}, "queries": {"count": 1000}, "qrels": {"count": 29928, "fields": {"relevance": {"counts_by_value": {"1": 4928, "0": 25000}}}}}, "beir/scifact": {"docs": {"count": 5183, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 1109}}, "beir/scifact/test": {"docs": {"_ref": "beir/scifact"}, "queries": {"count": 300}, "qrels": {"count": 339, "fields": {"relevance": {"counts_by_value": {"1": 339}}}}}, "beir/scifact/train": {"docs": {"_ref": "beir/scifact"}, "queries": {"count": 809}, "qrels": {"count": 919, "fields": {"relevance": {"counts_by_value": {"1": 919}}}}}, "beir/trec-covid": {"docs": {"count": 171332, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 66336, "fields": {"relevance": {"counts_by_value": {"2": 14217, "1": 10456, "0": 41661, "-1": 2}}}}}, "beir/webis-touche2020": {"docs": {"count": 382545, "fields": {"doc_id": 
{"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 2962, "fields": {"relevance": {"counts_by_value": {"4": 1006, "5": 398, "3": 628, "2": 195, "-2": 549, "1": 186}}}}}, "beir/webis-touche2020/v2": {"docs": {"count": 382545, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 2214, "fields": {"relevance": {"counts_by_value": {"0": 1282, "1": 296, "2": 636}}}}}, "c4": {}, "c4/en-noclean-tr": {"docs": {"count": 1063805381, "fields": {"doc_id": {"max_len": 41, "common_prefix": "en.noclean.c4-train.0"}}}}, "c4/en-noclean-tr/trec-misinfo-2021": {"docs": {"_ref": "c4/en-noclean-tr"}, "queries": {"count": 50}}, "car": {}, "car/v1.5": {"docs": {"count": 29678367, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}}, "car/v1.5/test200": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 1987}, "qrels": {"count": 4706, "fields": {"relevance": {"counts_by_value": {"1": 4706}}}}}, "car/v1.5/train/fold0": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 467946}, "qrels": {"count": 1054369, "fields": {"relevance": {"counts_by_value": {"1": 1054369}}}}}, "car/v1.5/train/fold1": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 466596}, "qrels": {"count": 1052398, "fields": {"relevance": {"counts_by_value": {"1": 1052398}}}}}, "car/v1.5/train/fold2": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 469323}, "qrels": {"count": 1061162, "fields": {"relevance": {"counts_by_value": {"1": 1061162}}}}}, "car/v1.5/train/fold3": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 463314}, "qrels": {"count": 1046784, "fields": {"relevance": {"counts_by_value": {"1": 1046784}}}}}, "car/v1.5/train/fold4": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 468789}, "qrels": {"count": 1061911, "fields": {"relevance": {"counts_by_value": {"1": 1061911}}}}}, "car/v1.5/trec-y1": {"docs": {"_ref": "car/v1.5"}, "queries": {"count": 2287}}, "car/v1.5/trec-y1/auto": {"docs": {"_ref": "car/v1.5"}, 
"queries": {"_ref": "car/v1.5/trec-y1"}, "qrels": {"count": 5820, "fields": {"relevance": {"counts_by_value": {"1": 5820}}}}}, "car/v1.5/trec-y1/manual": {"docs": {"_ref": "car/v1.5"}, "queries": {"_ref": "car/v1.5/trec-y1"}, "qrels": {"count": 29571, "fields": {"relevance": {"counts_by_value": {"-1": 12785, "0": 9219, "1": 3094, "2": 1970, "3": 2461, "-2": 42}}}}}, "car/v2.0": {"docs": {"count": 29794697, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}}, "clinicaltrials": {}, "clinicaltrials/2017": {"docs": {"count": 241006, "fields": {"doc_id": {"max_len": 11, "common_prefix": "NCT0"}}}}, "clinicaltrials/2017/trec-pm-2017": {"docs": {"_ref": "clinicaltrials/2017"}, "queries": {"count": 30}, "qrels": {"count": 13019, "fields": {"relevance": {"counts_by_value": {"0": 11848, "1": 735, "2": 436}}}}}, "clinicaltrials/2017/trec-pm-2018": {"docs": {"_ref": "clinicaltrials/2017"}, "queries": {"count": 50}, "qrels": {"count": 14188, "fields": {"relevance": {"counts_by_value": {"0": 12141, "2": 873, "1": 1174}}}}}, "clinicaltrials/2019": {"docs": {"count": 306238, "fields": {"doc_id": {"max_len": 11, "common_prefix": "NCT0"}}}}, "clinicaltrials/2019/trec-pm-2019": {"docs": {"_ref": "clinicaltrials/2019"}, "queries": {"count": 40}, "qrels": {"count": 12996, "fields": {"relevance": {"counts_by_value": {"0": 10811, "1": 1700, "2": 485}}}}}, "clinicaltrials/2021": {"docs": {"count": 375580, "fields": {"doc_id": {"max_len": 11, "common_prefix": "NCT0"}}}}, "clinicaltrials/2021/trec-ct-2021": {"docs": {"_ref": "clinicaltrials/2021"}, "queries": {"count": 75}, "qrels": {"count": 35832, "fields": {"relevance": {"counts_by_value": {"1": 6019, "0": 24243, "2": 5570}}}}}, "clinicaltrials/2021/trec-ct-2022": {"docs": {"_ref": "clinicaltrials/2021"}, "queries": {"count": 50}}, "clirmatrix": {}, "clueweb09": {"docs": {"count": 1040859705, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-"}}}}, "clueweb09/ar": {"docs": {"count": 29192662, "fields": {"doc_id": 
{"max_len": 25, "common_prefix": "clueweb09-ar000"}}}}, "clueweb09/catb": {"docs": {"count": 50220423, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-en"}}}}, "clueweb09/catb/trec-web-2009": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 13118, "fields": {"relevance": {"counts_by_value": {"0": 9116, "1": 2514, "2": 1488}}}}}, "clueweb09/catb/trec-web-2009/diversity": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 16347, "fields": {"relevance": {"counts_by_value": {"0": 12266, "1": 4081}}}}}, "clueweb09/catb/trec-web-2010": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 15845, "fields": {"relevance": {"counts_by_value": {"0": 12040, "1": 2318, "-2": 715, "2": 682, "3": 90}}}}}, "clueweb09/catb/trec-web-2010/diversity": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 5522, "fields": {"relevance": {"counts_by_value": {"1": 5522}}}}}, "clueweb09/catb/trec-web-2011": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 13081, "fields": {"relevance": {"counts_by_value": {"0": 10920, "1": 1100, "2": 354, "-2": 499, "3": 208}}}}}, "clueweb09/catb/trec-web-2011/diversity": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 43889, "fields": {"relevance": {"counts_by_value": {"0": 37665, "1": 3016, "2": 919, "-2": 1733, "3": 556}}}}}, "clueweb09/catb/trec-web-2012": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 10022, "fields": {"relevance": {"counts_by_value": {"-2": 561, "0": 7178, "1": 1386, "4": 580, "2": 300, "3": 17}}}}}, "clueweb09/catb/trec-web-2012/diversity": {"docs": {"_ref": "clueweb09/catb"}, "queries": {"count": 50}, "qrels": {"count": 38992, "fields": {"relevance": {"counts_by_value": {"-2": 2237, "0": 30669, "1": 3494, "4": 1658, "2": 887, "3": 47}}}}}, "clueweb09/de": {"docs": {"count": 49814309, "fields": 
{"doc_id": {"max_len": 25, "common_prefix": "clueweb09-de00"}}}}, "clueweb09/en": {"docs": {"count": 503903810, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-en"}}}}, "clueweb09/en/trec-web-2009": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 23601, "fields": {"relevance": {"counts_by_value": {"0": 16743, "1": 4832, "2": 2026}}}}}, "clueweb09/en/trec-web-2009/diversity": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 27964, "fields": {"relevance": {"counts_by_value": {"0": 21465, "1": 6499}}}}}, "clueweb09/en/trec-web-2010": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 25329, "fields": {"relevance": {"counts_by_value": {"0": 18665, "1": 4018, "-2": 1431, "2": 1077, "3": 138}}}}}, "clueweb09/en/trec-web-2010/diversity": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 9006, "fields": {"relevance": {"counts_by_value": {"1": 9006}}}}}, "clueweb09/en/trec-web-2011": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 19381, "fields": {"relevance": {"counts_by_value": {"0": 15205, "2": 711, "1": 2038, "-2": 1019, "3": 408}}}}}, "clueweb09/en/trec-web-2011/diversity": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 64868, "fields": {"relevance": {"counts_by_value": {"0": 53055, "2": 1828, "1": 5469, "-2": 3435, "3": 1081}}}}}, "clueweb09/en/trec-web-2012": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 16055, "fields": {"relevance": {"counts_by_value": {"-2": 858, "0": 11674, "1": 2208, "4": 858, "2": 405, "3": 52}}}}}, "clueweb09/en/trec-web-2012/diversity": {"docs": {"_ref": "clueweb09/en"}, "queries": {"count": 50}, "qrels": {"count": 62394, "fields": {"relevance": {"counts_by_value": {"-2": 3373, "0": 49653, "1": 5578, "4": 2486, "2": 1174, "3": 130}}}}}, "clueweb09/es": {"docs": {"count": 79333950, "fields": {"doc_id": 
{"max_len": 25, "common_prefix": "clueweb09-es"}}}}, "clueweb09/fr": {"docs": {"count": 50883172, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-fr"}}}}, "clueweb09/it": {"docs": {"count": 27250729, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-it"}}}}, "clueweb09/ja": {"docs": {"count": 67337717, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-ja"}}}}, "clueweb09/ko": {"docs": {"count": 18075141, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-ko000"}}}}, "clueweb09/pt": {"docs": {"count": 37578858, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-pt"}}}}, "clueweb09/trec-mq-2009": {"docs": {"_ref": "clueweb09"}, "queries": {"count": 40000}, "qrels": {"count": 34534, "fields": {"relevance": {"counts_by_value": {"0": 25586, "1": 5856, "2": 3092}}}}}, "clueweb09/zh": {"docs": {"count": 177489357, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb09-zh"}}}}, "clueweb12": {"docs": {"count": 733019372, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb12-"}}}}, "clueweb12/b13": {"docs": {"count": 52343021, "fields": {"doc_id": {"max_len": 25, "common_prefix": "clueweb12-"}}}}, "clueweb12/b13/clef-ehealth": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": {"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/clef-ehealth/cs": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": {"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/clef-ehealth/de": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": {"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/clef-ehealth/fr": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": 
{"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/clef-ehealth/hu": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": {"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/clef-ehealth/pl": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": {"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/clef-ehealth/sv": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 300}, "qrels": {"count": 269232, "fields": {"relevance": {"counts_by_value": {"0": 230658, "1": 22584, "2": 15990}}}}}, "clueweb12/b13/ntcir-www-1": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 100}, "qrels": {"count": 25465, "fields": {"relevance": {"counts_by_value": {"3": 4081, "1": 5184, "0": 9821, "2": 4709, "4": 1670}}}}}, "clueweb12/b13/ntcir-www-2": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 80}, "qrels": {"count": 27627, "fields": {"relevance": {"counts_by_value": {"0": 13305, "2": 4664, "1": 6469, "3": 2332, "4": 857}}}}}, "clueweb12/b13/ntcir-www-3": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 160}}, "clueweb12/b13/trec-misinfo-2019": {"docs": {"_ref": "clueweb12/b13"}, "queries": {"count": 51}, "qrels": {"count": 22859, "fields": {"relevance": {"counts_by_value": {"1": 3137, "0": 18694, "2": 1028}}}}}, "clueweb12/touche-2020-task-2": {"docs": {"_ref": "clueweb12"}, "queries": {"count": 50}, "qrels": {"count": 1783, "fields": {"relevance": {"counts_by_value": {"0": 961, "1": 448, "2": 374}}}}}, "clueweb12/touche-2021-task-2": {"docs": {"_ref": "clueweb12"}, "queries": {"count": 50}, "qrels": {"count": 2076, "fields": {"relevance": {"counts_by_value": {"0": 1435, "1": 377, "2": 264}}}}}, "clueweb12/touche-2022-task-2": {"docs": {"count": 868655, "fields": {"doc_id": {"max_len": 32, "common_prefix": "clueweb12-"}}}, "queries": {"count": 50}, "qrels": {"count": 2107, "fields": 
{"relevance": {"counts_by_value": {"0": 851, "1": 723, "2": 533}}}}}, "clueweb12/touche-2022-task-2/expanded-doc-t5-query": {"docs": {"count": 868655, "fields": {"doc_id": {"max_len": 32, "common_prefix": "clueweb12-"}}}, "queries": {"_ref": "clueweb12/touche-2022-task-2"}, "qrels": {"_ref": "clueweb12/touche-2022-task-2"}}, "clueweb12/trec-web-2013": {"docs": {"_ref": "clueweb12"}, "queries": {"count": 50}, "qrels": {"count": 14474, "fields": {"relevance": {"counts_by_value": {"1": 3044, "0": 10090, "2": 920, "-2": 234, "4": 7, "3": 179}}}}}, "clueweb12/trec-web-2013/diversity": {"docs": {"_ref": "clueweb12"}, "queries": {"count": 50}, "qrels": {"count": 46985, "fields": {"relevance": {"counts_by_value": {"1": 6716, "0": 37089, "2": 2081, "-2": 775, "3": 313, "4": 11}}}}}, "clueweb12/trec-web-2014": {"docs": {"_ref": "clueweb12"}, "queries": {"count": 50}, "qrels": {"count": 14432, "fields": {"relevance": {"counts_by_value": {"1": 3788, "2": 1614, "0": 8211, "-2": 556, "3": 230, "4": 33}}}}}, "clueweb12/trec-web-2014/diversity": {"docs": {"_ref": "clueweb12"}, "queries": {"count": 50}, "qrels": {"count": 43840, "fields": {"relevance": {"counts_by_value": {"1": 7358, "2": 2812, "0": 31719, "-2": 1492, "3": 424, "4": 35}}}}}, "codec": {"docs": {"count": 729824, "fields": {"doc_id": {"max_len": 32, "common_prefix": ""}}}, "queries": {"count": 42}, "qrels": {"count": 6186, "fields": {"relevance": {"counts_by_value": {"2": 1207, "0": 2353, "1": 2210, "3": 416}}}}}, "codec/economics": {"docs": {"_ref": "codec"}, "queries": {"count": 14}, "qrels": {"count": 1970, "fields": {"relevance": {"counts_by_value": {"2": 458, "0": 660, "1": 693, "3": 159}}}}}, "codec/history": {"docs": {"_ref": "codec"}, "queries": {"count": 14}, "qrels": {"count": 2024, "fields": {"relevance": {"counts_by_value": {"0": 998, "1": 618, "2": 292, "3": 116}}}}}, "codec/politics": {"docs": {"_ref": "codec"}, "queries": {"count": 14}, "qrels": {"count": 2192, "fields": {"relevance": 
{"counts_by_value": {"3": 141, "2": 457, "1": 899, "0": 695}}}}}, "codesearchnet": {"docs": {"count": 2070536, "fields": {"doc_id": {"max_len": 339, "common_prefix": "https://github.com/"}}}}, "codesearchnet/challenge": {"docs": {"_ref": "codesearchnet"}, "queries": {"count": 99}, "qrels": {"count": 4006, "fields": {"relevance": {"counts_by_value": {"0": 1314, "1": 982, "2": 863, "3": 847}}}}}, "codesearchnet/test": {"docs": {"_ref": "codesearchnet"}, "queries": {"count": 100529}, "qrels": {"count": 100529, "fields": {"relevance": {"counts_by_value": {"1": 100529}}}}}, "codesearchnet/train": {"docs": {"_ref": "codesearchnet"}, "queries": {"count": 1880853}, "qrels": {"count": 1880853, "fields": {"relevance": {"counts_by_value": {"1": 1880853}}}}}, "codesearchnet/valid": {"docs": {"_ref": "codesearchnet"}, "queries": {"count": 89154}, "qrels": {"count": 89154, "fields": {"relevance": {"counts_by_value": {"1": 89154}}}}}, "cord19": {"docs": {"count": 192509, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "cord19/fulltext": {"docs": {"count": 192509, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "cord19/fulltext/trec-covid": {"docs": {"_ref": "cord19/fulltext"}, "queries": {"count": 50}, "qrels": {"count": 69318, "fields": {"relevance": {"counts_by_value": {"2": 15609, "1": 11055, "0": 42652, "-1": 2}}}}}, "cord19/trec-covid": {"docs": {"_ref": "cord19"}, "queries": {"count": 50}, "qrels": {"count": 69318, "fields": {"relevance": {"counts_by_value": {"2": 15609, "1": 11055, "0": 42652, "-1": 2}}}}}, "cord19/trec-covid/round1": {"docs": {"count": 51078, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 30}, "qrels": {"count": 8691, "fields": {"relevance": {"counts_by_value": {"2": 1237, "1": 1115, "0": 6339}}}}}, "cord19/trec-covid/round2": {"docs": {"count": 59887, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 35}, "qrels": {"count": 12037, "fields": {"relevance": 
{"counts_by_value": {"0": 9035, "1": 1410, "2": 1592}}}}}, "cord19/trec-covid/round3": {"docs": {"count": 128492, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 40}, "qrels": {"count": 12713, "fields": {"relevance": {"counts_by_value": {"1": 2089, "0": 8015, "2": 2609}}}}}, "cord19/trec-covid/round4": {"docs": {"count": 158274, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 45}, "qrels": {"count": 13262, "fields": {"relevance": {"counts_by_value": {"1": 2279, "0": 7438, "2": 3545}}}}}, "cord19/trec-covid/round5": {"docs": {"_ref": "cord19"}, "queries": {"_ref": "cord19/trec-covid"}, "qrels": {"count": 23151, "fields": {"relevance": {"counts_by_value": {"2": 6677, "1": 4233, "0": 12239, "-1": 2}}}}}, "cranfield": {"docs": {"count": 1400, "fields": {"doc_id": {"max_len": 4, "common_prefix": ""}}}, "queries": {"count": 225}, "qrels": {"count": 1837, "fields": {"relevance": {"counts_by_value": {"2": 387, "3": 734, "4": 363, "-1": 225, "1": 128}}}}}, "csl": {"docs": {"count": 395927, "fields": {"doc_id": {"max_len": 10, "common_prefix": "csl-"}}}}, "csl/trec-2023": {"docs": {"_ref": "csl"}, "queries": {"count": 41}, "qrels": {"count": 11291, "fields": {"relevance": {"counts_by_value": {"0": 10644, "1": 419, "3": 228}}}}}, "disks45": {}, "disks45/nocr": {"docs": {"count": 528155, "fields": {"doc_id": {"max_len": 16, "common_prefix": ""}}}}, "disks45/nocr/trec-robust-2004": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 250}, "qrels": {"count": 311410, "fields": {"relevance": {"counts_by_value": {"1": 16381, "0": 293998, "2": 1031}}}}}, "disks45/nocr/trec-robust-2004/fold1": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 62789, "fields": {"relevance": {"counts_by_value": {"0": 59765, "1": 2795, "2": 229}}}}}, "disks45/nocr/trec-robust-2004/fold2": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 63917, "fields": {"relevance": 
{"counts_by_value": {"1": 3334, "0": 60246, "2": 337}}}}}, "disks45/nocr/trec-robust-2004/fold3": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 62901, "fields": {"relevance": {"counts_by_value": {"0": 58859, "1": 3877, "2": 165}}}}}, "disks45/nocr/trec-robust-2004/fold4": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 57962, "fields": {"relevance": {"counts_by_value": {"0": 55103, "1": 2707, "2": 152}}}}}, "disks45/nocr/trec-robust-2004/fold5": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 63841, "fields": {"relevance": {"counts_by_value": {"0": 60025, "1": 3668, "2": 148}}}}}, "disks45/nocr/trec7": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 80345, "fields": {"relevance": {"counts_by_value": {"0": 75671, "1": 4674}}}}}, "disks45/nocr/trec8": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 86830, "fields": {"relevance": {"counts_by_value": {"0": 82102, "1": 4728}}}}}, "dpr-w100": {"docs": {"count": 21015324, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "dpr-w100/natural-questions/dev": {"docs": {"_ref": "dpr-w100"}, "queries": {"count": 6515}, "qrels": {"count": 979893, "fields": {"relevance": {"counts_by_value": {"2": 6515, "1": 44736, "0": 602894, "-1": 325748}}}}}, "dpr-w100/natural-questions/train": {"docs": {"_ref": "dpr-w100"}, "queries": {"count": 58880}, "qrels": {"count": 8856662, "fields": {"relevance": {"counts_by_value": {"2": 58880, "1": 405729, "0": 5448064, "-1": 2943989}}}}}, "dpr-w100/trivia-qa/dev": {"docs": {"_ref": "dpr-w100"}, "queries": {"count": 8837}, "qrels": {"count": 883700, "fields": {"relevance": {"counts_by_value": {"1": 82658, "0": 801042}}}}}, "dpr-w100/trivia-qa/train": {"docs": {"_ref": "dpr-w100"}, "queries": {"count": 78785}, "qrels": {"count": 7878500, "fields": {"relevance": {"counts_by_value": {"0": 7137064, "1": 741436}}}}}, "gov": {"docs": 
{"count": 1247753, "fields": {"doc_id": {"max_len": 14, "common_prefix": "G"}}}}, "gov/trec-web-2002": {"docs": {"_ref": "gov"}, "queries": {"count": 50}, "qrels": {"count": 56650, "fields": {"relevance": {"counts_by_value": {"0": 55076, "1": 1574}}}}}, "gov/trec-web-2002/named-page": {"docs": {"_ref": "gov"}, "queries": {"count": 150}, "qrels": {"count": 170, "fields": {"relevance": {"counts_by_value": {"1": 170}}}}}, "gov/trec-web-2003": {"docs": {"_ref": "gov"}, "queries": {"count": 50}, "qrels": {"count": 51062, "fields": {"relevance": {"counts_by_value": {"0": 50546, "1": 516}}}}}, "gov/trec-web-2003/named-page": {"docs": {"_ref": "gov"}, "queries": {"count": 300}, "qrels": {"count": 352, "fields": {"relevance": {"counts_by_value": {"1": 352}}}}}, "gov/trec-web-2004": {"docs": {"_ref": "gov"}, "queries": {"count": 225}, "qrels": {"count": 88566, "fields": {"relevance": {"counts_by_value": {"0": 86803, "1": 1763}}}}}, "gov2": {"docs": {"count": 25205179, "fields": {"doc_id": {"max_len": 17, "common_prefix": "GX"}}}}, "gov2/trec-mq-2007": {"docs": {"_ref": "gov2"}, "queries": {"count": 10000}, "qrels": {"count": 73015, "fields": {"relevance": {"counts_by_value": {"0": 54333, "1": 14689, "2": 3993}}}}}, "gov2/trec-mq-2008": {"docs": {"_ref": "gov2"}, "queries": {"count": 10000}, "qrels": {"count": 15211, "fields": {"relevance": {"counts_by_value": {"0": 12279, "1": 2932}}}}}, "gov2/trec-tb-2004": {"docs": {"_ref": "gov2"}, "queries": {"count": 50}, "qrels": {"count": 58077, "fields": {"relevance": {"counts_by_value": {"0": 47460, "1": 9327, "2": 1290}}}}}, "gov2/trec-tb-2005": {"docs": {"_ref": "gov2"}, "queries": {"count": 50}, "qrels": {"count": 45291, "fields": {"relevance": {"counts_by_value": {"0": 34884, "1": 7772, "2": 2635}}}}}, "gov2/trec-tb-2005/efficiency": {"docs": {"_ref": "gov2"}, "queries": {"count": 50000}, "qrels": {"count": 45291, "fields": {"relevance": {"counts_by_value": {"0": 34884, "1": 7772, "2": 2635}}}}}, "gov2/trec-tb-2005/named-page": 
{"docs": {"_ref": "gov2"}, "queries": {"count": 252}, "qrels": {"count": 11729, "fields": {"relevance": {"counts_by_value": {"1": 11729}}}}}, "gov2/trec-tb-2006": {"docs": {"_ref": "gov2"}, "queries": {"count": 50}, "qrels": {"count": 31984, "fields": {"relevance": {"counts_by_value": {"0": 26091, "1": 5467, "2": 426}}}}}, "gov2/trec-tb-2006/efficiency": {"docs": {"_ref": "gov2"}, "queries": {"count": 100000}, "qrels": {"count": 31984, "fields": {"relevance": {"counts_by_value": {"0": 26091, "1": 5467, "2": 426}}}}}, "gov2/trec-tb-2006/efficiency/10k": {"docs": {"_ref": "gov2"}, "queries": {"count": 10000}}, "gov2/trec-tb-2006/efficiency/stream1": {"docs": {"_ref": "gov2"}, "queries": {"count": 25000}}, "gov2/trec-tb-2006/efficiency/stream2": {"docs": {"_ref": "gov2"}, "queries": {"count": 25000}}, "gov2/trec-tb-2006/efficiency/stream3": {"docs": {"_ref": "gov2"}, "queries": {"count": 25000}, "qrels": {"count": 31984, "fields": {"relevance": {"counts_by_value": {"0": 26091, "1": 5467, "2": 426}}}}}, "gov2/trec-tb-2006/efficiency/stream4": {"docs": {"_ref": "gov2"}, "queries": {"count": 25000}}, "gov2/trec-tb-2006/named-page": {"docs": {"_ref": "gov2"}, "queries": {"count": 181}, "qrels": {"count": 2361, "fields": {"relevance": {"counts_by_value": {"1": 807, "0": 1554}}}}}, "hc4": {}, "hc4/fa": {"docs": {"count": 486486, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "hc4/fa/dev": {"docs": {"_ref": "hc4/fa"}, "queries": {"count": 10}, "qrels": {"count": 565, "fields": {"relevance": {"counts_by_value": {"0": 456, "3": 63, "1": 46}}}}}, "hc4/fa/test": {"docs": {"_ref": "hc4/fa"}, "queries": {"count": 50}, "qrels": {"count": 2522, "fields": {"relevance": {"counts_by_value": {"0": 2101, "1": 215, "3": 206}}}}}, "hc4/fa/train": {"docs": {"_ref": "hc4/fa"}, "queries": {"count": 8}, "qrels": {"count": 112, "fields": {"relevance": {"counts_by_value": {"1": 23, "3": 22, "0": 67}}}}}, "hc4/ru": {"docs": {"count": 4721064, "fields": {"doc_id": {"max_len": 36, 
"common_prefix": ""}}}}, "hc4/ru/dev": {"docs": {"_ref": "hc4/ru"}, "queries": {"count": 4}, "qrels": {"count": 265, "fields": {"relevance": {"counts_by_value": {"0": 186, "1": 67, "3": 12}}}}}, "hc4/ru/test": {"docs": {"_ref": "hc4/ru"}, "queries": {"count": 50}, "qrels": {"count": 2970, "fields": {"relevance": {"counts_by_value": {"0": 2297, "1": 411, "3": 262}}}}}, "hc4/ru/train": {"docs": {"_ref": "hc4/ru"}, "queries": {"count": 7}, "qrels": {"count": 92, "fields": {"relevance": {"counts_by_value": {"1": 31, "3": 23, "0": 38}}}}}, "hc4/zh": {"docs": {"count": 646305, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "hc4/zh/dev": {"docs": {"_ref": "hc4/zh"}, "queries": {"count": 10}, "qrels": {"count": 466, "fields": {"relevance": {"counts_by_value": {"0": 374, "3": 62, "1": 30}}}}}, "hc4/zh/test": {"docs": {"_ref": "hc4/zh"}, "queries": {"count": 50}, "qrels": {"count": 2751, "fields": {"relevance": {"counts_by_value": {"0": 2277, "3": 282, "1": 192}}}}}, "hc4/zh/train": {"docs": {"_ref": "hc4/zh"}, "queries": {"count": 23}, "qrels": {"count": 341, "fields": {"relevance": {"counts_by_value": {"0": 173, "1": 140, "3": 28}}}}}, "highwire": {"docs": {"count": 162259, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "highwire/trec-genomics-2006": {"docs": {"_ref": "highwire"}, "queries": {"count": 28}, "qrels": {"count": 27999, "fields": {"relevance": {"counts_by_value": {"0": 24934, "1": 1237, "2": 1828}}}}}, "highwire/trec-genomics-2007": {"docs": {"_ref": "highwire"}, "queries": {"count": 36}, "qrels": {"count": 35996, "fields": {"relevance": {"counts_by_value": {"0": 31501, "1": 4495}}}}}, "istella22": {"docs": {"count": 8421456, "fields": {"doc_id": {"max_len": 16, "common_prefix": "1990"}}}}, "istella22/test": {"docs": {"_ref": "istella22"}, "qrels": {"count": 10693, "fields": {"relevance": {"counts_by_value": {"3": 2573, "4": 1040, "1": 6070, "2": 1010}}}}, "queries": {"count": 2198}}, "istella22/test/fold1": {"docs": {"_ref": 
"istella22"}, "queries": {"count": 440}, "qrels": {"count": 2164, "fields": {"relevance": {"counts_by_value": {"4": 194, "1": 1209, "3": 560, "2": 201}}}}}, "istella22/test/fold2": {"docs": {"_ref": "istella22"}, "queries": {"count": 440}, "qrels": {"count": 2140, "fields": {"relevance": {"counts_by_value": {"3": 493, "1": 1251, "4": 200, "2": 196}}}}}, "istella22/test/fold3": {"docs": {"_ref": "istella22"}, "queries": {"count": 440}, "qrels": {"count": 2197, "fields": {"relevance": {"counts_by_value": {"3": 532, "1": 1242, "4": 207, "2": 216}}}}}, "istella22/test/fold4": {"docs": {"_ref": "istella22"}, "queries": {"count": 439}, "qrels": {"count": 2098, "fields": {"relevance": {"counts_by_value": {"1": 1178, "4": 216, "3": 512, "2": 192}}}}}, "istella22/test/fold5": {"docs": {"_ref": "istella22"}, "queries": {"count": 439}, "qrels": {"count": 2094, "fields": {"relevance": {"counts_by_value": {"3": 476, "1": 1190, "4": 223, "2": 205}}}}}, "kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "kilt/codec": {"docs": {"_ref": "kilt"}, "queries": {"count": 42}, "qrels": {"count": 11323, "fields": {"relevance": {"counts_by_value": {"0": 7053, "2": 1252, "3": 777, "1": 2241}}}}}, "kilt/codec/economics": {"docs": {"_ref": "kilt"}, "queries": {"count": 14}, "qrels": {"count": 1970, "fields": {"relevance": {"counts_by_value": {"2": 458, "0": 660, "1": 693, "3": 159}}}}}, "kilt/codec/history": {"docs": {"_ref": "kilt"}, "queries": {"count": 14}, "qrels": {"count": 2024, "fields": {"relevance": {"counts_by_value": {"0": 998, "1": 618, "2": 292, "3": 116}}}}}, "kilt/codec/politics": {"docs": {"_ref": "kilt"}, "queries": {"count": 14}, "qrels": {"count": 2192, "fields": {"relevance": {"counts_by_value": {"3": 141, "2": 457, "1": 899, "0": 695}}}}}, "lotte": {}, "lotte/lifestyle/dev": {"docs": {"count": 268893, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/lifestyle/dev/forum": {"docs": {"_ref": 
"lotte/lifestyle/dev"}, "queries": {"count": 2076}, "qrels": {"count": 12823, "fields": {"relevance": {"counts_by_value": {"1": 12823}}}}}, "lotte/lifestyle/dev/search": {"docs": {"_ref": "lotte/lifestyle/dev"}, "queries": {"count": 417}, "qrels": {"count": 1376, "fields": {"relevance": {"counts_by_value": {"1": 1376}}}}}, "lotte/lifestyle/test": {"docs": {"count": 119461, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/lifestyle/test/forum": {"docs": {"_ref": "lotte/lifestyle/test"}, "queries": {"count": 2002}, "qrels": {"count": 10278, "fields": {"relevance": {"counts_by_value": {"1": 10278}}}}}, "lotte/lifestyle/test/search": {"docs": {"_ref": "lotte/lifestyle/test"}, "queries": {"count": 661}, "qrels": {"count": 1804, "fields": {"relevance": {"counts_by_value": {"1": 1804}}}}}, "lotte/pooled/dev": {"docs": {"count": 2428854, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "lotte/pooled/dev/forum": {"docs": {"_ref": "lotte/pooled/dev"}, "queries": {"count": 10097}, "qrels": {"count": 68685, "fields": {"relevance": {"counts_by_value": {"1": 68685}}}}}, "lotte/pooled/dev/search": {"docs": {"_ref": "lotte/pooled/dev"}, "queries": {"count": 2931}, "qrels": {"count": 8573, "fields": {"relevance": {"counts_by_value": {"1": 8573}}}}}, "lotte/pooled/test": {"docs": {"count": 2819103, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "lotte/pooled/test/forum": {"docs": {"_ref": "lotte/pooled/test"}, "queries": {"count": 10025}, "qrels": {"count": 61536, "fields": {"relevance": {"counts_by_value": {"1": 61536}}}}}, "lotte/pooled/test/search": {"docs": {"_ref": "lotte/pooled/test"}, "queries": {"count": 3869}, "qrels": {"count": 11124, "fields": {"relevance": {"counts_by_value": {"1": 11124}}}}}, "lotte/recreation/dev": {"docs": {"count": 263025, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/recreation/dev/forum": {"docs": {"_ref": "lotte/recreation/dev"}, "queries": {"count": 2002}, "qrels": {"count": 
12752, "fields": {"relevance": {"counts_by_value": {"1": 12752}}}}}, "lotte/recreation/dev/search": {"docs": {"_ref": "lotte/recreation/dev"}, "queries": {"count": 563}, "qrels": {"count": 1754, "fields": {"relevance": {"counts_by_value": {"1": 1754}}}}}, "lotte/recreation/test": {"docs": {"count": 166975, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/recreation/test/forum": {"docs": {"_ref": "lotte/recreation/test"}, "queries": {"count": 2002}, "qrels": {"count": 6947, "fields": {"relevance": {"counts_by_value": {"1": 6947}}}}}, "lotte/recreation/test/search": {"docs": {"_ref": "lotte/recreation/test"}, "queries": {"count": 924}, "qrels": {"count": 1991, "fields": {"relevance": {"counts_by_value": {"1": 1991}}}}}, "lotte/science/dev": {"docs": {"count": 343642, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/science/dev/forum": {"docs": {"_ref": "lotte/science/dev"}, "queries": {"count": 2013}, "qrels": {"count": 12271, "fields": {"relevance": {"counts_by_value": {"1": 12271}}}}}, "lotte/science/dev/search": {"docs": {"_ref": "lotte/science/dev"}, "queries": {"count": 538}, "qrels": {"count": 1480, "fields": {"relevance": {"counts_by_value": {"1": 1480}}}}}, "lotte/science/test": {"docs": {"count": 1694164, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "lotte/science/test/forum": {"docs": {"_ref": "lotte/science/test"}, "queries": {"count": 2017}, "qrels": {"count": 15515, "fields": {"relevance": {"counts_by_value": {"1": 15515}}}}}, "lotte/science/test/search": {"docs": {"_ref": "lotte/science/test"}, "queries": {"count": 617}, "qrels": {"count": 1738, "fields": {"relevance": {"counts_by_value": {"1": 1738}}}}}, "lotte/technology/dev": {"docs": {"count": 1276222, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "lotte/technology/dev/forum": {"docs": {"_ref": "lotte/technology/dev"}, "queries": {"count": 2003}, "qrels": {"count": 15741, "fields": {"relevance": {"counts_by_value": {"1": 
15741}}}}}, "lotte/technology/dev/search": {"docs": {"_ref": "lotte/technology/dev"}, "queries": {"count": 916}, "qrels": {"count": 2676, "fields": {"relevance": {"counts_by_value": {"1": 2676}}}}}, "lotte/technology/test": {"docs": {"count": 638509, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/technology/test/forum": {"docs": {"_ref": "lotte/technology/test"}, "queries": {"count": 2004}, "qrels": {"count": 15890, "fields": {"relevance": {"counts_by_value": {"1": 15890}}}}}, "lotte/technology/test/search": {"docs": {"_ref": "lotte/technology/test"}, "queries": {"count": 596}, "qrels": {"count": 2045, "fields": {"relevance": {"counts_by_value": {"1": 2045}}}}}, "lotte/writing/dev": {"docs": {"count": 277072, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/writing/dev/forum": {"docs": {"_ref": "lotte/writing/dev"}, "queries": {"count": 2003}, "qrels": {"count": 15098, "fields": {"relevance": {"counts_by_value": {"1": 15098}}}}}, "lotte/writing/dev/search": {"docs": {"_ref": "lotte/writing/dev"}, "queries": {"count": 497}, "qrels": {"count": 1287, "fields": {"relevance": {"counts_by_value": {"1": 1287}}}}}, "lotte/writing/test": {"docs": {"count": 199994, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "lotte/writing/test/forum": {"docs": {"_ref": "lotte/writing/test"}, "queries": {"count": 2000}, "qrels": {"count": 12906, "fields": {"relevance": {"counts_by_value": {"1": 12906}}}}}, "lotte/writing/test/search": {"docs": {"_ref": "lotte/writing/test"}, "queries": {"count": 1071}, "qrels": {"count": 3546, "fields": {"relevance": {"counts_by_value": {"1": 3546}}}}}, "medline": {}, "medline/2004": {"docs": {"count": 3672808, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "medline/2004/trec-genomics-2004": {"docs": {"_ref": "medline/2004"}, "queries": {"count": 50}, "qrels": {"count": 8268, "fields": {"relevance": {"counts_by_value": {"2": 3639, "1": 4629}}}}}, "medline/2004/trec-genomics-2005": 
{"docs": {"_ref": "medline/2004"}, "queries": {"count": 50}, "qrels": {"count": 39958, "fields": {"relevance": {"counts_by_value": {"0": 35374, "2": 2525, "1": 2059}}}}}, "medline/2017": {"docs": {"count": 26740025, "fields": {"doc_id": {"max_len": 15, "common_prefix": ""}}}}, "medline/2017/trec-pm-2017": {"docs": {"_ref": "medline/2017"}, "queries": {"count": 30}, "qrels": {"count": 22642, "fields": {"relevance": {"counts_by_value": {"0": 18767, "1": 1853, "2": 2022}}}}}, "medline/2017/trec-pm-2018": {"docs": {"_ref": "medline/2017"}, "queries": {"count": 50}, "qrels": {"count": 22429, "fields": {"relevance": {"counts_by_value": {"0": 16841, "2": 3442, "1": 2146}}}}}, "miracl": {}, "miracl/ar": {"docs": {"count": 2061414, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "miracl/ar/dev": {"docs": {"_ref": "miracl/ar"}, "queries": {"count": 2896}, "qrels": {"count": 29197, "fields": {"relevance": {"counts_by_value": {"1": 5658, "0": 23539}}}}}, "miracl/ar/test-a": {"docs": {"_ref": "miracl/ar"}, "queries": {"count": 936}}, "miracl/ar/test-b": {"docs": {"_ref": "miracl/ar"}, "queries": {"count": 1405}}, "miracl/ar/train": {"docs": {"_ref": "miracl/ar"}, "queries": {"count": 3495}, "qrels": {"count": 25382, "fields": {"relevance": {"counts_by_value": {"1": 6217, "0": 19165}}}}}, "miracl/bn": {"docs": {"count": 297265, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}}, "miracl/bn/dev": {"docs": {"_ref": "miracl/bn"}, "queries": {"count": 411}, "qrels": {"count": 4206, "fields": {"relevance": {"counts_by_value": {"1": 863, "0": 3343}}}}}, "miracl/bn/test-a": {"docs": {"_ref": "miracl/bn"}, "queries": {"count": 102}}, "miracl/bn/test-b": {"docs": {"_ref": "miracl/bn"}, "queries": {"count": 1130}}, "miracl/bn/train": {"docs": {"_ref": "miracl/bn"}, "queries": {"count": 1631}, "qrels": {"count": 16754, "fields": {"relevance": {"counts_by_value": {"1": 3859, "0": 12895}}}}}, "miracl/de": {"docs": {"count": 15866222, "fields": {"doc_id": {"max_len": 
12, "common_prefix": ""}}}}, "miracl/de/dev": {"docs": {"_ref": "miracl/de"}, "queries": {"count": 305}, "qrels": {"count": 3144, "fields": {"relevance": {"counts_by_value": {"1": 811, "0": 2333}}}}}, "miracl/de/test-b": {"docs": {"_ref": "miracl/de"}, "queries": {"count": 712}}, "miracl/en": {"docs": {"count": 32893221, "fields": {"doc_id": {"max_len": 13, "common_prefix": ""}}}}, "miracl/en/dev": {"docs": {"_ref": "miracl/en"}, "queries": {"count": 799}, "qrels": {"count": 8350, "fields": {"relevance": {"counts_by_value": {"1": 2326, "0": 6024}}}}}, "miracl/en/test-a": {"docs": {"_ref": "miracl/en"}, "queries": {"count": 734}}, "miracl/en/test-b": {"docs": {"_ref": "miracl/en"}, "queries": {"count": 1790}}, "miracl/en/train": {"docs": {"_ref": "miracl/en"}, "queries": {"count": 2863}, "qrels": {"count": 29416, "fields": {"relevance": {"counts_by_value": {"1": 7899, "0": 21517}}}}}, "miracl/es": {"docs": {"count": 10373953, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}}, "miracl/es/dev": {"docs": {"_ref": "miracl/es"}, "queries": {"count": 648}, "qrels": {"count": 6443, "fields": {"relevance": {"counts_by_value": {"1": 2987, "0": 3456}}}}}, "miracl/es/test-b": {"docs": {"_ref": "miracl/es"}, "queries": {"count": 1515}}, "miracl/es/train": {"docs": {"_ref": "miracl/es"}, "queries": {"count": 2162}, "qrels": {"count": 21531, "fields": {"relevance": {"counts_by_value": {"1": 10025, "0": 11506}}}}}, "miracl/fa": {"docs": {"count": 2207172, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "miracl/fa/dev": {"docs": {"_ref": "miracl/fa"}, "queries": {"count": 632}, "qrels": {"count": 6571, "fields": {"relevance": {"counts_by_value": {"1": 1314, "0": 5257}}}}}, "miracl/fa/test-b": {"docs": {"_ref": "miracl/fa"}, "queries": {"count": 1476}}, "miracl/fa/train": {"docs": {"_ref": "miracl/fa"}, "queries": {"count": 2107}, "qrels": {"count": 21844, "fields": {"relevance": {"counts_by_value": {"1": 4277, "0": 17567}}}}}, "miracl/fi": {"docs": 
{"count": 1883509, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "miracl/fi/dev": {"docs": {"_ref": "miracl/fi"}, "queries": {"count": 1271}, "qrels": {"count": 12008, "fields": {"relevance": {"counts_by_value": {"1": 2447, "0": 9561}}}}}, "miracl/fi/test-a": {"docs": {"_ref": "miracl/fi"}, "queries": {"count": 1060}}, "miracl/fi/test-b": {"docs": {"_ref": "miracl/fi"}, "queries": {"count": 711}}, "miracl/fi/train": {"docs": {"_ref": "miracl/fi"}, "queries": {"count": 2897}, "qrels": {"count": 20350, "fields": {"relevance": {"counts_by_value": {"1": 4928, "0": 15422}}}}}, "miracl/fr": {"docs": {"count": 14636953, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}}, "miracl/fr/dev": {"docs": {"_ref": "miracl/fr"}, "queries": {"count": 343}, "qrels": {"count": 3429, "fields": {"relevance": {"counts_by_value": {"1": 731, "0": 2698}}}}}, "miracl/fr/test-b": {"docs": {"_ref": "miracl/fr"}, "queries": {"count": 801}}, "miracl/fr/train": {"docs": {"_ref": "miracl/fr"}, "queries": {"count": 1143}, "qrels": {"count": 11426, "fields": {"relevance": {"counts_by_value": {"1": 2321, "0": 9105}}}}}, "miracl/hi": {"docs": {"count": 506264, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "miracl/hi/dev": {"docs": {"_ref": "miracl/hi"}, "queries": {"count": 350}, "qrels": {"count": 3494, "fields": {"relevance": {"counts_by_value": {"1": 752, "0": 2742}}}}}, "miracl/hi/test-b": {"docs": {"_ref": "miracl/hi"}, "queries": {"count": 819}}, "miracl/hi/train": {"docs": {"_ref": "miracl/hi"}, "queries": {"count": 1169}, "qrels": {"count": 11668, "fields": {"relevance": {"counts_by_value": {"1": 2469, "0": 9199}}}}}, "miracl/id": {"docs": {"count": 1446315, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "miracl/id/dev": {"docs": {"_ref": "miracl/id"}, "queries": {"count": 960}, "qrels": {"count": 9668, "fields": {"relevance": {"counts_by_value": {"1": 3088, "0": 6580}}}}}, "miracl/id/test-a": {"docs": {"_ref": "miracl/id"}, "queries": 
{"count": 731}}, "miracl/id/test-b": {"docs": {"_ref": "miracl/id"}, "queries": {"count": 611}}, "miracl/id/train": {"docs": {"_ref": "miracl/id"}, "queries": {"count": 4071}, "qrels": {"count": 41358, "fields": {"relevance": {"counts_by_value": {"1": 12505, "0": 28853}}}}}, "miracl/ja": {"docs": {"count": 6953614, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}}, "miracl/ja/dev": {"docs": {"_ref": "miracl/ja"}, "queries": {"count": 860}, "qrels": {"count": 8354, "fields": {"relevance": {"counts_by_value": {"1": 1790, "0": 6564}}}}}, "miracl/ja/test-a": {"docs": {"_ref": "miracl/ja"}, "queries": {"count": 650}}, "miracl/ja/test-b": {"docs": {"_ref": "miracl/ja"}, "queries": {"count": 1141}}, "miracl/ja/train": {"docs": {"_ref": "miracl/ja"}, "queries": {"count": 3477}, "qrels": {"count": 34387, "fields": {"relevance": {"counts_by_value": {"1": 6984, "0": 27403}}}}}, "miracl/ko": {"docs": {"count": 1486752, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}}, "miracl/ko/dev": {"docs": {"_ref": "miracl/ko"}, "queries": {"count": 213}, "qrels": {"count": 3057, "fields": {"relevance": {"counts_by_value": {"1": 547, "0": 2510}}}}}, "miracl/ko/test-a": {"docs": {"_ref": "miracl/ko"}, "queries": {"count": 263}}, "miracl/ko/test-b": {"docs": {"_ref": "miracl/ko"}, "queries": {"count": 1417}}, "miracl/ko/train": {"docs": {"_ref": "miracl/ko"}, "queries": {"count": 868}, "qrels": {"count": 12767, "fields": {"relevance": {"counts_by_value": {"1": 1973, "0": 10794}}}}}, "miracl/ru": {"docs": {"count": 9543918, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}}, "miracl/ru/dev": {"docs": {"_ref": "miracl/ru"}, "queries": {"count": 1252}, "qrels": {"count": 13100, "fields": {"relevance": {"counts_by_value": {"1": 3560, "0": 9540}}}}}, "miracl/ru/test-a": {"docs": {"_ref": "miracl/ru"}, "queries": {"count": 911}}, "miracl/ru/test-b": {"docs": {"_ref": "miracl/ru"}, "queries": {"count": 718}}, "miracl/ru/train": {"docs": {"_ref": "miracl/ru"}, 
"queries": {"count": 4683}, "qrels": {"count": 33921, "fields": {"relevance": {"counts_by_value": {"1": 10000, "0": 23921}}}}}, "miracl/sw": {"docs": {"count": 131924, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}}, "miracl/sw/dev": {"docs": {"_ref": "miracl/sw"}, "queries": {"count": 482}, "qrels": {"count": 5092, "fields": {"relevance": {"counts_by_value": {"1": 910, "0": 4182}}}}}, "miracl/sw/test-a": {"docs": {"_ref": "miracl/sw"}, "queries": {"count": 638}}, "miracl/sw/test-b": {"docs": {"_ref": "miracl/sw"}, "queries": {"count": 465}}, "miracl/sw/train": {"docs": {"_ref": "miracl/sw"}, "queries": {"count": 1901}, "qrels": {"count": 9359, "fields": {"relevance": {"counts_by_value": {"1": 2687, "0": 6672}}}}}, "miracl/te": {"docs": {"count": 518079, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}}, "miracl/te/dev": {"docs": {"_ref": "miracl/te"}, "queries": {"count": 828}, "qrels": {"count": 1606, "fields": {"relevance": {"counts_by_value": {"1": 854, "0": 752}}}}}, "miracl/te/test-a": {"docs": {"_ref": "miracl/te"}, "queries": {"count": 594}}, "miracl/te/test-b": {"docs": {"_ref": "miracl/te"}, "queries": {"count": 793}}, "miracl/te/train": {"docs": {"_ref": "miracl/te"}, "queries": {"count": 3452}, "qrels": {"count": 18608, "fields": {"relevance": {"counts_by_value": {"1": 4119, "0": 14489}}}}}, "miracl/th": {"docs": {"count": 542166, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}}, "miracl/th/dev": {"docs": {"_ref": "miracl/th"}, "queries": {"count": 733}, "qrels": {"count": 7573, "fields": {"relevance": {"counts_by_value": {"1": 1343, "0": 6230}}}}}, "miracl/th/test-a": {"docs": {"_ref": "miracl/th"}, "queries": {"count": 992}}, "miracl/th/test-b": {"docs": {"_ref": "miracl/th"}, "queries": {"count": 650}}, "miracl/th/train": {"docs": {"_ref": "miracl/th"}, "queries": {"count": 2972}, "qrels": {"count": 21293, "fields": {"relevance": {"counts_by_value": {"1": 4778, "0": 16515}}}}}, "miracl/yo": {"docs": {"count": 49043, 
"fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}}, "miracl/yo/dev": {"docs": {"_ref": "miracl/yo"}, "queries": {"count": 119}, "qrels": {"count": 1188, "fields": {"relevance": {"counts_by_value": {"1": 144, "0": 1044}}}}}, "miracl/yo/test-b": {"docs": {"_ref": "miracl/yo"}, "queries": {"count": 288}}, "miracl/zh": {"docs": {"count": 4934368, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}}, "miracl/zh/dev": {"docs": {"_ref": "miracl/zh"}, "queries": {"count": 393}, "qrels": {"count": 3928, "fields": {"relevance": {"counts_by_value": {"1": 994, "0": 2934}}}}}, "miracl/zh/test-b": {"docs": {"_ref": "miracl/zh"}, "queries": {"count": 920}}, "miracl/zh/train": {"docs": {"_ref": "miracl/zh"}, "queries": {"count": 1312}, "qrels": {"count": 13113, "fields": {"relevance": {"counts_by_value": {"1": 3187, "0": 9926}}}}}, "mmarco": {}, "mmarco/de": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/de/dev": {"docs": {"_ref": "mmarco/de"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/de/dev/small": {"docs": {"_ref": "mmarco/de"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6594126}}, "mmarco/de/train": {"docs": {"_ref": "mmarco/de"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/es": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/es/dev": {"docs": {"_ref": "mmarco/es"}, "queries": {"count": 101092}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/es/dev/small": {"docs": {"_ref": "mmarco/es"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 
6786720}}, "mmarco/es/train": {"docs": {"_ref": "mmarco/es"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/fr": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/fr/dev": {"docs": {"_ref": "mmarco/fr"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/fr/dev/small": {"docs": {"_ref": "mmarco/fr"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6785763}}, "mmarco/fr/train": {"docs": {"_ref": "mmarco/fr"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/id": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/id/dev": {"docs": {"_ref": "mmarco/id"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/id/dev/small": {"docs": {"_ref": "mmarco/id"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6841990}}, "mmarco/id/train": {"docs": {"_ref": "mmarco/id"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/it": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/it/dev": {"docs": {"_ref": "mmarco/it"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/it/dev/small": {"docs": {"_ref": "mmarco/it"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 
7437}}}}, "scoreddocs": {"count": 6966491}}, "mmarco/it/train": {"docs": {"_ref": "mmarco/it"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/pt": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/pt/dev": {"docs": {"_ref": "mmarco/pt"}, "queries": {"count": 101619}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/pt/dev/small": {"docs": {"_ref": "mmarco/pt"}, "queries": {"count": 7000}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}}, "mmarco/pt/dev/small/v1.1": {"docs": {"_ref": "mmarco/pt"}, "queries": {"count": 6980}, "qrels": {"_ref": "mmarco/pt/dev/small"}, "scoreddocs": {"count": 6976324}}, "mmarco/pt/dev/v1.1": {"docs": {"_ref": "mmarco/pt"}, "queries": {"count": 101093}, "qrels": {"_ref": "mmarco/pt/dev"}}, "mmarco/pt/train": {"docs": {"_ref": "mmarco/pt"}, "queries": {"count": 811690}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/pt/train/v1.1": {"docs": {"_ref": "mmarco/pt"}, "queries": {"count": 808731}, "qrels": {"_ref": "mmarco/pt/train"}, "docpairs": {"count": 39780811}}, "mmarco/ru": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/ru/dev": {"docs": {"_ref": "mmarco/ru"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/ru/dev/small": {"docs": {"_ref": "mmarco/ru"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6958739}}, "mmarco/ru/train": {"docs": {"_ref": "mmarco/ru"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 
39780811}}, "mmarco/v2/ar": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/ar/dev": {"docs": {"_ref": "mmarco/v2/ar"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/ar/dev/small": {"docs": {"_ref": "mmarco/v2/ar"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6848687}}, "mmarco/v2/ar/train": {"docs": {"_ref": "mmarco/v2/ar"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/de": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/de/dev": {"docs": {"_ref": "mmarco/v2/de"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/de/dev/small": {"docs": {"_ref": "mmarco/v2/de"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6586918}}, "mmarco/v2/de/train": {"docs": {"_ref": "mmarco/v2/de"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/dt": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/dt/dev": {"docs": {"_ref": "mmarco/v2/dt"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/dt/dev/small": {"docs": {"_ref": "mmarco/v2/dt"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6608183}}, "mmarco/v2/dt/train": {"docs": {"_ref": "mmarco/v2/dt"}, "queries": {"count": 808731}, "qrels": {"count": 
532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/es": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/es/dev": {"docs": {"_ref": "mmarco/v2/es"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/es/dev/small": {"docs": {"_ref": "mmarco/v2/es"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6777044}}, "mmarco/v2/es/train": {"docs": {"_ref": "mmarco/v2/es"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/fr": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/fr/dev": {"docs": {"_ref": "mmarco/v2/fr"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/fr/dev/small": {"docs": {"_ref": "mmarco/v2/fr"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6831783}}, "mmarco/v2/fr/train": {"docs": {"_ref": "mmarco/v2/fr"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/hi": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/hi/dev": {"docs": {"_ref": "mmarco/v2/hi"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/hi/dev/small": {"docs": {"_ref": "mmarco/v2/hi"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6961912}}, 
"mmarco/v2/hi/train": {"docs": {"_ref": "mmarco/v2/hi"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/id": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/id/dev": {"docs": {"_ref": "mmarco/v2/id"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/id/dev/small": {"docs": {"_ref": "mmarco/v2/id"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6791487}}, "mmarco/v2/id/train": {"docs": {"_ref": "mmarco/v2/id"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/it": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/it/dev": {"docs": {"_ref": "mmarco/v2/it"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/it/dev/small": {"docs": {"_ref": "mmarco/v2/it"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6952771}}, "mmarco/v2/it/train": {"docs": {"_ref": "mmarco/v2/it"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/ja": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/ja/dev": {"docs": {"_ref": "mmarco/v2/ja"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/ja/dev/small": {"docs": {"_ref": "mmarco/v2/ja"}, "queries": {"count": 6980}, "qrels": {"count": 7437, 
"fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6817446}}, "mmarco/v2/ja/train": {"docs": {"_ref": "mmarco/v2/ja"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/pt": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/pt/dev": {"docs": {"_ref": "mmarco/v2/pt"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/pt/dev/small": {"docs": {"_ref": "mmarco/v2/pt"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6975268}}, "mmarco/v2/pt/train": {"docs": {"_ref": "mmarco/v2/pt"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/ru": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/ru/dev": {"docs": {"_ref": "mmarco/v2/ru"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/ru/dev/small": {"docs": {"_ref": "mmarco/v2/ru"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6931773}}, "mmarco/v2/ru/train": {"docs": {"_ref": "mmarco/v2/ru"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/vi": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/vi/dev": {"docs": {"_ref": "mmarco/v2/vi"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, 
"mmarco/v2/vi/dev/small": {"docs": {"_ref": "mmarco/v2/vi"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6976219}}, "mmarco/v2/vi/train": {"docs": {"_ref": "mmarco/v2/vi"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/v2/zh": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/v2/zh/dev": {"docs": {"_ref": "mmarco/v2/zh"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/v2/zh/dev/small": {"docs": {"_ref": "mmarco/v2/zh"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6979520}}, "mmarco/v2/zh/train": {"docs": {"_ref": "mmarco/v2/zh"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 39780811}}, "mmarco/zh": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "mmarco/zh/dev": {"docs": {"_ref": "mmarco/zh"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "mmarco/zh/dev/small": {"docs": {"_ref": "mmarco/zh"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}}, "mmarco/zh/dev/small/v1.1": {"docs": {"_ref": "mmarco/zh"}, "queries": {"count": 6980}, "qrels": {"_ref": "mmarco/zh/dev/small"}, "scoreddocs": {"count": 1034597}}, "mmarco/zh/dev/v1.1": {"docs": {"_ref": "mmarco/zh"}, "queries": {"count": 101093}, "qrels": {"_ref": "mmarco/zh/dev"}}, "mmarco/zh/train": {"docs": {"_ref": "mmarco/zh"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 
532761}}}}, "docpairs": {"count": 39780811}}, "mr-tydi": {}, "mr-tydi/ar": {"docs": {"count": 2106586, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}, "queries": {"count": 16595}, "qrels": {"count": 16749, "fields": {"relevance": {"counts_by_value": {"1": 16749}}}}}, "mr-tydi/ar/dev": {"docs": {"_ref": "mr-tydi/ar"}, "queries": {"count": 3115}, "qrels": {"count": 3115, "fields": {"relevance": {"counts_by_value": {"1": 3115}}}}}, "mr-tydi/ar/test": {"docs": {"_ref": "mr-tydi/ar"}, "queries": {"count": 1081}, "qrels": {"count": 1257, "fields": {"relevance": {"counts_by_value": {"1": 1257}}}}}, "mr-tydi/ar/train": {"docs": {"_ref": "mr-tydi/ar"}, "queries": {"count": 12377}, "qrels": {"count": 12377, "fields": {"relevance": {"counts_by_value": {"1": 12377}}}}}, "mr-tydi/bn": {"docs": {"count": 304059, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}, "queries": {"count": 2264}, "qrels": {"count": 2292, "fields": {"relevance": {"counts_by_value": {"1": 2292}}}}}, "mr-tydi/bn/dev": {"docs": {"_ref": "mr-tydi/bn"}, "queries": {"count": 440}, "qrels": {"count": 443, "fields": {"relevance": {"counts_by_value": {"1": 443}}}}}, "mr-tydi/bn/test": {"docs": {"_ref": "mr-tydi/bn"}, "queries": {"count": 111}, "qrels": {"count": 130, "fields": {"relevance": {"counts_by_value": {"1": 130}}}}}, "mr-tydi/bn/train": {"docs": {"_ref": "mr-tydi/bn"}, "queries": {"count": 1713}, "qrels": {"count": 1719, "fields": {"relevance": {"counts_by_value": {"1": 1719}}}}}, "mr-tydi/en": {"docs": {"count": 32907100, "fields": {"doc_id": {"max_len": 13, "common_prefix": ""}}}, "queries": {"count": 5194}, "qrels": {"count": 5360, "fields": {"relevance": {"counts_by_value": {"1": 5360}}}}}, "mr-tydi/en/dev": {"docs": {"_ref": "mr-tydi/en"}, "queries": {"count": 878}, "qrels": {"count": 878, "fields": {"relevance": {"counts_by_value": {"1": 878}}}}}, "mr-tydi/en/test": {"docs": {"_ref": "mr-tydi/en"}, "queries": {"count": 744}, "qrels": {"count": 935, "fields": {"relevance": 
{"counts_by_value": {"1": 935}}}}}, "mr-tydi/en/train": {"docs": {"_ref": "mr-tydi/en"}, "queries": {"count": 3547}, "qrels": {"count": 3547, "fields": {"relevance": {"counts_by_value": {"1": 3547}}}}}, "mr-tydi/fi": {"docs": {"count": 1908757, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}, "queries": {"count": 9572}, "qrels": {"count": 9750, "fields": {"relevance": {"counts_by_value": {"1": 9750}}}}}, "mr-tydi/fi/dev": {"docs": {"_ref": "mr-tydi/fi"}, "queries": {"count": 1738}, "qrels": {"count": 1738, "fields": {"relevance": {"counts_by_value": {"1": 1738}}}}}, "mr-tydi/fi/test": {"docs": {"_ref": "mr-tydi/fi"}, "queries": {"count": 1254}, "qrels": {"count": 1451, "fields": {"relevance": {"counts_by_value": {"1": 1451}}}}}, "mr-tydi/fi/train": {"docs": {"_ref": "mr-tydi/fi"}, "queries": {"count": 6561}, "qrels": {"count": 6561, "fields": {"relevance": {"counts_by_value": {"1": 6561}}}}}, "mr-tydi/id": {"docs": {"count": 1469399, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}, "queries": {"count": 6977}, "qrels": {"count": 7087, "fields": {"relevance": {"counts_by_value": {"1": 7087}}}}}, "mr-tydi/id/dev": {"docs": {"_ref": "mr-tydi/id"}, "queries": {"count": 1224}, "qrels": {"count": 1224, "fields": {"relevance": {"counts_by_value": {"1": 1224}}}}}, "mr-tydi/id/test": {"docs": {"_ref": "mr-tydi/id"}, "queries": {"count": 829}, "qrels": {"count": 961, "fields": {"relevance": {"counts_by_value": {"1": 961}}}}}, "mr-tydi/id/train": {"docs": {"_ref": "mr-tydi/id"}, "queries": {"count": 4902}, "qrels": {"count": 4902, "fields": {"relevance": {"counts_by_value": {"1": 4902}}}}}, "mr-tydi/ja": {"docs": {"count": 7000027, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}, "queries": {"count": 5353}, "qrels": {"count": 5548, "fields": {"relevance": {"counts_by_value": {"1": 5548}}}}}, "mr-tydi/ja/dev": {"docs": {"_ref": "mr-tydi/ja"}, "queries": {"count": 928}, "qrels": {"count": 928, "fields": {"relevance": {"counts_by_value": {"1": 
928}}}}}, "mr-tydi/ja/test": {"docs": {"_ref": "mr-tydi/ja"}, "queries": {"count": 720}, "qrels": {"count": 923, "fields": {"relevance": {"counts_by_value": {"1": 923}}}}}, "mr-tydi/ja/train": {"docs": {"_ref": "mr-tydi/ja"}, "queries": {"count": 3697}, "qrels": {"count": 3697, "fields": {"relevance": {"counts_by_value": {"1": 3697}}}}}, "mr-tydi/ko": {"docs": {"count": 1496126, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}, "queries": {"count": 2019}, "qrels": {"count": 2116, "fields": {"relevance": {"counts_by_value": {"1": 2116}}}}}, "mr-tydi/ko/dev": {"docs": {"_ref": "mr-tydi/ko"}, "queries": {"count": 303}, "qrels": {"count": 307, "fields": {"relevance": {"counts_by_value": {"1": 307}}}}}, "mr-tydi/ko/test": {"docs": {"_ref": "mr-tydi/ko"}, "queries": {"count": 421}, "qrels": {"count": 492, "fields": {"relevance": {"counts_by_value": {"1": 492}}}}}, "mr-tydi/ko/train": {"docs": {"_ref": "mr-tydi/ko"}, "queries": {"count": 1295}, "qrels": {"count": 1317, "fields": {"relevance": {"counts_by_value": {"1": 1317}}}}}, "mr-tydi/ru": {"docs": {"count": 9597504, "fields": {"doc_id": {"max_len": 12, "common_prefix": ""}}}, "queries": {"count": 7763}, "qrels": {"count": 7909, "fields": {"relevance": {"counts_by_value": {"1": 7909}}}}}, "mr-tydi/ru/dev": {"docs": {"_ref": "mr-tydi/ru"}, "queries": {"count": 1375}, "qrels": {"count": 1375, "fields": {"relevance": {"counts_by_value": {"1": 1375}}}}}, "mr-tydi/ru/test": {"docs": {"_ref": "mr-tydi/ru"}, "queries": {"count": 995}, "qrels": {"count": 1168, "fields": {"relevance": {"counts_by_value": {"1": 1168}}}}}, "mr-tydi/ru/train": {"docs": {"_ref": "mr-tydi/ru"}, "queries": {"count": 5366}, "qrels": {"count": 5366, "fields": {"relevance": {"counts_by_value": {"1": 5366}}}}}, "mr-tydi/sw": {"docs": {"count": 136689, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 3271}, "qrels": {"count": 3767, "fields": {"relevance": {"counts_by_value": {"1": 3767}}}}}, "mr-tydi/sw/dev": 
{"docs": {"_ref": "mr-tydi/sw"}, "queries": {"count": 526}, "qrels": {"count": 623, "fields": {"relevance": {"counts_by_value": {"1": 623}}}}}, "mr-tydi/sw/test": {"docs": {"_ref": "mr-tydi/sw"}, "queries": {"count": 670}, "qrels": {"count": 743, "fields": {"relevance": {"counts_by_value": {"1": 743}}}}}, "mr-tydi/sw/train": {"docs": {"_ref": "mr-tydi/sw"}, "queries": {"count": 2072}, "qrels": {"count": 2401, "fields": {"relevance": {"counts_by_value": {"1": 2401}}}}}, "mr-tydi/te": {"docs": {"count": 548224, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}, "queries": {"count": 5517}, "qrels": {"count": 5540, "fields": {"relevance": {"counts_by_value": {"1": 5540}}}}}, "mr-tydi/te/dev": {"docs": {"_ref": "mr-tydi/te"}, "queries": {"count": 983}, "qrels": {"count": 983, "fields": {"relevance": {"counts_by_value": {"1": 983}}}}}, "mr-tydi/te/test": {"docs": {"_ref": "mr-tydi/te"}, "queries": {"count": 646}, "qrels": {"count": 677, "fields": {"relevance": {"counts_by_value": {"1": 677}}}}}, "mr-tydi/te/train": {"docs": {"_ref": "mr-tydi/te"}, "queries": {"count": 3880}, "qrels": {"count": 3880, "fields": {"relevance": {"counts_by_value": {"1": 3880}}}}}, "mr-tydi/th": {"docs": {"count": 568855, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}, "queries": {"count": 5322}, "qrels": {"count": 5545, "fields": {"relevance": {"counts_by_value": {"1": 5545}}}}}, "mr-tydi/th/dev": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 807}, "qrels": {"count": 817, "fields": {"relevance": {"counts_by_value": {"1": 817}}}}}, "mr-tydi/th/test": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 1190}, "qrels": {"count": 1368, "fields": {"relevance": {"counts_by_value": {"1": 1368}}}}}, "mr-tydi/th/train": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 3319}, "qrels": {"count": 3360, "fields": {"relevance": {"counts_by_value": {"1": 3360}}}}}, "msmarco-document": {"docs": {"count": 3213835, "fields": {"doc_id": {"max_len": 8, "common_prefix": 
"D"}}}}, "msmarco-document-v2": {"docs": {"count": 11959635, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}}, "msmarco-document-v2.1": {"docs": {"count": 10960555, "fields": {"doc_id": {"max_len": 30, "common_prefix": "msmarco_v2.1_doc_"}}}}, "msmarco-document-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-document-v2.1"}, "queries": {"count": 301}}, "msmarco-document-v2/anchor-text": {"docs": {"count": 4821244, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}}, "msmarco-document-v2/dev1": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 4552}, "qrels": {"count": 4702, "fields": {"relevance": {"counts_by_value": {"1": 4702}}}}, "scoreddocs": {"count": 455200}}, "msmarco-document-v2/dev2": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 5000}, "qrels": {"count": 5178, "fields": {"relevance": {"counts_by_value": {"1": 5178}}}}, "scoreddocs": {"count": 500000}}, "msmarco-document-v2/train": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 322196}, "qrels": {"count": 331956, "fields": {"relevance": {"counts_by_value": {"1": 331956}}}}, "scoreddocs": {"count": 32218809}}, "msmarco-document-v2/trec-dl-2019": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 200}, "qrels": {"count": 13940, "fields": {"relevance": {"counts_by_value": {"0": 8229, "1": 3957, "2": 1009, "3": 745}}}}}, "msmarco-document-v2/trec-dl-2019/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 43}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2019"}}, "msmarco-document-v2/trec-dl-2020": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 200}, "qrels": {"count": 7942, "fields": {"relevance": {"counts_by_value": {"0": 6371, "3": 233, "1": 1059, "2": 279}}}}}, "msmarco-document-v2/trec-dl-2020/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 45}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2020"}}, "msmarco-document-v2/trec-dl-2021": 
{"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 477}, "scoreddocs": {"count": 47700}, "qrels": {"count": 13058, "fields": {"relevance": {"counts_by_value": {"2": 2769, "0": 4855, "3": 1256, "1": 4178}}}}}, "msmarco-document-v2/trec-dl-2021/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 57}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2021"}, "scoreddocs": {"count": 5700}}, "msmarco-document-v2/trec-dl-2022": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 500}, "scoreddocs": {"count": 50000}, "qrels": {"count": 369638, "fields": {"relevance": {"counts_by_value": {"0": 274971, "1": 49016, "2": 44255, "3": 1396}}}}}, "msmarco-document-v2/trec-dl-2022/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 76}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2022"}, "scoreddocs": {"count": 7600}}, "msmarco-document-v2/trec-dl-2023": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 700}, "scoreddocs": {"count": 70000}, "qrels": {"count": 18034, "fields": {"relevance": {"counts_by_value": {"0": 11876, "1": 3015, "2": 1668, "3": 1475}}}}}, "msmarco-document-v2/trec-dl-2023/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 82}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2023"}, "scoreddocs": {"count": 8200}}, "msmarco-document/anchor-text": {"docs": {"count": 1703834, "fields": {"doc_id": {"max_len": 8, "common_prefix": "D"}}}}, "msmarco-document/dev": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 5193}, "qrels": {"count": 5193, "fields": {"relevance": {"counts_by_value": {"1": 5193}}}}, "scoreddocs": {"count": 519300}}, "msmarco-document/eval": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 5793}, "scoreddocs": {"count": 579300}}, "msmarco-document/orcas": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 10405342}, "qrels": {"count": 18823602, "fields": {"relevance": {"counts_by_value": {"1": 18823602}}}}, "scoreddocs": 
{"count": 982951086}}, "msmarco-document/train": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 367013}, "qrels": {"count": 367013, "fields": {"relevance": {"counts_by_value": {"1": 367013}}}}, "scoreddocs": {"count": 36701116}}, "msmarco-document/trec-dl-2019": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 200}, "qrels": {"count": 16258, "fields": {"relevance": {"counts_by_value": {"0": 9661, "2": 1149, "1": 4607, "3": 841}}}}, "scoreddocs": {"count": 20000}}, "msmarco-document/trec-dl-2019/judged": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 43}, "qrels": {"_ref": "msmarco-document/trec-dl-2019"}, "scoreddocs": {"count": 4300}}, "msmarco-document/trec-dl-2020": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 200}, "qrels": {"count": 9098, "fields": {"relevance": {"counts_by_value": {"0": 7331, "3": 265, "1": 1187, "2": 315}}}}, "scoreddocs": {"count": 20000}}, "msmarco-document/trec-dl-2020/judged": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 45}, "qrels": {"_ref": "msmarco-document/trec-dl-2020"}, "scoreddocs": {"count": 4500}}, "msmarco-document/trec-dl-hard": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 50}, "qrels": {"count": 8544, "fields": {"relevance": {"counts_by_value": {"0": 5186, "2": 485, "1": 2517, "3": 356}}}}}, "msmarco-document/trec-dl-hard/fold1": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 10}, "qrels": {"count": 1557, "fields": {"relevance": {"counts_by_value": {"1": 328, "0": 1048, "3": 106, "2": 75}}}}}, "msmarco-document/trec-dl-hard/fold2": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 10}, "qrels": {"count": 1345, "fields": {"relevance": {"counts_by_value": {"2": 78, "1": 304, "0": 922, "3": 41}}}}}, "msmarco-document/trec-dl-hard/fold3": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 10}, "qrels": {"count": 474, "fields": {"relevance": {"counts_by_value": {"0": 333, "2": 44, "3": 32, "1": 65}}}}}, 
"msmarco-document/trec-dl-hard/fold4": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 10}, "qrels": {"count": 1054, "fields": {"relevance": {"counts_by_value": {"1": 258, "2": 34, "0": 718, "3": 44}}}}}, "msmarco-document/trec-dl-hard/fold5": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 10}, "qrels": {"count": 4114, "fields": {"relevance": {"counts_by_value": {"0": 2165, "1": 1562, "2": 254, "3": 133}}}}}, "msmarco-passage": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "msmarco-passage-v2": {"docs": {"count": 138364198, "fields": {"doc_id": {"max_len": 28, "common_prefix": "msmarco_passage_"}}}}, "msmarco-passage-v2/dedup": {"docs": {"count": 119582876, "fields": {"doc_id": {"max_len": 28, "common_prefix": "msmarco_passage_"}}}}, "msmarco-passage-v2/dev1": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 3903}, "qrels": {"count": 4009, "fields": {"relevance": {"counts_by_value": {"1": 4009}}}}, "scoreddocs": {"count": 390300}}, "msmarco-passage-v2/dev2": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 4281}, "qrels": {"count": 4411, "fields": {"relevance": {"counts_by_value": {"1": 4411}}}}, "scoreddocs": {"count": 428100}}, "msmarco-passage-v2/train": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 277144}, "qrels": {"count": 284212, "fields": {"relevance": {"counts_by_value": {"1": 284212}}}}, "scoreddocs": {"count": 27713673}}, "msmarco-passage-v2/trec-dl-2021": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 477}, "scoreddocs": {"count": 47700}, "qrels": {"count": 10828, "fields": {"relevance": {"counts_by_value": {"0": 4338, "3": 1086, "1": 3063, "2": 2341}}}}}, "msmarco-passage-v2/trec-dl-2021/judged": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 53}, "qrels": {"_ref": "msmarco-passage-v2/trec-dl-2021"}, "scoreddocs": {"count": 5300}}, "msmarco-passage-v2/trec-dl-2022": {"docs": {"_ref": "msmarco-passage-v2"}, 
"queries": {"count": 500}, "scoreddocs": {"count": 50000}, "qrels": {"count": 386416, "fields": {"relevance": {"counts_by_value": {"0": 286459, "1": 52218, "2": 46080, "3": 1659}}}}}, "msmarco-passage-v2/trec-dl-2022/judged": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 76}, "qrels": {"_ref": "msmarco-passage-v2/trec-dl-2022"}, "scoreddocs": {"count": 7600}}, "msmarco-passage-v2/trec-dl-2023": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 700}, "scoreddocs": {"count": 70000}, "qrels": {"count": 22327, "fields": {"relevance": {"counts_by_value": {"0": 13866, "1": 4372, "2": 2259, "3": 1830}}}}}, "msmarco-passage-v2/trec-dl-2023/judged": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 82}, "qrels": {"_ref": "msmarco-passage-v2/trec-dl-2023"}, "scoreddocs": {"count": 8200}}, "msmarco-passage/dev": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "msmarco-passage/dev/2": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 4281}, "qrels": {"count": 4655, "fields": {"relevance": {"counts_by_value": {"1": 4655}}}}}, "msmarco-passage/dev/judged": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 55578}, "qrels": {"_ref": "msmarco-passage/dev"}}, "msmarco-passage/dev/small": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6668967}}, "msmarco-passage/eval": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 101092}}, "msmarco-passage/eval/small": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 6837}, "scoreddocs": {"count": 6515736}}, "msmarco-passage/train": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "scoreddocs": {"count": 478002393}, "docpairs": 
{"count": 269919004}}, "msmarco-passage/train/judged": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 502939}, "qrels": {"_ref": "msmarco-passage/train"}, "scoreddocs": {"count": 478002393}, "docpairs": {"_ref": "msmarco-passage/train"}}, "msmarco-passage/train/medical": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 78895}, "qrels": {"count": 54627, "fields": {"relevance": {"counts_by_value": {"1": 54627}}}}, "scoreddocs": {"count": 48852277}, "docpairs": {"count": 28969254}}, "msmarco-passage/train/split200-train": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 808531}, "qrels": {"count": 532630, "fields": {"relevance": {"counts_by_value": {"1": 532630}}}}, "scoreddocs": {"count": 477883382}, "docpairs": {"count": 269854839}}, "msmarco-passage/train/split200-valid": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 200}, "qrels": {"count": 131, "fields": {"relevance": {"counts_by_value": {"1": 131}}}}, "scoreddocs": {"count": 119011}, "docpairs": {"count": 64165}}, "msmarco-passage/train/triples-small": {"docs": {"_ref": "msmarco-passage"}, "queries": {"_ref": "msmarco-passage/train"}, "qrels": {"_ref": "msmarco-passage/train"}, "scoreddocs": {"_ref": "msmarco-passage/train"}, "docpairs": {"count": 39780811}}, "msmarco-passage/train/triples-v2": {"docs": {"_ref": "msmarco-passage"}, "queries": {"_ref": "msmarco-passage/train"}, "qrels": {"_ref": "msmarco-passage/train"}, "scoreddocs": {"_ref": "msmarco-passage/train"}, "docpairs": {"count": 397768673}}, "msmarco-passage/trec-dl-2019": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 200}, "qrels": {"count": 9260, "fields": {"relevance": {"counts_by_value": {"0": 5158, "1": 1601, "2": 1804, "3": 697}}}}, "scoreddocs": {"count": 189877}}, "msmarco-passage/trec-dl-2019/judged": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 43}, "qrels": {"_ref": "msmarco-passage/trec-dl-2019"}, "scoreddocs": {"count": 41042}}, "msmarco-passage/trec-dl-2020": 
{"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 200}, "qrels": {"count": 11386, "fields": {"relevance": {"counts_by_value": {"2": 1020, "3": 646, "0": 7780, "1": 1940}}}}, "scoreddocs": {"count": 190699}}, "msmarco-passage/trec-dl-2020/judged": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 54}, "qrels": {"_ref": "msmarco-passage/trec-dl-2020"}, "scoreddocs": {"count": 50024}}, "msmarco-passage/trec-dl-hard": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 50}, "qrels": {"count": 4256, "fields": {"relevance": {"counts_by_value": {"0": 2462, "1": 810, "2": 634, "3": 350}}}}}, "msmarco-passage/trec-dl-hard/fold1": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 10}, "qrels": {"count": 1072, "fields": {"relevance": {"counts_by_value": {"0": 582, "1": 197, "2": 181, "3": 112}}}}}, "msmarco-passage/trec-dl-hard/fold2": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 10}, "qrels": {"count": 898, "fields": {"relevance": {"counts_by_value": {"3": 37, "2": 99, "0": 611, "1": 151}}}}}, "msmarco-passage/trec-dl-hard/fold3": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 10}, "qrels": {"count": 444, "fields": {"relevance": {"counts_by_value": {"0": 342, "1": 43, "2": 36, "3": 23}}}}}, "msmarco-passage/trec-dl-hard/fold4": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 10}, "qrels": {"count": 716, "fields": {"relevance": {"counts_by_value": {"3": 32, "2": 151, "1": 137, "0": 396}}}}}, "msmarco-passage/trec-dl-hard/fold5": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 10}, "qrels": {"count": 1126, "fields": {"relevance": {"counts_by_value": {"3": 146, "1": 282, "0": 531, "2": 167}}}}}, "msmarco-qna": {"docs": {"count": 9048606, "fields": {"doc_id": {"max_len": 10, "common_prefix": ""}}}}, "msmarco-qna/dev": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 101093}, "qrels": {"count": 1008985, "fields": {"relevance": {"counts_by_value": {"0": 949712, "1": 59273}}}}, "scoreddocs": 
{"count": 1008985}}, "msmarco-qna/eval": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 101092}, "scoreddocs": {"count": 1008943}}, "msmarco-qna/train": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 808731}, "qrels": {"count": 8069749, "fields": {"relevance": {"counts_by_value": {"1": 532761, "0": 7536988}}}}, "scoreddocs": {"count": 8069749}}, "msmarco-segment-v2.1": {"docs": {"count": 113520750, "fields": {"doc_id": {"max_len": 45, "common_prefix": "msmarco_v2.1_doc_"}}}}, "msmarco-segment-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-segment-v2.1"}, "queries": {"count": 301}}, "nano-beir": {}, "nano-beir/arguana": {"docs": {"count": 3635, "fields": {"doc_id": {"max_len": 47, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 50, "fields": {"relevance": {"counts_by_value": {"1": 50}}}}}, "nano-beir/climate-fever": {"docs": {"count": 3408, "fields": {"doc_id": {"max_len": 130, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 148, "fields": {"relevance": {"counts_by_value": {"1": 148}}}}}, "nano-beir/dbpedia-entity": {"docs": {"count": 6045, "fields": {"doc_id": {"max_len": 108, "common_prefix": "<dbpedia:"}}}, "queries": {"count": 50}, "qrels": {"count": 1158, "fields": {"relevance": {"counts_by_value": {"1": 1158}}}}}, "nano-beir/fever": {"docs": {"count": 4996, "fields": {"doc_id": {"max_len": 88, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 57, "fields": {"relevance": {"counts_by_value": {"1": 57}}}}}, "nano-beir/fiqa": {"docs": {"count": 4598, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 123, "fields": {"relevance": {"counts_by_value": {"1": 123}}}}}, "nano-beir/hotpotqa": {"docs": {"count": 5090, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 100, "fields": {"relevance": {"counts_by_value": {"1": 100}}}}}, "nano-beir/msmarco": {"docs": {"count": 5043, "fields": 
{"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 50, "fields": {"relevance": {"counts_by_value": {"1": 50}}}}}, "nano-beir/nfcorpus": {"docs": {"count": 2953, "fields": {"doc_id": {"max_len": 8, "common_prefix": "MED-"}}}, "queries": {"count": 50}, "qrels": {"count": 2518, "fields": {"relevance": {"counts_by_value": {"1": 2518}}}}}, "nano-beir/nq": {"docs": {"count": 5035, "fields": {"doc_id": {"max_len": 10, "common_prefix": "doc"}}}, "queries": {"count": 50}, "qrels": {"count": 57, "fields": {"relevance": {"counts_by_value": {"1": 57}}}}}, "nano-beir/quora": {"docs": {"count": 5046, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 70, "fields": {"relevance": {"counts_by_value": {"1": 70}}}}}, "nano-beir/scidocs": {"docs": {"count": 2210, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 244, "fields": {"relevance": {"counts_by_value": {"1": 244}}}}}, "nano-beir/scifact": {"docs": {"count": 2919, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 56, "fields": {"relevance": {"counts_by_value": {"1": 56}}}}}, "nano-beir/webis-touche2020": {"docs": {"count": 5745, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}}, "natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, "scoreddocs": {"count": 973480}}, "natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 
40374730}}, "neuclir": {}, "neuclir/1": {}, "neuclir/1/fa": {"docs": {"count": 2232016, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "neuclir/1/fa/hc4-filtered": {"docs": {"count": 391703, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 60}, "qrels": {"count": 3087, "fields": {"relevance": {"counts_by_value": {"0": 2557, "3": 269, "1": 261}}}}}, "neuclir/1/fa/trec-2022": {"docs": {"_ref": "neuclir/1/fa"}, "queries": {"count": 46}, "qrels": {"count": 34174, "fields": {"relevance": {"counts_by_value": {"3": 870, "0": 32702, "1": 602}}}}}, "neuclir/1/fa/trec-2023": {"docs": {"_ref": "neuclir/1/fa"}, "queries": {"count": 76}, "qrels": {"count": 26662, "fields": {"relevance": {"counts_by_value": {"1": 2491, "2": 2068, "0": 21624, "3": 479}}}}}, "neuclir/1/multi": {"docs": {"count": 10038768, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "neuclir/1/multi/trec-2023": {"docs": {"_ref": "neuclir/1/multi"}, "queries": {"count": 76}, "qrels": {"count": 79934, "fields": {"relevance": {"counts_by_value": {"1": 6041, "2": 7171, "0": 66087, "3": 635}}}}}, "neuclir/1/ru": {"docs": {"count": 4627543, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "neuclir/1/ru/hc4-filtered": {"docs": {"count": 964719, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 54}, "qrels": {"count": 3235, "fields": {"relevance": {"counts_by_value": {"0": 2483, "1": 478, "3": 274}}}}}, "neuclir/1/ru/trec-2022": {"docs": {"_ref": "neuclir/1/ru"}, "queries": {"count": 45}, "qrels": {"count": 33006, "fields": {"relevance": {"counts_by_value": {"3": 810, "0": 31117, "1": 1079}}}}}, "neuclir/1/ru/trec-2023": {"docs": {"_ref": "neuclir/1/ru"}, "queries": {"count": 76}, "qrels": {"count": 25634, "fields": {"relevance": {"counts_by_value": {"0": 20905, "2": 3190, "3": 117, "1": 1422}}}}}, "neuclir/1/zh": {"docs": {"count": 3179209, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, 
"neuclir/1/zh/hc4-filtered": {"docs": {"count": 519945, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 60}, "qrels": {"count": 3217, "fields": {"relevance": {"counts_by_value": {"0": 2651, "3": 344, "1": 222}}}}}, "neuclir/1/zh/trec-2022": {"docs": {"_ref": "neuclir/1/zh"}, "queries": {"count": 49}, "qrels": {"count": 36575, "fields": {"relevance": {"counts_by_value": {"0": 34442, "3": 720, "1": 1413}}}}}, "neuclir/1/zh/trec-2023": {"docs": {"_ref": "neuclir/1/zh"}, "queries": {"count": 76}, "qrels": {"count": 27638, "fields": {"relevance": {"counts_by_value": {"0": 23558, "2": 1913, "1": 2128, "3": 39}}}}}, "neumarco": {}, "neumarco/fa": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "neumarco/fa/dev": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "neumarco/fa/dev/judged": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/fa/dev"}}, "neumarco/fa/dev/small": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}}, "neumarco/fa/train": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}}, "neumarco/fa/train/judged": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/fa/train"}, "docpairs": {"_ref": "neumarco/fa/train"}}, "neumarco/ru": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "neumarco/ru/dev": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "neumarco/ru/dev/judged": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 55578}, 
"qrels": {"_ref": "neumarco/ru/dev"}}, "neumarco/ru/dev/small": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}}, "neumarco/ru/train": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}}, "neumarco/ru/train/judged": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/ru/train"}, "docpairs": {"_ref": "neumarco/ru/train"}}, "neumarco/zh": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "neumarco/zh/dev": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}}, "neumarco/zh/dev/judged": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/zh/dev"}}, "neumarco/zh/dev/small": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}}, "neumarco/zh/train": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}}, "neumarco/zh/train/judged": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/zh/train"}, "docpairs": {"_ref": "neumarco/zh/train"}}, "nfcorpus": {"docs": {"count": 5371, "fields": {"doc_id": {"max_len": 8, "common_prefix": "MED-"}}}}, "nfcorpus/dev": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 325}, "qrels": {"count": 14589, "fields": {"relevance": {"counts_by_value": {"3": 521, "2": 10864, "1": 3204}}}}}, "nfcorpus/dev/nontopic": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 144}, "qrels": {"count": 4353, "fields": {"relevance": {"counts_by_value": 
{"3": 521, "2": 3133, "1": 699}}}}}, "nfcorpus/dev/video": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 102}, "qrels": {"count": 3068, "fields": {"relevance": {"counts_by_value": {"3": 411, "2": 1979, "1": 678}}}}}, "nfcorpus/test": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 325}, "qrels": {"count": 15820, "fields": {"relevance": {"counts_by_value": {"3": 576, "1": 3486, "2": 11758}}}}}, "nfcorpus/test/nontopic": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 144}, "qrels": {"count": 4540, "fields": {"relevance": {"counts_by_value": {"3": 576, "1": 676, "2": 3288}}}}}, "nfcorpus/test/video": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 102}, "qrels": {"count": 3108, "fields": {"relevance": {"counts_by_value": {"3": 464, "2": 2034, "1": 610}}}}}, "nfcorpus/train": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 2594}, "qrels": {"count": 139350, "fields": {"relevance": {"counts_by_value": {"3": 4279, "2": 106296, "1": 28775}}}}}, "nfcorpus/train/nontopic": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 1141}, "qrels": {"count": 37383, "fields": {"relevance": {"counts_by_value": {"3": 4279, "2": 26384, "1": 6720}}}}}, "nfcorpus/train/video": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 812}, "qrels": {"count": 27465, "fields": {"relevance": {"counts_by_value": {"3": 3536, "2": 17669, "1": 6260}}}}}, "nyt": {"docs": {"count": 1864661, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "nyt/trec-core-2017": {"docs": {"_ref": "nyt"}, "queries": {"count": 50}, "qrels": {"count": 30030, "fields": {"relevance": {"counts_by_value": {"1": 5549, "0": 21028, "2": 3453}}}}}, "nyt/wksup": {"docs": {"_ref": "nyt"}, "queries": {"count": 1864661}, "qrels": {"count": 1864661, "fields": {"relevance": {"counts_by_value": {"1": 1864661}}}}}, "nyt/wksup/train": {"docs": {"_ref": "nyt"}, "queries": {"count": 1863657}, "qrels": {"count": 1863657, "fields": {"relevance": {"counts_by_value": {"1": 1863657}}}}}, "nyt/wksup/valid": {"docs": 
{"_ref": "nyt"}, "queries": {"count": 1004}, "qrels": {"count": 1004, "fields": {"relevance": {"counts_by_value": {"1": 1004}}}}}, "pmc": {}, "pmc/v1": {"docs": {"count": 733111, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "pmc/v1/trec-cds-2014": {"docs": {"_ref": "pmc/v1"}, "queries": {"count": 30}, "qrels": {"count": 37949, "fields": {"relevance": {"counts_by_value": {"0": 34593, "2": 1683, "1": 1673}}}}}, "pmc/v1/trec-cds-2015": {"docs": {"_ref": "pmc/v1"}, "queries": {"count": 30}, "qrels": {"count": 37807, "fields": {"relevance": {"counts_by_value": {"1": 2979, "0": 32817, "2": 2011}}}}}, "pmc/v2": {"docs": {"count": 1255260, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "pmc/v2/trec-cds-2016": {"docs": {"_ref": "pmc/v2"}, "queries": {"count": 30}, "qrels": {"count": 37707, "fields": {"relevance": {"counts_by_value": {"0": 32246, "1": 3411, "2": 2050}}}}}, "sara": {"docs": {"count": 1702, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 34413, "fields": {"relevance": {"counts_by_value": {"0": 32706, "2": 708, "1": 999}}}}}, "touche-image": {}, "touche-image/2022-06-13": {"docs": {"count": 23841, "fields": {"doc_id": {"max_len": 17, "common_prefix": "I"}}}}, "touche-image/2022-06-13/touche-2022-task-3": {"docs": {"_ref": "touche-image/2022-06-13"}, "queries": {"count": 50}, "qrels": {"count": 19821, "fields": {"relevance": {"counts_by_value": {"1": 8736, "0": 11085}}}}}, "trec-arabic": {"docs": {"count": 383872, "fields": {"doc_id": {"max_len": 21, "common_prefix": ""}}}}, "trec-arabic/ar2001": {"docs": {"_ref": "trec-arabic"}, "queries": {"count": 25}, "qrels": {"count": 22744, "fields": {"relevance": {"counts_by_value": {"0": 18622, "1": 4122}}}}}, "trec-arabic/ar2002": {"docs": {"_ref": "trec-arabic"}, "queries": {"count": 50}, "qrels": {"count": 38432, "fields": {"relevance": {"counts_by_value": {"0": 32523, "1": 5909}}}}}, "trec-cast": {}, "trec-cast/v0": {"docs": 
{"count": 47696605, "fields": {"doc_id": {"max_len": 46, "common_prefix": ""}}}}, "trec-cast/v0/train": {"docs": {"_ref": "trec-cast/v0"}, "queries": {"count": 269}, "qrels": {"count": 2399, "fields": {"relevance": {"counts_by_value": {"2": 311, "0": 1759, "1": 329}}}}, "scoreddocs": {"count": 269000}}, "trec-cast/v0/train/judged": {"docs": {"_ref": "trec-cast/v0"}, "queries": {"count": 120}, "qrels": {"_ref": "trec-cast/v0/train"}, "scoreddocs": {"count": 120000}}, "trec-cast/v1": {"docs": {"count": 38622444, "fields": {"doc_id": {"max_len": 44, "common_prefix": ""}}}}, "trec-cast/v1/2019": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 479}, "qrels": {"count": 29350, "fields": {"relevance": {"counts_by_value": {"0": 21230, "1": 2889, "2": 2157, "3": 1456, "4": 1618}}}}, "scoreddocs": {"count": 479000}}, "trec-cast/v1/2019/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 173}, "qrels": {"_ref": "trec-cast/v1/2019"}, "scoreddocs": {"count": 173000}}, "trec-cast/v1/2020": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 216}, "qrels": {"count": 40451, "fields": {"relevance": {"counts_by_value": {"1": 2697, "0": 33781, "2": 1834, "3": 1408, "4": 731}}}}}, "trec-cast/v1/2020/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 208}, "qrels": {"_ref": "trec-cast/v1/2020"}}, "trec-cast/v2": {"docs": {"count": 9680029, "fields": {"doc_id": {"max_len": 41, "common_prefix": ""}}}}, "trec-cast/v2/2021": {"docs": {"_ref": "trec-cast/v2"}, "queries": {"count": 239}, "qrels": {"count": 19334, "fields": {"relevance": {"counts_by_value": {"0": 13829, "4": 716, "3": 1007, "2": 1710, "1": 2072}}}}}, "trec-cast/v2/kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, "trec-cast/v2/kilt/passages": {"docs": {"count": 17124025, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}}, "trec-cast/v2/kilt/segmented": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, 
"common_prefix": "KILT_"}}}}, "trec-cast/v2/msmarco": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}}, "trec-cast/v2/msmarco/passages": {"docs": {"count": 19092817, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_D"}}}}, "trec-cast/v2/msmarco/segmented": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}}, "trec-cast/v2/passages": {"docs": {"count": 39254641, "fields": {"doc_id": {"max_len": 45, "common_prefix": ""}}}}, "trec-cast/v2/wapo": {"docs": {"count": 724508, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, "trec-cast/v2/wapo/passages": {"docs": {"count": 3037799, "fields": {"doc_id": {"max_len": 45, "common_prefix": "WAPO_"}}}}, "trec-cast/v2/wapo/segmented": {"docs": {"count": 724508, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, "trec-cast/v3": {"docs": {"count": 106400940, "fields": {"doc_id": {"max_len": 45, "common_prefix": ""}}}}, "trec-cast/v3/2022": {"docs": {"_ref": "trec-cast/v3"}, "queries": {"count": 408}, "qrels": {"count": 42196, "fields": {"relevance": {"counts_by_value": {"0": 29868, "1": 5063, "3": 2129, "2": 3297, "4": 1839}}}}}, "trec-cast/v3/kilt": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, "trec-cast/v3/kilt/passages": {"docs": {"count": 17111488, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}}, "trec-cast/v3/kilt/segmented": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, "trec-cast/v3/msmarco": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}}, "trec-cast/v3/msmarco/passages": {"docs": {"count": 86326322, "fields": {"doc_id": {"max_len": 24, "common_prefix": "MARCO_"}}}}, "trec-cast/v3/msmarco/segmented": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}}, "trec-cast/v3/wapo": {"docs": 
{"count": 713594, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, "trec-cast/v3/wapo/passages": {"docs": {"count": 2963130, "fields": {"doc_id": {"max_len": 45, "common_prefix": "WAPO_"}}}}, "trec-cast/v3/wapo/segmented": {"docs": {"count": 713594, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, "trec-fair": {}, "trec-fair-2021": {"docs": {"count": 6280328, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-fair-2021/eval": {"docs": {"_ref": "trec-fair-2021"}, "queries": {"count": 49}, "qrels": {"count": 13757, "fields": {"relevance": {"counts_by_value": {"1": 13757}}}}}, "trec-fair-2021/train": {"docs": {"_ref": "trec-fair-2021"}, "queries": {"count": 57}, "qrels": {"count": 2185446, "fields": {"relevance": {"counts_by_value": {"1": 2185446}}}}}, "trec-fair/2021": {"docs": {"count": 6280328, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-fair/2021/eval": {"docs": {"_ref": "trec-fair/2021"}, "queries": {"count": 49}, "qrels": {"count": 13757, "fields": {"relevance": {"counts_by_value": {"1": 13757}}}}}, "trec-fair/2021/train": {"docs": {"_ref": "trec-fair/2021"}, "queries": {"count": 57}, "qrels": {"count": 2185446, "fields": {"relevance": {"counts_by_value": {"1": 2185446}}}}}, "trec-fair/2022": {"docs": {"count": 6475537, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-fair/2022/train": {"docs": {"_ref": "trec-fair/2022"}, "queries": {"count": 50}, "qrels": {"count": 2088306, "fields": {"relevance": {"counts_by_value": {"1": 2088306}}}}}, "trec-mandarin": {"docs": {"count": 164789, "fields": {"doc_id": {"max_len": 22, "common_prefix": ""}}}}, "trec-mandarin/trec5": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 28}, "qrels": {"count": 15588, "fields": {"relevance": {"counts_by_value": {"0": 13406, "1": 2182}}}}}, "trec-mandarin/trec6": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 26}, "qrels": {"count": 9236, "fields": {"relevance": 
{"counts_by_value": {"1": 2958, "0": 6278}}}}}, "trec-robust04": {"docs": {"count": 528155, "fields": {"doc_id": {"max_len": 16, "common_prefix": ""}}}, "queries": {"count": 250}, "qrels": {"count": 311410, "fields": {"relevance": {"counts_by_value": {"1": 16381, "0": 293998, "2": 1031}}}}}, "trec-robust04/fold1": {"docs": {"_ref": "trec-robust04"}, "queries": {"count": 50}, "qrels": {"count": 62789, "fields": {"relevance": {"counts_by_value": {"0": 59765, "1": 2795, "2": 229}}}}}, "trec-robust04/fold2": {"docs": {"_ref": "trec-robust04"}, "queries": {"count": 50}, "qrels": {"count": 63917, "fields": {"relevance": {"counts_by_value": {"1": 3334, "0": 60246, "2": 337}}}}}, "trec-robust04/fold3": {"docs": {"_ref": "trec-robust04"}, "queries": {"count": 50}, "qrels": {"count": 62901, "fields": {"relevance": {"counts_by_value": {"0": 58859, "1": 3877, "2": 165}}}}}, "trec-robust04/fold4": {"docs": {"_ref": "trec-robust04"}, "queries": {"count": 50}, "qrels": {"count": 57962, "fields": {"relevance": {"counts_by_value": {"0": 55103, "1": 2707, "2": 152}}}}}, "trec-robust04/fold5": {"docs": {"_ref": "trec-robust04"}, "queries": {"count": 50}, "qrels": {"count": 63841, "fields": {"relevance": {"counts_by_value": {"0": 60025, "1": 3668, "2": 148}}}}}, "trec-spanish": {"docs": {"count": 120605, "fields": {"doc_id": {"max_len": 13, "common_prefix": ""}}}}, "trec-spanish/trec3": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 19005, "fields": {"relevance": {"counts_by_value": {"1": 4766, "0": 14239}}}}}, "trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}}, "trec-tot": {}, "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": 
{"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2024/test": {"docs": {"_ref": "trec-tot/2024"}, "queries": {"count": 600}}, "trec-tot/2025": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2025/dev1": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 142}, "qrels": {"count": 142, "fields": {"relevance": {"counts_by_value": {"1": 142}}}}}, "trec-tot/2025/dev2": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}}, "trec-tot/2025/dev3": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 536}, "qrels": {"count": 536, "fields": {"relevance": {"counts_by_value": {"1": 536}}}}}, "trec-tot/2025/test": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 622}}, "trec-tot/2025/train": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}}, "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}}, "tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}}, "tripclick/test/head": {"docs": 
{"_ref": "tripclick"}, "queries": {"count": 1175}, "scoreddocs": {"count": 1159303}}, "tripclick/test/tail": {"docs": {"_ref": "tripclick"}, "queries": {"count": 1175}, "scoreddocs": {"count": 1165127}}, "tripclick/test/torso": {"docs": {"_ref": "tripclick"}, "queries": {"count": 1175}, "scoreddocs": {"count": 1161972}}, "tripclick/train": {"docs": {"_ref": "tripclick"}, "queries": {"count": 685649}, "qrels": {"count": 2705212, "fields": {"relevance": {"counts_by_value": {"1": 1239161, "0": 1466051}}}}, "docpairs": {"count": 23221224}}, "tripclick/train/head": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3529}, "qrels": {"count": 116821, "fields": {"relevance": {"counts_by_value": {"1": 55663, "0": 61158}}}}}, "tripclick/train/head/dctr": {"docs": {"_ref": "tripclick"}, "queries": {"_ref": "tripclick/train/head"}, "qrels": {"count": 128420, "fields": {"relevance": {"counts_by_value": {"3": 3684, "2": 7844, "1": 24139, "0": 92753}}}}}, "tripclick/train/hofstaetter-triples": {"docs": {"_ref": "tripclick"}, "queries": {"_ref": "tripclick/train"}, "qrels": {"_ref": "tripclick/train"}, "docpairs": {"count": 10000000}}, "tripclick/train/tail": {"docs": {"_ref": "tripclick"}, "queries": {"count": 576156}, "qrels": {"count": 1621493, "fields": {"relevance": {"counts_by_value": {"1": 758678, "0": 862815}}}}}, "tripclick/train/torso": {"docs": {"_ref": "tripclick"}, "queries": {"count": 105964}, "qrels": {"count": 966898, "fields": {"relevance": {"counts_by_value": {"1": 424820, "0": 542078}}}}}, "tripclick/val": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "qrels": {"count": 82409, "fields": {"relevance": {"counts_by_value": {"1": 40083, "0": 42326}}}}, "scoreddocs": {"count": 3503310}}, "tripclick/val/head": {"docs": {"_ref": "tripclick"}, "queries": {"count": 1175}, "qrels": {"count": 64364, "fields": {"relevance": {"counts_by_value": {"1": 32067, "0": 32297}}}}, "scoreddocs": {"count": 1166804}}, "tripclick/val/head/dctr": {"docs": {"_ref": 
"tripclick"}, "queries": {"_ref": "tripclick/val/head"}, "qrels": {"count": 66812, "fields": {"relevance": {"counts_by_value": {"2": 3974, "1": 13936, "0": 46936, "3": 1966}}}}, "scoreddocs": {"_ref": "tripclick/val/head"}}, "tripclick/val/tail": {"docs": {"_ref": "tripclick"}, "queries": {"count": 1175}, "qrels": {"count": 3912, "fields": {"relevance": {"counts_by_value": {"1": 1814, "0": 2098}}}}, "scoreddocs": {"count": 1166192}}, "tripclick/val/torso": {"docs": {"_ref": "tripclick"}, "queries": {"count": 1175}, "qrels": {"count": 14133, "fields": {"relevance": {"counts_by_value": {"1": 6202, "0": 7931}}}}, "scoreddocs": {"count": 1170314}}, "tweets2013-ia": {"docs": {"count": 252713133, "fields": {"doc_id": {"max_len": 18, "common_prefix": ""}}}}, "tweets2013-ia/trec-mb-2013": {"docs": {"_ref": "tweets2013-ia"}, "queries": {"count": 60}, "qrels": {"count": 71279, "fields": {"relevance": {"counts_by_value": {"0": 62268, "1": 5856, "2": 3155}}}}}, "tweets2013-ia/trec-mb-2014": {"docs": {"_ref": "tweets2013-ia"}, "queries": {"count": 55}, "qrels": {"count": 57985, "fields": {"relevance": {"counts_by_value": {"0": 47340, "2": 5892, "1": 4753}}}}}, "vaswani": {"docs": {"count": 11429, "fields": {"doc_id": {"max_len": 5, "common_prefix": ""}}}, "queries": {"count": 93}, "qrels": {"count": 2083, "fields": {"relevance": {"counts_by_value": {"1": 2083}}}}}, "wapo": {}, "wapo/v2": {"docs": {"count": 595037, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "wapo/v2/trec-core-2018": {"docs": {"_ref": "wapo/v2"}, "queries": {"count": 50}, "qrels": {"count": 26233, "fields": {"relevance": {"counts_by_value": {"0": 22285, "2": 1865, "1": 2083}}}}}, "wapo/v2/trec-news-2018": {"docs": {"_ref": "wapo/v2"}, "queries": {"count": 50}, "qrels": {"count": 8508, "fields": {"relevance": {"counts_by_value": {"16": 106, "2": 1189, "0": 6465, "4": 584, "8": 164}}}}}, "wapo/v2/trec-news-2019": {"docs": {"_ref": "wapo/v2"}, "queries": {"count": 60}, "qrels": {"count": 15655, 
"fields": {"relevance": {"counts_by_value": {"2": 1677, "0": 12614, "8": 431, "16": 273, "4": 660}}}}}, "wapo/v3/trec-news-2020": {"queries": {"count": 50}, "qrels": {"count": 17764, "fields": {"relevance": {"counts_by_value": {"0": 15348, "2": 1603, "4": 631, "8": 132, "16": 50}}}}}, "wapo/v4": {"docs": {"count": 728626, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}}, "wikiclir": {}, "wikiclir/ar": {"docs": {"count": 535118, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 324489}, "qrels": {"count": 519269, "fields": {"relevance": {"counts_by_value": {"2": 324475, "1": 194794}}}}}, "wikiclir/ca": {"docs": {"count": 548722, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 339586}, "qrels": {"count": 965233, "fields": {"relevance": {"counts_by_value": {"2": 339562, "1": 625671}}}}}, "wikiclir/cs": {"docs": {"count": 386906, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 233553}, "qrels": {"count": 954370, "fields": {"relevance": {"counts_by_value": {"2": 233535, "1": 720835}}}}}, "wikiclir/de": {"docs": {"count": 2091278, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 938217}, "qrels": {"count": 5550454, "fields": {"relevance": {"counts_by_value": {"2": 938194, "1": 4612260}}}}}, "wikiclir/en-simple": {"docs": {"count": 127089, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 114572}, "qrels": {"count": 250380, "fields": {"relevance": {"counts_by_value": {"2": 114564, "1": 135816}}}}}, "wikiclir/es": {"docs": {"count": 1302958, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 781642}, "qrels": {"count": 2894807, "fields": {"relevance": {"counts_by_value": {"2": 781376, "1": 2113431}}}}}, "wikiclir/fi": {"docs": {"count": 418677, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 273819}, "qrels": {"count": 939613, "fields": 
{"relevance": {"counts_by_value": {"2": 273796, "1": 665817}}}}}, "wikiclir/fr": {"docs": {"count": 1894397, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 1089179}, "qrels": {"count": 5137366, "fields": {"relevance": {"counts_by_value": {"2": 1089052, "1": 4048314}}}}}, "wikiclir/it": {"docs": {"count": 1347011, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 808605}, "qrels": {"count": 3443633, "fields": {"relevance": {"counts_by_value": {"2": 808345, "1": 2635288}}}}}, "wikiclir/ja": {"docs": {"count": 1071292, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 426431}, "qrels": {"count": 3338667, "fields": {"relevance": {"counts_by_value": {"2": 426383, "1": 2912284}}}}}, "wikiclir/ko": {"docs": {"count": 394177, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 224855}, "qrels": {"count": 568205, "fields": {"relevance": {"counts_by_value": {"2": 224843, "1": 343362}}}}}, "wikiclir/nl": {"docs": {"count": 1908260, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 687718}, "qrels": {"count": 2334644, "fields": {"relevance": {"counts_by_value": {"2": 687672, "1": 1646972}}}}}, "wikiclir/nn": {"docs": {"count": 133290, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 99493}, "qrels": {"count": 250141, "fields": {"relevance": {"counts_by_value": {"2": 99465, "1": 150676}}}}}, "wikiclir/no": {"docs": {"count": 471420, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 299897}, "qrels": {"count": 963514, "fields": {"relevance": {"counts_by_value": {"2": 299831, "1": 663683}}}}}, "wikiclir/pl": {"docs": {"count": 1234316, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 693656}, "qrels": {"count": 2471360, "fields": {"relevance": {"counts_by_value": {"2": 693604, "1": 1777756}}}}}, "wikiclir/pt": {"docs": {"count": 
973057, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 611732}, "qrels": {"count": 1741889, "fields": {"relevance": {"counts_by_value": {"2": 611643, "1": 1130246}}}}}, "wikiclir/ro": {"docs": {"count": 376655, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 199264}, "qrels": {"count": 451180, "fields": {"relevance": {"counts_by_value": {"2": 199253, "1": 251927}}}}}, "wikiclir/ru": {"docs": {"count": 1413945, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 664924}, "qrels": {"count": 2321384, "fields": {"relevance": {"counts_by_value": {"2": 664780, "1": 1656604}}}}}, "wikiclir/sv": {"docs": {"count": 3785412, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 639073}, "qrels": {"count": 2069453, "fields": {"relevance": {"counts_by_value": {"2": 638829, "1": 1430624}}}}}, "wikiclir/sw": {"docs": {"count": 37079, "fields": {"doc_id": {"max_len": 5, "common_prefix": ""}}}, "queries": {"count": 22860}, "qrels": {"count": 57924, "fields": {"relevance": {"counts_by_value": {"2": 22859, "1": 35065}}}}}, "wikiclir/tl": {"docs": {"count": 79008, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 48930}, "qrels": {"count": 72359, "fields": {"relevance": {"counts_by_value": {"2": 48928, "1": 23431}}}}}, "wikiclir/tr": {"docs": {"count": 295593, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 185388}, "qrels": {"count": 380651, "fields": {"relevance": {"counts_by_value": {"2": 185360, "1": 195291}}}}}, "wikiclir/uk": {"docs": {"count": 704903, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 348222}, "qrels": {"count": 913358, "fields": {"relevance": {"counts_by_value": {"2": 348168, "1": 565190}}}}}, "wikiclir/vi": {"docs": {"count": 1392152, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 354312}, "qrels": {"count": 
611355, "fields": {"relevance": {"counts_by_value": {"2": 354279, "1": 257076}}}}}, "wikiclir/zh": {"docs": {"count": 951480, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}, "queries": {"count": 463273}, "qrels": {"count": 926130, "fields": {"relevance": {"counts_by_value": {"2": 463209, "1": 462921}}}}}, "wikir": {}, "wikir/en1k": {"docs": {"count": 369721, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "wikir/en1k/test": {"docs": {"_ref": "wikir/en1k"}, "queries": {"count": 100}, "qrels": {"count": 4435, "fields": {"relevance": {"counts_by_value": {"2": 100, "1": 4335}}}}, "scoreddocs": {"count": 10000}}, "wikir/en1k/training": {"docs": {"_ref": "wikir/en1k"}, "queries": {"count": 1444}, "qrels": {"count": 47699, "fields": {"relevance": {"counts_by_value": {"2": 1444, "1": 46255}}}}, "scoreddocs": {"count": 144400}}, "wikir/en1k/validation": {"docs": {"_ref": "wikir/en1k"}, "queries": {"count": 100}, "qrels": {"count": 4979, "fields": {"relevance": {"counts_by_value": {"2": 100, "1": 4879}}}}, "scoreddocs": {"count": 10000}}, "wikir/en59k": {"docs": {"count": 2454785, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "wikir/en59k/test": {"docs": {"_ref": "wikir/en59k"}, "queries": {"count": 1000}, "qrels": {"count": 104715, "fields": {"relevance": {"counts_by_value": {"2": 1000, "1": 103715}}}}, "scoreddocs": {"count": 100000}}, "wikir/en59k/training": {"docs": {"_ref": "wikir/en59k"}, "queries": {"count": 57251}, "qrels": {"count": 2443383, "fields": {"relevance": {"counts_by_value": {"2": 57251, "1": 2386132}}}}, "scoreddocs": {"count": 5725100}}, "wikir/en59k/validation": {"docs": {"_ref": "wikir/en59k"}, "queries": {"count": 1000}, "qrels": {"count": 68905, "fields": {"relevance": {"counts_by_value": {"2": 1000, "1": 67905}}}}, "scoreddocs": {"count": 100000}}, "wikir/en78k": {"docs": {"count": 2456637, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "wikir/en78k/test": {"docs": {"_ref": "wikir/en78k"}, 
"queries": {"count": 7862}, "qrels": {"count": 353060, "fields": {"relevance": {"counts_by_value": {"2": 7862, "1": 345198}}}}, "scoreddocs": {"count": 785600}}, "wikir/en78k/training": {"docs": {"_ref": "wikir/en78k"}, "queries": {"count": 62904}, "qrels": {"count": 2435257, "fields": {"relevance": {"counts_by_value": {"2": 62904, "1": 2372353}}}}, "scoreddocs": {"count": 6284800}}, "wikir/en78k/validation": {"docs": {"_ref": "wikir/en78k"}, "queries": {"count": 7862}, "qrels": {"count": 271874, "fields": {"relevance": {"counts_by_value": {"2": 7862, "1": 264012}}}}, "scoreddocs": {"count": 785700}}, "wikir/ens78k": {"docs": {"count": 2456637, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}}, "wikir/ens78k/test": {"docs": {"_ref": "wikir/ens78k"}, "queries": {"count": 7862}, "qrels": {"count": 353060, "fields": {"relevance": {"counts_by_value": {"2": 7862, "1": 345198}}}}, "scoreddocs": {"count": 786100}}, "wikir/ens78k/training": {"docs": {"_ref": "wikir/ens78k"}, "queries": {"count": 62904}, "qrels": {"count": 2435257, "fields": {"relevance": {"counts_by_value": {"2": 62904, "1": 2372353}}}}, "scoreddocs": {"count": 6289800}}, "wikir/ens78k/validation": {"docs": {"_ref": "wikir/ens78k"}, "queries": {"count": 7862}, "qrels": {"count": 271874, "fields": {"relevance": {"counts_by_value": {"2": 7862, "1": 264012}}}}, "scoreddocs": {"count": 786100}}, "wikir/es13k": {"docs": {"count": 645901, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "wikir/es13k/test": {"docs": {"_ref": "wikir/es13k"}, "queries": {"count": 1300}, "qrels": {"count": 71339, "fields": {"relevance": {"counts_by_value": {"2": 1300, "1": 70039}}}}, "scoreddocs": {"count": 130000}}, "wikir/es13k/training": {"docs": {"_ref": "wikir/es13k"}, "queries": {"count": 11202}, "qrels": {"count": 477212, "fields": {"relevance": {"counts_by_value": {"2": 11202, "1": 466010}}}}, "scoreddocs": {"count": 1120200}}, "wikir/es13k/validation": {"docs": {"_ref": "wikir/es13k"}, "queries": 
{"count": 1300}, "qrels": {"count": 58757, "fields": {"relevance": {"counts_by_value": {"2": 1300, "1": 57457}}}}, "scoreddocs": {"count": 130000}}, "wikir/fr14k": {"docs": {"count": 736616, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "wikir/fr14k/test": {"docs": {"_ref": "wikir/fr14k"}, "queries": {"count": 1400}, "qrels": {"count": 55647, "fields": {"relevance": {"counts_by_value": {"2": 1400, "1": 54247}}}}, "scoreddocs": {"count": 140000}}, "wikir/fr14k/training": {"docs": {"_ref": "wikir/fr14k"}, "queries": {"count": 11341}, "qrels": {"count": 609240, "fields": {"relevance": {"counts_by_value": {"2": 11341, "1": 597899}}}}, "scoreddocs": {"count": 1134100}}, "wikir/fr14k/validation": {"docs": {"_ref": "wikir/fr14k"}, "queries": {"count": 1400}, "qrels": {"count": 81255, "fields": {"relevance": {"counts_by_value": {"2": 1400, "1": 79855}}}}, "scoreddocs": {"count": 140000}}, "wikir/it16k": {"docs": {"count": 503012, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}}, "wikir/it16k/test": {"docs": {"_ref": "wikir/it16k"}, "queries": {"count": 1600}, "qrels": {"count": 49338, "fields": {"relevance": {"counts_by_value": {"2": 1600, "1": 47738}}}}, "scoreddocs": {"count": 160000}}, "wikir/it16k/training": {"docs": {"_ref": "wikir/it16k"}, "queries": {"count": 13418}, "qrels": {"count": 381920, "fields": {"relevance": {"counts_by_value": {"2": 13418, "1": 368502}}}}, "scoreddocs": {"count": 1341800}}, "wikir/it16k/validation": {"docs": {"_ref": "wikir/it16k"}, "queries": {"count": 1600}, "qrels": {"count": 45003, "fields": {"relevance": {"counts_by_value": {"2": 1600, "1": 43403}}}}, "scoreddocs": {"count": 160000}} } ================================================ FILE: ir_datasets/formats/__init__.py ================================================ from .base import GenericDoc, GenericQuery, GenericQrel, GenericScoredDoc, GenericDocPair, DocstoreBackedDocs, DocSourceSeekableIter, DocSource, SourceDocIter, BaseQlogs from .base import 
BaseDocs, BaseQueries, BaseQrels, BaseScoredDocs, BaseDocPairs from .csv_fmt import CsvDocs, CsvQueries, CsvDocPairs from .tsv import TsvDocs, TsvQueries, TsvDocPairs from .trec import TrecDocs, TrecQueries, TrecXmlQueries, TrecColonQueries, TrecQrels, TrecSubQrels, TrecPrels, TrecScoredDocs, TrecDoc, TitleUrlTextDoc, TrecQuery, TrecSubtopic, TrecQrel, TrecSubQrel, TrecPrel, TrecParsedDoc from .touche import ToucheQuery, ToucheTitleQuery, ToucheComparativeQuery, ToucheQualityQrel, ToucheQualityCoherenceQrel, ToucheComparativeStance, ToucheQualityComparativeStanceQrel, ToucheControversialStance, ToucheControversialStanceQrel, TouchePassageDoc, ToucheQueries, ToucheTitleQueries, ToucheComparativeQueries, ToucheQrels, ToucheQualityQrels, ToucheQualityCoherenceQrels, ToucheQualityComparativeStanceQrels, ToucheControversialStanceQrels, TouchePassageDocs from .touche_image import ToucheImageRanking, ToucheImageNode, ToucheImagePage, ToucheImageDoc, ToucheImageDocs from .webarc import WarcDocs, WarcDoc from .ntcir import NtcirQrels from .clirmatrix import CLIRMatrixQueries, CLIRMatrixQrels from .argsme import ArgsMeStance, ArgsMeMode, ArgsMeSourceDomain, ArgsMePremiseAnnotation, ArgsMePremise, ArgsMeAspect, ArgsMeDoc, ArgsMeProcessedDoc, ArgsMeDocs, ArgsMeProcessedDocs, ArgsMeCombinedDocs from .extracted_cc import ExctractedCCDoc, ExctractedCCDocs, ExctractedCCQuery, ExctractedCCQueries, ExctractedCCNoReportQuery, ExctractedCCNoReportNoHtNarQuery, ExctractedCCMultiMtQuery from .jsonl import JsonlDocs, JsonlQueries ================================================ FILE: ir_datasets/formats/argsme.py ================================================ from ast import literal_eval from csv import DictReader, field_size_limit from datetime import datetime from enum import Enum from io import TextIOWrapper from pathlib import Path from re import sub from sys import maxsize from typing import NamedTuple, List, Optional from ir_datasets import lazy_libs from ir_datasets.formats 
# ---------------------------------------------------------------------------
# ir_datasets/formats/argsme.py (continued): data classes for the args.me
# argument corpora.  NOTE(review): this chunk is a flattened dump; the first
# three lines below complete the import block whose `from ir_datasets.formats`
# clause begins on the previous dump line.
# ---------------------------------------------------------------------------
import BaseDocs
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.util import Cache, use_docstore


class ArgsMeStance(Enum):
    """
    Stance of an argument premise towards its conclusion: PRO or CON.

    See the corresponding Java source files from the args.me project:
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/argument/Stance.java
    """
    PRO = 1
    CON = 2

    @staticmethod
    def from_json(json: str) -> "ArgsMeStance":
        # `json` is the raw stance string exactly as stored in the corpus.
        if json == "PRO":
            return ArgsMeStance.PRO
        elif json == "CON":
            return ArgsMeStance.CON
        else:
            raise ValueError(f"Unknown stance {json}")


class ArgsMeMode(Enum):
    """Mode of an args.me source: a single person or a discussion."""
    person = 1
    discussion = 2

    @staticmethod
    def from_json(json: str) -> "ArgsMeMode":
        # `json` is the raw mode string exactly as stored in the corpus.
        if json == "person":
            return ArgsMeMode.person
        elif json == "discussion":
            return ArgsMeMode.discussion
        else:
            raise ValueError(f"Unknown mode {json}")


class ArgsMeSourceDomain(Enum):
    """Debate portal an argument was collected from."""
    debateorg = 1
    debatepedia = 2
    debatewise = 3
    idebate = 4
    canadian_parliament = 5

    @staticmethod
    def from_json(json: str) -> "ArgsMeSourceDomain":
        # The corpus spellings ("debate.org", "canadian-parliament") are not
        # valid Python identifiers, hence the explicit mapping instead of
        # a plain ArgsMeSourceDomain[json] lookup.
        if json == "debate.org":
            return ArgsMeSourceDomain.debateorg
        elif json == "debatepedia":
            return ArgsMeSourceDomain.debatepedia
        elif json == "debatewise":
            return ArgsMeSourceDomain.debatewise
        elif json == "idebate":
            return ArgsMeSourceDomain.idebate
        elif json == "canadian-parliament":
            return ArgsMeSourceDomain.canadian_parliament
        else:
            raise ValueError(f"Unknown source domain {json}")


class ArgsMePremiseAnnotation(NamedTuple):
    """
    A character-offset annotation attached to a premise text.

    See the corresponding Java source files from the args.me project:
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/argument/PremiseAnnotation.java
    """
    start: int  # start offset into the premise text
    end: int  # end offset into the premise text
    source: str

    @staticmethod
    def from_json(json: dict) -> "ArgsMePremiseAnnotation":
        start = int(json["start"])
        end = int(json["end"])
        source = str(json["source"])
        return ArgsMePremiseAnnotation(start, end, source)


class ArgsMePremise(NamedTuple):
    """
    A single premise of an argument, with its stance and annotations.

    See the corresponding Java source files from the args.me project:
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/argument/Premise.java
    """
    text: str
    stance: ArgsMeStance
    annotations: List[ArgsMePremiseAnnotation]

    @staticmethod
    def from_json(json: dict) -> "ArgsMePremise":
        return ArgsMePremise(
            str(json["text"]),
            ArgsMeStance.from_json(json["stance"]),
            # "annotations" is optional in the corpus; default to [].
            [
                ArgsMePremiseAnnotation.from_json(annotation)
                for annotation in json["annotations"]
            ] if "annotations" in json else [],
        )


class ArgsMeAspect(NamedTuple):
    """A weighted, ranked aspect attached to an argument."""
    name: str
    weight: float
    normalized_weight: float
    rank: int

    @staticmethod
    def from_json(json: dict) -> "ArgsMeAspect":
        name = str(json["name"])
        weight = float(json["weight"])
        # Note the camelCase key in the source JSON.
        normalized_weight = float(json["normalizedWeight"])
        rank = int(json["rank"])
        return ArgsMeAspect(name, weight, normalized_weight, rank)


class ArgsMeSentence(NamedTuple):
    """A pre-segmented sentence of a processed args.me document."""
    id: str
    text: str

    @staticmethod
    def from_json(json: dict) -> "ArgsMeSentence":
        return ArgsMeSentence(
            str(json["sent_id"]),
            str(json["sent_text"]),
        )


class ArgsMeDoc(NamedTuple):
    """
    A full args.me argument document (conclusion, premises, source metadata).

    See the corresponding Java source files from the args.me project:
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/Argument.java
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/argument/Premise.java
    """
    doc_id: str
    conclusion: str
    premises: List[ArgsMePremise]
    premises_texts: str  # Premises texts concatenated with spaces.
    aspects: List[ArgsMeAspect]
    aspects_names: str  # Aspects names concatenated with spaces.
    source_id: str
    source_title: str
    source_url: Optional[str]
    source_previous_argument_id: Optional[str]
    source_next_argument_id: Optional[str]
    source_domain: Optional[ArgsMeSourceDomain]
    source_text: Optional[str]
    source_text_conclusion_start: Optional[int]
    source_text_conclusion_end: Optional[int]
    source_text_premise_start: Optional[int]
    source_text_premise_end: Optional[int]
    topic: str  # Topic or discussion title.
# --- ArgsMeDoc (continued): remaining fields, default_text, from_json ---
    acquisition: datetime  # parsed from the "acquisitionTime" context key
    date: Optional[datetime]
    author: Optional[str]
    author_image_url: Optional[str]
    author_organization: Optional[str]
    author_role: Optional[str]
    mode: Optional[ArgsMeMode]

    def default_text(self):
        """
        premises + conclusion
        """
        return f"{self.premises_texts} {self.conclusion}"

    @staticmethod
    def from_json(json: dict) -> "ArgsMeDoc":
        # Most metadata lives in the nested "context" object, and nearly all
        # of its keys are optional -- hence the many presence checks below.
        context_json = json["context"]
        doc_id = str(json["id"])
        conclusion = str(json["conclusion"])
        premises = [
            ArgsMePremise.from_json(premise)
            for premise in json["premises"]
        ]
        premises_texts = " ".join(premise.text for premise in premises)
        aspects = [
            ArgsMeAspect.from_json(aspect)
            for aspect in context_json["aspects"]
        ] if "aspects" in context_json else []
        aspects_names = " ".join(aspect.name for aspect in aspects)
        source_id = str(context_json["sourceId"])
        source_title = str(context_json["sourceTitle"])
        source_url = (
            str(context_json["sourceUrl"])
            if "sourceUrl" in context_json
            else None
        )
        # Previous/next ids are additionally checked for truthiness, so falsy
        # values (e.g. empty strings) in the corpus are normalized to None.
        source_previous_argument_id = (
            str(context_json["previousArgumentInSourceId"])
            if ("previousArgumentInSourceId" in context_json
                and context_json["previousArgumentInSourceId"])
            else None
        )
        source_next_argument_id = (
            str(context_json["nextArgumentInSourceId"])
            if ("nextArgumentInSourceId" in context_json
                and context_json["nextArgumentInSourceId"])
            else None
        )
        source_domain = (
            ArgsMeSourceDomain.from_json(context_json["sourceDomain"])
            if "sourceDomain" in context_json
            else None
        )
        source_text = (
            str(context_json["sourceText"])
            if "sourceText" in context_json
            else None
        )
        source_text_conclusion_start = (
            int(context_json["sourceTextConclusionStart"])
            if "sourceTextConclusionStart" in context_json
            else None
        )
        source_text_conclusion_end = (
            int(context_json["sourceTextConclusionEnd"])
            if "sourceTextConclusionEnd" in context_json
            else None
        )
        source_text_premise_start = (
            int(context_json["sourceTextPremiseStart"])
            if "sourceTextPremiseStart" in context_json
            else None
        )
        source_text_premise_end = (
            int(context_json["sourceTextPremiseEnd"])
            if "sourceTextPremiseEnd" in context_json
            else None
        )
        # Falls back to "discussionTitle" when "topic" is absent (presumably
        # a corpus-version difference -- TODO confirm).
        topic = (
            str(context_json["topic"])
            if "topic" in context_json
            else str(context_json["discussionTitle"])
        )
        # Timestamps carry a trailing "Z"; rewrite it to "+00:00" so that
        # datetime.fromisoformat can parse them (pre-3.11 fromisoformat does
        # not accept the "Z" suffix).
        acquisition = datetime.fromisoformat(
            sub(r"Z$", "+00:00", context_json["acquisitionTime"])
        )
        date = (
            datetime.fromisoformat(
                sub(r"Z$", "+00:00", context_json["date"])
            )
            if "date" in context_json
            else None
        )
        author = (
            str(context_json["author"])
            if "author" in context_json
            else None
        )
        author_image_url = (
            str(context_json["authorImage"])
            if "authorImage" in context_json
            else None
        )
        author_organization = (
            str(context_json["authorOrganization"])
            if "authorOrganization" in context_json
            else None
        )
        author_role = (
            str(context_json["authorRole"])
            if "authorRole" in context_json
            else None
        )
        mode = (
            ArgsMeMode.from_json(context_json["mode"])
            if "mode" in context_json
            else None
        )
        return ArgsMeDoc(
            doc_id=doc_id,
            conclusion=conclusion,
            premises=premises,
            premises_texts=premises_texts,
            aspects=aspects,
            aspects_names=aspects_names,
            source_id=source_id,
            source_title=source_title,
            source_url=source_url,
            source_previous_argument_id=source_previous_argument_id,
            source_next_argument_id=source_next_argument_id,
            source_domain=source_domain,
            source_text=source_text,
            source_text_conclusion_start=source_text_conclusion_start,
            source_text_conclusion_end=source_text_conclusion_end,
            source_text_premise_start=source_text_premise_start,
            source_text_premise_end=source_text_premise_end,
            topic=topic,
            acquisition=acquisition,
            date=date,
            author=author,
            author_image_url=author_image_url,
            author_organization=author_organization,
            author_role=author_role,
            mode=mode,
        )


class ArgsMeProcessedDoc(NamedTuple):
    """
    An args.me argument from the processed (sentence-segmented) CSV dumps;
    same fields as ArgsMeDoc plus the pre-segmented sentences.

    See the corresponding Java source files from the args.me project:
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/Argument.java
    https://git.webis.de/code-research/arguana/args/args-framework/-/blob/master/src/main/java/me/args/argument/Premise.java
    """
    doc_id: str
    conclusion: str
premises: List[ArgsMePremise] premises_texts: str # Premises texts concatenated with spaces. aspects: List[ArgsMeAspect] aspects_names: str # Aspects namews concatenated with spaces. source_id: str source_title: str source_url: Optional[str] source_previous_argument_id: Optional[str] source_next_argument_id: Optional[str] source_domain: Optional[ArgsMeSourceDomain] source_text: Optional[str] source_text_conclusion_start: Optional[int] source_text_conclusion_end: Optional[int] source_text_premise_start: Optional[int] source_text_premise_end: Optional[int] topic: str # Topic or discussion title. acquisition: datetime date: Optional[datetime] author: Optional[str] author_image_url: Optional[str] author_organization: Optional[str] author_role: Optional[str] mode: Optional[ArgsMeMode] sentences: List[ArgsMeSentence] @staticmethod def from_csv(csv: dict) -> "ArgsMeProcessedDoc": csv["premises"] = literal_eval(csv["premises"]) csv["context"] = literal_eval(csv["context"]) doc = ArgsMeDoc.from_json(csv) sentences = [ ArgsMeSentence.from_json(json) for json in literal_eval(csv["sentences"]) ] return ArgsMeProcessedDoc( *doc, sentences=sentences, ) class ArgsMeDocs(BaseDocs): _source: Cache _namespace: Optional[str] _language: Optional[str] _count_hint: Optional[int] def __init__( self, cache: Cache, namespace: Optional[str] = None, language: Optional[str] = None, count_hint: Optional[int] = None, ): self._source = cache self._namespace = namespace self._language = language self._count_hint = count_hint def docs_path(self): return self._source.path() @use_docstore def docs_iter(self): ijson = lazy_libs.ijson() with self._source.stream() as json_stream: argument_jsons = ijson.items(json_stream, "arguments.item") for argument_json in argument_jsons: argument = ArgsMeDoc.from_json(argument_json) yield argument def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f"{self.docs_path()}.pklz4", init_iter_fn=self.docs_iter, 
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=["doc_id"],
            count_hint=self._count_hint,
            options=options
        )

    def docs_count(self):
        return self._count_hint

    def docs_cls(self):
        return ArgsMeDoc

    def docs_namespace(self):
        return self._namespace

    def docs_lang(self):
        return self._language


class ArgsMeProcessedDocs(BaseDocs):
    # Documents parsed from the pre-processed args.me CSV export (adds
    # sentence annotations on top of the raw JSON fields).
    _source: Cache
    _namespace: Optional[str]
    _language: Optional[str]
    _count_hint: Optional[int]

    def __init__(
        self,
        cache: Cache,
        namespace: Optional[str] = None,
        language: Optional[str] = None,
        count_hint: Optional[int] = None,
    ):
        self._source = cache
        self._namespace = namespace
        self._language = language
        self._count_hint = count_hint

    def docs_path(self):
        return self._source.path()

    @use_docstore
    def docs_iter(self):
        with self._source.stream() as csv_stream:
            lines = TextIOWrapper(csv_stream)
            # Some cells (e.g., full source texts) exceed the default CSV
            # field size limit, so lift it before parsing.
            field_size_limit(maxsize)
            reader = DictReader(lines)
            for argument_csv in reader:
                argument = ArgsMeProcessedDoc.from_csv(argument_csv)
                yield argument

    def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f"{self.docs_path()}.pklz4",
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=["doc_id"],
            count_hint=self._count_hint,
            options=options
        )

    def docs_count(self):
        return self._count_hint

    def docs_cls(self):
        return ArgsMeProcessedDoc

    def docs_namespace(self):
        return self._namespace

    def docs_lang(self):
        return self._language


class ArgsMeCombinedDocs(BaseDocs):
    # Concatenation of several ArgsMeDocs collections into one corpus.
    _path: Path
    _sources: List[ArgsMeDocs]
    _namespace: Optional[str]
    _language: Optional[str]
    _count_hint: Optional[int]

    def __init__(
        self,
        path: Path,
        sources: List[ArgsMeDocs],
        namespace: Optional[str] = None,
        language: Optional[str] = None,
        count_hint: Optional[int] = None,
    ):
        self._path = path
        self._sources = sources
        self._namespace = namespace
        self._language = language
        self._count_hint = count_hint

    def docs_path(self):
        return self._path

    @use_docstore
    def docs_iter(self):
        for source in self._sources:
            for argument in source.docs_iter():
                yield argument

    def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f"{self.docs_path()}.pklz4",
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=["doc_id"],
            count_hint=self._count_hint,
            options=options
        )

    def docs_count(self):
        # Sanity-check that the configured hint matches the sum of sources.
        assert (sum(
            source.docs_count()
            for source in self._sources
        ) == self._count_hint)
        return self._count_hint

    def docs_cls(self):
        assert (all(
            source.docs_cls() == ArgsMeDoc
            for source in self._sources
        ))
        return ArgsMeDoc

    def docs_namespace(self):
        # NOTE(review): this uses substring containment (`in`), not equality —
        # presumably each source namespace is contained in the combined
        # namespace string; confirm before changing to `==`.
        assert (all(
            source.docs_namespace() in self._namespace
            for source in self._sources
        ))
        return self._namespace

    def docs_lang(self):
        assert (all(
            source.docs_lang() == self._language
            for source in self._sources
        ))
        return self._language



================================================
FILE: ir_datasets/formats/base.py
================================================
import hashlib
import json
import types
import itertools
from typing import NamedTuple

import ir_datasets


_logger = ir_datasets.log.easy()


class GenericDoc(NamedTuple):
    doc_id: str
    text: str

    def default_text(self):
        return self.text


class GenericQuery(NamedTuple):
    query_id: str
    text: str

    def default_text(self):
        return self.text


class GenericQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int


class GenericScoredDoc(NamedTuple):
    query_id: str
    doc_id: str
    score: float


class GenericDocPair(NamedTuple):
    query_id: str
    doc_id_a: str
    doc_id_b: str


class BaseDocs:
    # Extension methods registered in EXTENSIONS are resolved dynamically via
    # __getattr__ and bound to the instance on access.
    PREFIX = 'docs_'
    EXTENSIONS = {}

    def __getattr__(self, attr):
        if attr.startswith(self.PREFIX) and attr in self.EXTENSIONS:
            # Return method bound to this instance
            return types.MethodType(self.EXTENSIONS[attr], self)
        raise AttributeError(attr)

    def docs_iter(self):
        raise NotImplementedError()

    def docs_count(self):
        raise NotImplementedError()

    def docs_handler(self):
        return self

    def docs_cls(self):
        return GenericDoc

    def docs_namespace(self):
        return None  # No namespace defined

    def docs_lang(self):
        return None  # ISO 639-1 language code, or None for multiple/other/unknown


class BaseQueries:
    PREFIX = 'queries_'
    EXTENSIONS = {}

    def __getattr__(self, attr):
        if attr.startswith(self.PREFIX) and attr in self.EXTENSIONS:
            # Return method bound to this instance
            return types.MethodType(self.EXTENSIONS[attr], self)
        raise AttributeError(attr)

    def queries_iter(self):
        raise NotImplementedError()

    def queries_handler(self):
        return self

    def queries_cls(self):
        return GenericQuery

    def queries_namespace(self):
        return None  # No namespace defined

    def queries_lang(self):
        return None  # ISO 639-1 language code, or None for multiple/other/unknown


class BaseQrels:
    PREFIX = 'qrels_'
    EXTENSIONS = {}

    def __getattr__(self, attr):
        if attr.startswith(self.PREFIX) and attr in self.EXTENSIONS:
            # Return method bound to this instance
            return types.MethodType(self.EXTENSIONS[attr], self)
        raise AttributeError(attr)

    def qrels_iter(self):
        raise NotImplementedError()

    def qrels_defs(self):
        raise NotImplementedError()

    def qrels_path(self):
        raise NotImplementedError()

    def qrels_cls(self):
        return GenericQrel

    def qrels_handler(self):
        return self


class BaseScoredDocs:
    PREFIX = 'scoreddocs_'
    EXTENSIONS = {}

    def __getattr__(self, attr):
        if attr.startswith(self.PREFIX) and attr in self.EXTENSIONS:
            # Return method bound to this instance
            return types.MethodType(self.EXTENSIONS[attr], self)
        raise AttributeError(attr)

    def scoreddocs_path(self):
        raise NotImplementedError()

    def scoreddocs_iter(self):
        raise NotImplementedError()

    def scoreddocs_cls(self):
        return GenericScoredDoc

    def scoreddocs_handler(self):
        return self


class BaseDocPairs:
    PREFIX = 'docpairs_'
    EXTENSIONS = {}

    def __getattr__(self, attr):
        if attr.startswith(self.PREFIX) and attr in self.EXTENSIONS:
            # Return method bound to this instance
            return types.MethodType(self.EXTENSIONS[attr], self)
        raise AttributeError(attr)

    def docpairs_path(self):
        raise NotImplementedError()

    def docpairs_iter(self):
        raise NotImplementedError()

    def docpairs_cls(self):
        return GenericDocPair

    def docpairs_handler(self):
        return self


class BaseQlogs:
    PREFIX = 'qlogs_'
    EXTENSIONS = {}

    def __getattr__(self, attr):
        if attr.startswith(self.PREFIX) and attr in self.EXTENSIONS:
            # Return method bound to this instance
            return types.MethodType(self.EXTENSIONS[attr], self)
        raise AttributeError(attr)

    def qlogs_iter(self):
        raise NotImplementedError()

    def qlogs_cls(self):
        raise NotImplementedError()

    def qlogs_count(self):
        raise NotImplementedError()

    def qlogs_handler(self):
        return self


BaseQueries.EXTENSIONS['queries_dict'] = lambda x: {q.query_id: q for q in x.queries_iter()}


def qrels_dict(qrels_handler):
    # Build {query_id: {doc_id: relevance}}; later duplicates overwrite
    # earlier ones for the same (query, doc) pair.
    result = {}
    for qrel in qrels_handler.qrels_iter():
        if qrel.query_id not in result:
            result[qrel.query_id] = {}
        result[qrel.query_id][qrel.doc_id] = qrel.relevance
    return result
BaseQrels.EXTENSIONS['qrels_dict'] = qrels_dict


def hasher(iter_fn, hashfn=hashlib.md5):
    # Returns a method that hashes every record of the named iterator by
    # JSON-serializing its (field, value) pairs — an order-sensitive digest
    # of the whole collection.
    def wrapped(self):
        h = hashfn()
        for record in getattr(self, iter_fn)():
            js = [[field, value] for field, value in zip(record._fields, record)]
            h.update(json.dumps(js).encode())
        return h.hexdigest()
    return wrapped
BaseDocs.EXTENSIONS['docs_hash'] = hasher('docs_iter')
BaseQueries.EXTENSIONS['queries_hash'] = hasher('queries_iter')
BaseQrels.EXTENSIONS['qrels_hash'] = hasher('qrels_iter')
BaseScoredDocs.EXTENSIONS['scoreddocs_hash'] = hasher('scoreddocs_iter')
BaseDocPairs.EXTENSIONS['docpairs_hash'] = hasher('docpairs_iter')


def _calc_metadata(iter_fn, metadata_fields=(), count_by_value_field=None):
    # Builds a metadata-computation method: record count, plus per-field
    # max byte length and longest common prefix, plus (optionally) a
    # histogram of one field's values.
    # NOTE(review): `hashfn` is accepted but unused in this wrapped function.
    def wrapped(self, verbose=True, hashfn=hashlib.sha256):
        count = 0
        it = getattr(self, iter_fn)()
        if verbose:
            it = _logger.pbar(it)
        field_lens = {f: 0 for f in metadata_fields}
        field_prefixes = {}
        count_by_field_values = {}
        for record in it:
            count += 1
            for f in metadata_fields:
                field = getattr(record, f)
                # Length in bytes (UTF-8), not characters.
                field_lens[f] = max(field_lens[f], len(field.encode()))
                if f not in field_prefixes:
                    field_prefixes[f] = field
                elif len(field_prefixes[f]) > 0:
                    # Shrink the running common prefix to what still matches.
                    field_prefixes[f] = ''.join(x[0] for x in itertools.takewhile(lambda x: x[0] == x[1], zip(field_prefixes[f], field)))
            if count_by_value_field is not None:
                count_by_value_field_value = getattr(record, count_by_value_field)
                if count_by_value_field_value not in count_by_field_values:
                    count_by_field_values[count_by_value_field_value] = 0
                count_by_field_values[count_by_value_field_value] += 1
        result = {'count': count}
        if metadata_fields:
            result['fields'] = {}
            for f in metadata_fields:
                result['fields'][f] = {
                    'max_len': field_lens[f],
                    'common_prefix': field_prefixes[f],
                }
        if count_by_value_field is not None:
            result.setdefault('fields', {}).setdefault(count_by_value_field, {})['counts_by_value'] = count_by_field_values
        return result
    return wrapped
BaseDocs.EXTENSIONS['docs_calc_metadata'] = _calc_metadata('docs_iter', ('doc_id',))
BaseQueries.EXTENSIONS['queries_calc_metadata'] = _calc_metadata('queries_iter')
BaseQrels.EXTENSIONS['qrels_calc_metadata'] = _calc_metadata('qrels_iter', count_by_value_field='relevance')
BaseScoredDocs.EXTENSIONS['scoreddocs_calc_metadata'] = _calc_metadata('scoreddocs_iter')
BaseDocPairs.EXTENSIONS['docpairs_calc_metadata'] = _calc_metadata('docpairs_iter')
BaseQlogs.EXTENSIONS['qlogs_calc_metadata'] = _calc_metadata('qlogs_iter')


class DocstoreBackedDocs(BaseDocs):
    """
    A Docs implementation that defers all operations to a pre-built docstore
    instance.
""" def __init__(self, docstore_lazy, docs_cls=GenericDoc, namespace=None, lang=None): self._docstore_lazy = docstore_lazy self._loaded_docstore = False self._docs_cls = docs_cls self._docs_namespace = namespace self._docs_lang = lang def docs_iter(self): return iter(self._docstore_lazy()) def docs_count(self): if self._loaded_docstore and self.docs_store().built(): return self.docs_store().count() def docs_cls(self): return self._docs_cls def docs_namespace(self): return self._docs_namespace def docs_lang(self): return self._docs_lang def docs_store(self): result = self._docstore_lazy() self._loaded_docstore = True return result class DocSourceSeekableIter: def __next__(self) -> NamedTuple: """ Returns the next document encountered """ raise NotImplementedError() def seek(self, pos): """ Seeks to the document as `index` pos within the source. """ raise NotImplementedError() def close(self): """ Performs any cleanup work when done with this iterator (e.g., close open files) """ pass def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __iter__(self): return self class DocSource: def __len__(self) -> int: """ Returns the number of documents in this source """ raise NotImplementedError() def __iter__(self) -> DocSourceSeekableIter: """ Returns a seekable iterator over this source """ raise NotImplementedError() class SourceDocIter: def __init__(self, docs, slice): self.docs = docs self.next_index = 0 self.slice = slice self.current_iter = None self.current_start_idx = 0 self.current_end_idx = 0 self.sources = docs.docs_source_iter() def __next__(self): if self.slice.start >= self.slice.stop: raise StopIteration if self.current_iter is None or self.current_end_idx <= self.slice.start: # First iteration or no docs remaining in this file if self.current_iter is not None: self.current_iter.close() self.current_iter = None # jump ahead to the file that contains the desired index first = True while first or self.current_end_idx < 
self.slice.start: source = next(self.sources) self.next_index = self.current_end_idx self.current_start_idx = self.current_end_idx self.current_end_idx = self.current_start_idx + len(source) first = False self.current_iter = iter(source) if self.next_index != self.slice.start: self.current_iter.seek(self.slice.start - self.current_start_idx) result = next(self.current_iter) self.next_index += 1 self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step) return result def close(self): if self.current_iter is not None: self.current_iter.close() self.current_iter = None def __iter__(self): return self def __del__(self): self.close() def __getitem__(self, key): if isinstance(key, slice): # it[start:stop:step] new_slice = ir_datasets.util.apply_sub_slice(self.slice, key) return SourceDocIter(self.docs, new_slice) elif isinstance(key, int): # it[index] new_slice = ir_datasets.util.slice_idx(self.slice, key) new_it = SourceDocIter(self.docs, new_slice) result = next(new_it, StopIteration) if result is StopIteration: raise IndexError((self.slice, slice(key, key+1), new_slice)) return result raise TypeError('key must be int or slice') ================================================ FILE: ir_datasets/formats/clirmatrix.py ================================================ import codecs import json from . 
import TrecQrels, TrecQrel from .base import GenericQuery, BaseQueries class CLIRMatrixQueries(BaseQueries): def __init__(self, streamer, query_lang): super().__init__() self._streamer = streamer self.query_lang = query_lang def queries_iter(self): with self._streamer.stream() as stream: f = codecs.getreader('utf-8')(stream) for line in f: if line == '\n': continue #ignore blank lines j = json.loads(line) qid = j["src_id"] query = j["src_query"] yield GenericQuery(qid, query) def queries_namespace(self): return NAME def queries_cls(self): return GenericQuery def queries_lang(self): return self.query_lang class CLIRMatrixQrels(TrecQrels): def qrels_iter(self): with self._qrels_dlc.stream() as f: f = codecs.getreader('utf8')(f) for line in f: if line == '\n': continue # ignore blank lines j = json.loads(line) qid = j["src_id"] for did, score in j["tgt_results"]: yield TrecQrel(qid, did, int(score), '0') ================================================ FILE: ir_datasets/formats/csv_fmt.py ================================================ import sys import codecs import contextlib import csv from typing import Tuple import io import ir_datasets from .base import GenericDoc, GenericQuery, GenericDocPair, BaseDocs, BaseQueries, BaseDocPairs from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS class _CsvBase: def __init__(self, dlc, cls, datatype): super().__init__() self._dlc = dlc self._cls = cls self._datatype = datatype def _path(self, force=True): return self._dlc.path(force) def _iter(self): csv.field_size_limit(sys.maxsize // 1000) field_count = len(self._cls._fields) with self._dlc.stream() as f: f = codecs.getreader('utf8')(f) f = iter(f) next(f) # skip header row for cols in csv.reader(f): assert len(cols) == field_count yield self._cls(*cols) class CsvDocs(_CsvBase, BaseDocs): def __init__(self, docs_dlc, doc_cls=GenericDoc, doc_store_index_fields=None, namespace=None, lang=None, count_hint=None, docstore_path=None): 
        super().__init__(docs_dlc, doc_cls, "docs")
        self._doc_store_index_fields = doc_store_index_fields
        self._docs_namespace = namespace
        self._docs_lang = lang
        self._count_hint = count_hint
        # Default docstore lives next to the source file (no download forced).
        self._docstore_path = docstore_path if docstore_path is not None else f'{self.docs_path(force=False)}.pklz4'

    def docs_path(self, force=True):
        return self._path(force)

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        return self._iter()

    def docs_cls(self):
        return self._cls

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        fields = (self._doc_store_index_fields or ['doc_id'])
        return PickleLz4FullStore(
            path=self._docstore_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=fields,
            count_hint=self._count_hint,
            options=options
        )

    def docs_namespace(self):
        return self._docs_namespace

    def docs_count(self):
        # Count is only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()
        return None

    def docs_lang(self):
        return self._docs_lang


class CsvQueries(_CsvBase, BaseQueries):
    def __init__(self, queries_dlc, query_cls=GenericQuery, namespace=None, lang=None):
        super().__init__(queries_dlc, query_cls, "queries")
        self._queries_namespace = namespace
        self._queries_lang = lang

    def queries_path(self):
        return self._path()

    def queries_iter(self):
        return self._iter()

    def queries_cls(self):
        return self._cls

    def queries_namespace(self):
        return self._queries_namespace

    def queries_lang(self):
        return self._queries_lang


class CsvDocPairs(_CsvBase, BaseDocPairs):
    def __init__(self, docpairs_dlc, docpair_cls=GenericDocPair):
        super().__init__(docpairs_dlc, docpair_cls, "docpairs")

    def docpairs_path(self):
        return self._path()

    def docpairs_iter(self):
        return self._iter()

    def docpairs_cls(self):
        return self._cls



================================================
FILE: ir_datasets/formats/extracted_cc.py
================================================
from typing import Dict, NamedTuple
import json
import ir_datasets
from ir_datasets.formats.base import BaseDocs, BaseQueries
from ir_datasets.indices.lz4_pickle import PickleLz4FullStore

# Two-letter to three-letter language code mapping used by this format.
LANG_CODE_CONVERT = {
    'zh': 'zho',
    'fa': 'fas',
    'ru': 'rus'
}


class ExctractedCCDoc(NamedTuple):
    doc_id: str
    title: str
    text: str
    url: str
    time: str
    cc_file: str

    def default_text(self):
        """ title and text """
        return f'{self.title} {self.text}'


class ExctractedCCDocs(BaseDocs):
    # Documents extracted from Common Crawl, stored as JSONL (one or more
    # download components).
    def __init__(self, docs_dlc, subset_lang=None, namespace=None, count=None, docstore_path=None):
        self._docs_dlc = docs_dlc
        self._subset_lang = subset_lang
        self._namespace = namespace
        self._count = count
        self._docstore_path = docstore_path

    def docs_path(self, force=True):
        if self._docstore_path:
            return self._docstore_path
        return self._docs_dlc.path(force)

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        yield from self._internal_docs_iter()

    def _doc_store_path(self):
        return self.docs_path(force=False)

    def docs_store(self):
        return PickleLz4FullStore(
            path=f'{self._doc_store_path()}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field='doc_id',
            index_fields=['doc_id'],
            count_hint=self._count,
        )

    def _internal_docs_iter(self):
        # Accept either a single download component or a list of them.
        if not isinstance(self._docs_dlc, list):
            docs_dlc = [self._docs_dlc]
        else:
            docs_dlc = self._docs_dlc
        for dlc in docs_dlc:
            with dlc.stream() as f:
                for line in f:
                    line = json.loads(line)
                    # Rename the JSON 'id' key to the NamedTuple's 'doc_id'.
                    line['doc_id'] = line['id']
                    del line['id']
                    yield ExctractedCCDoc(**line)

    def docs_cls(self):
        return ExctractedCCDoc

    def docs_namespace(self):
        return self._namespace

    def docs_count(self):
        return self._count

    def docs_lang(self):
        return self._subset_lang


class ExctractedCCQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    ht_title: str
    ht_description: str
    mt_title: str
    mt_description: str
    narrative_by_relevance: Dict[str, str]
    report: str
    report_url: str
    report_date: str
    translation_lang: str

    def default_text(self):
        """ title """
        return self.title


class ExctractedCCNoReportQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    narrative: str
    ht_title: str
    ht_description: str
    ht_narrative: str
    mt_title: str
    mt_description: str
    mt_narrative: str
    translation_lang: str

    def default_text(self):
        """ title """
        return self.title


class ExctractedCCNoReportNoHtNarQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    narrative: str
    ht_title: str
    ht_description: str
    mt_title: str
    mt_description: str
    mt_narrative: str
    translation_lang: str

    def default_text(self):
        """ title """
        return self.title


class ExctractedCCMultiMtQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    narrative: str
    fa_mt_title: str
    fa_mt_description: str
    fa_mt_narrative: str
    ru_mt_title: str
    ru_mt_description: str
    ru_mt_narrative: str
    zh_mt_title: str
    zh_mt_description: str
    zh_mt_narrative: str

    def default_text(self):
        """ title """
        return self.title


class ExctractedCCQueries(BaseQueries):
    # Topics stored as JSONL; each line carries the English topic plus its
    # human/machine translations, and is mapped onto one of the query
    # NamedTuples above depending on `cls`.
    def __init__(self, queries_dlc, subset_lang=None, filter_lwq=True, cls=ExctractedCCQuery, namespace=None):
        self._queries_dlc = queries_dlc if isinstance(queries_dlc, list) else [queries_dlc]
        self._subset_lang = subset_lang
        self._filter_lwq = filter_lwq  # filter to topics with qrels in the subset language
        self._namespace = namespace
        self._cls = cls
        self._subset_lang_three = LANG_CODE_CONVERT.get(self._subset_lang)

    def queries_path(self):
        return [
            dlc.path()
            for dlc in self._queries_dlc
        ]

    def queries_cls(self):
        return self._cls

    def queries_namespace(self):
        return self._namespace

    def queries_iter(self):
        for dlc in self._queries_dlc:
            yield from self._internal_queries_iter(dlc)

    def _internal_queries_iter(self, dlc):
        with dlc.stream() as f:
            for line in f:
                line = json.loads(line)
                if not self._filter_lwq or self._subset_lang_three in line['languages_with_qrels']:
                    yield self._produce_query(line)

    def _produce_query(self, line):
        # Index the per-language topic variants: 'org' for English, and
        # 'ht'/'mt' (plus per-language 'ht_xxx'/'mt_xxx') for translations.
        resources = {}
        for tp in line['topics']:
            if tp['lang'] == 'eng':
                resources['org'] = tp
            else:
                if tp['source'] == 'human translation':
                    resources['ht_{lang}'.format(**tp)] = tp
                else:  # machine translation
                    resources['mt_{lang}'.format(**tp)] = tp
                if tp['lang'] == self._subset_lang_three:
                    if tp['source'] == 'human translation':
                        resources['ht'] = tp
                    else:  # machine translation
                        resources['mt'] = tp
        if self._cls is ExctractedCCQuery:
            return ExctractedCCQuery(
                query_id=line['topic_id'],
                title=resources['org']['topic_title'],
                description=resources['org']['topic_description'],
                ht_title=resources['ht']['topic_title'],
                ht_description=resources['ht']['topic_description'],
                mt_title=resources['mt']['topic_title'],
                mt_description=resources['mt']['topic_description'],
                narrative_by_relevance=line['narratives'][self._subset_lang_three],
                report=line['report']['text'],
                report_url=line['report']['url'],
                report_date=line['report']['date'],
                translation_lang=self._subset_lang
            )
        elif self._cls is ExctractedCCNoReportQuery:
            return ExctractedCCNoReportQuery(
                query_id=line['topic_id'],
                title=resources['org']['topic_title'],
                description=resources['org']['topic_description'],
                narrative=resources['org']['topic_narrative'],
                ht_title=resources['ht']['topic_title'],
                ht_description=resources['ht']['topic_description'],
                ht_narrative=resources['ht']['topic_narrative'],
                mt_title=resources['mt']['topic_title'],
                mt_description=resources['mt']['topic_description'],
                mt_narrative=resources['mt']['topic_narrative'],
                translation_lang=self._subset_lang
            )
        elif self._cls is ExctractedCCNoReportNoHtNarQuery:
            return ExctractedCCNoReportNoHtNarQuery(
                query_id=line['topic_id'],
                title=resources['org']['topic_title'],
                description=resources['org']['topic_description'],
                narrative=resources['org']['topic_narrative'],
                ht_title=resources['ht']['topic_title'],
                ht_description=resources['ht']['topic_description'],
                mt_title=resources['mt']['topic_title'],
                mt_description=resources['mt']['topic_description'],
                mt_narrative=resources['mt']['topic_narrative'],
                translation_lang=self._subset_lang
            )
        elif self._cls is ExctractedCCMultiMtQuery:
            return ExctractedCCMultiMtQuery(
                query_id=line['topic_id'],
                title=resources['org']['topic_title'],
                description=resources['org']['topic_description'],
                narrative=resources['org']['topic_narrative'],
                fa_mt_title=resources['mt_fas']['topic_title'],
                fa_mt_description=resources['mt_fas']['topic_description'],
                fa_mt_narrative=resources['mt_fas']['topic_narrative'],
                ru_mt_title=resources['mt_rus']['topic_title'],
                ru_mt_description=resources['mt_rus']['topic_description'],
                ru_mt_narrative=resources['mt_rus']['topic_narrative'],
                zh_mt_title=resources['mt_zho']['topic_title'],
                zh_mt_description=resources['mt_zho']['topic_description'],
                zh_mt_narrative=resources['mt_zho']['topic_narrative'],
            )



================================================
FILE: ir_datasets/formats/jsonl.py
================================================
import sys
import codecs
import contextlib
import json
from typing import Tuple
import io
import ir_datasets
from .base import GenericDoc, GenericQuery, GenericDocPair, BaseDocs, BaseQueries, BaseDocPairs
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS


class _JsonlBase:
    # Shared plumbing for JSONL-backed docs/queries collections; `mapping`
    # maps NamedTuple field -> JSON key (identity by default).
    def __init__(self, dlcs, cls, datatype, mapping=None):
        super().__init__()
        self._dlcs = dlcs if isinstance(dlcs, (tuple, list)) else [dlcs]
        self._cls = cls
        self._datatype = datatype
        if mapping is None:
            self._mapping = {f: f for f in cls._fields}
        else:
            self._mapping = mapping

    def _path(self, force=True):
        return self._dlcs[0].path(force)

    def _iter(self):
        for dlc in self._dlcs:
            with dlc.stream() as f:
                for line in f:
                    data = json.loads(line)
                    yield self._cls(**{dockey: data[datakey] for dockey, datakey in self._mapping.items()})


class JsonlDocs(_JsonlBase, BaseDocs):
    def __init__(self, docs_dlcs, doc_cls=GenericDoc, mapping=None, doc_store_index_fields=None, namespace=None, lang=None, count_hint=None, docstore_path=None):
        super().__init__(docs_dlcs, doc_cls, "docs", mapping)
        self._doc_store_index_fields = doc_store_index_fields
        self._docs_namespace = namespace
        self._docs_lang = lang
        self._count_hint = count_hint
        # Default docstore lives next to the first source file.
        self._docstore_path = docstore_path if docstore_path is not None else f'{self.docs_path(force=False)}.pklz4'

    def docs_path(self, force=True):
        return self._path(force)

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        return self._iter()

    def docs_cls(self):
        return self._cls

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        fields = (self._doc_store_index_fields or ['doc_id'])
        return PickleLz4FullStore(
            path=self._docstore_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=fields,
            count_hint=self._count_hint,
            options=options
        )

    def docs_namespace(self):
        return self._docs_namespace

    def docs_count(self):
        # Count is only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()
        return None

    def docs_lang(self):
        return self._docs_lang


class JsonlQueries(_JsonlBase, BaseQueries):
    # NOTE(review): `namespaec` is a typo for `namespace`, but it is part of
    # the public keyword interface, so it is kept for compatibility.
    def __init__(self, query_dlcs, query_cls=GenericQuery, mapping=None, lang=None, namespaec=None):
        super().__init__(query_dlcs, query_cls, "queries", mapping)
        self._queries_lang = lang
        self._queries_namespace = namespaec

    def queries_path(self, force=True):
        return self._path(force)

    def queries_iter(self):
        return self._iter()

    def queries_cls(self):
        return self._cls

    def queries_namespace(self):
        return self._queries_namespace

    def queries_lang(self):
        return self._queries_lang



================================================
FILE: ir_datasets/formats/ntcir.py
================================================
import codecs
from . import TrecQrels, TrecQrel


class NtcirQrels(TrecQrels):
    def qrels_iter(self):
        with self._qrels_dlc.stream() as f:
            f = codecs.getreader('utf8')(f)
            for line in f:
                if line == '\n':
                    continue # ignore blank lines
                cols = line.rstrip().split()
                if len(cols) != 3:
                    raise RuntimeError(f'expected 3 columns, got {len(cols)}')
                qid, did, score = cols
                # Drop the leading character of the NTCIR relevance label
                # (e.g., "L2" -> "2") before converting to int.
                score = score[1:]
                yield TrecQrel(qid, did, int(score), '0')



================================================
FILE: ir_datasets/formats/touche.py
================================================
from enum import Enum
from io import TextIOWrapper
from json import loads
from typing import NamedTuple, Any, Optional, Dict, Tuple
from xml.etree.ElementTree import parse, Element, ElementTree

from ir_datasets.formats import BaseQueries, BaseQrels, TrecQrel, BaseDocs
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.util import Cache, use_docstore


class ToucheQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    narrative: str

    def default_text(self):
        """ title """
        return self.title


class ToucheTitleQuery(NamedTuple):
    query_id: str
    title: str

    def default_text(self):
        """ title """
        return self.title


class ToucheComparativeQuery(NamedTuple):
    query_id: str
    title: str
    objects: Tuple[str, str]
    description: str
    narrative: str

    def default_text(self):
        """ title """
        return self.title


class ToucheQualityQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    quality: int
    iteration: str


class ToucheQualityCoherenceQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    quality: int
    coherence: int
    iteration: str


class ToucheComparativeStance(Enum):
    FIRST = "FIRST"
    SECOND = "SECOND"
    NEUTRAL = "NEUTRAL"
    NO = "NO"


class ToucheQualityComparativeStanceQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    quality: int
    stance: ToucheComparativeStance
    iteration: str


class ToucheControversialStance(Enum):
    PRO = "PRO"
    CON = "CON"
    ONTOPIC = "ONTOPIC"


class ToucheControversialStanceQrel(NamedTuple):
query_id: str doc_id: str relevance: int stance: ToucheControversialStance class TouchePassageDoc(NamedTuple): doc_id: str text: str chatnoir_url: str def default_text(self): """ text """ return self.text class ToucheQueries(BaseQueries): _source: Any _namespace: Optional[str] _language: Optional[str] def __init__( self, source: Any, namespace: Optional[str] = None, language: Optional[str] = None, ): self._source = source self._namespace = namespace self._language = language def queries_path(self): return self._source.path() def queries_iter(self): with self._source.stream() as file: tree: ElementTree = parse(file) root: Element = tree.getroot() assert root.tag == "topics" for element in root: element: Element assert element.tag == "topic" number = int(element.findtext("number").strip()) title = element.findtext("title").strip() description = element.findtext("description").strip() narrative = element.findtext("narrative").strip() yield ToucheQuery( str(number), title, description, narrative, ) def queries_cls(self): return ToucheQuery def queries_namespace(self): return self._namespace def queries_lang(self): return self._language class ToucheTitleQueries(BaseQueries): _source: Any _namespace: Optional[str] _language: Optional[str] def __init__( self, source: Any, namespace: Optional[str] = None, language: Optional[str] = None, ): self._source = source self._namespace = namespace self._language = language def queries_path(self): return self._source.path() def queries_iter(self): with self._source.stream() as file: tree: ElementTree = parse(file) root: Element = tree.getroot() assert root.tag == "topics" for element in root: element: Element assert element.tag == "topic" number = int(element.findtext("number").strip()) title = element.findtext("title").strip() yield ToucheTitleQuery( str(number), title, ) def queries_cls(self): return ToucheTitleQuery def queries_namespace(self): return self._namespace def queries_lang(self): return self._language class 
ToucheComparativeQueries(BaseQueries): _source: Any _namespace: Optional[str] _language: Optional[str] def __init__( self, source: Any, namespace: Optional[str] = None, language: Optional[str] = None, ): self._source = source self._namespace = namespace self._language = language def queries_path(self): return self._source.path() def queries_iter(self): with self._source.stream() as file: tree: ElementTree = parse(file) root: Element = tree.getroot() assert root.tag == "topics" for element in root: element: Element assert element.tag == "topic" number = int(element.findtext("number").strip()) title = element.findtext("title").strip() objects = element.findtext("objects").split(",") objects = (obj.strip() for obj in objects) object1, object2 = objects description = element.findtext("description").strip() narrative = element.findtext("narrative").strip() yield ToucheComparativeQuery( str(number), title, (object1, object2), description, narrative, ) def queries_cls(self): return ToucheComparativeQuery def queries_namespace(self): return self._namespace def queries_lang(self): return self._language class ToucheQrels(BaseQrels): _source: Any _definitions: Dict[int, str] _allow_float_score: bool = False def __init__( self, source: Any, definitions: Dict[int, str], allow_float_score: bool = False, ): self._source = source self._definitions = definitions self._allow_float_score = allow_float_score def qrels_path(self): return self._source.path() def qrels_iter(self): print(self._source.path()) with self._source.stream() as file: with TextIOWrapper(file) as lines: lines = ( line.rstrip() for line in lines if line != "\n" # Ignore blank lines. ) for line in lines: cols = line.split() if len(cols) != 4: raise ValueError( f"Expected 4 relevance columns " f"but got {len(cols)}." ) qid, it, did, score = cols if self._allow_float_score: score = float(score) if not score.is_integer(): raise ValueError( f"Non-integer relevance score {score}." 
                            )
                    yield TrecQrel(
                        query_id=qid,
                        doc_id=did,
                        relevance=int(score),
                        iteration=it,
                    )

    def qrels_cls(self):
        return TrecQrel

    def qrels_defs(self):
        return self._definitions


class ToucheQualityQrels(BaseQrels):
    # Joins a relevance qrels file with a parallel quality qrels file.
    # The two files are read in lockstep and each pair of lines must agree
    # on query, document, and iteration.
    _source: Any
    _source_quality: Any
    _definitions: Dict[int, str]

    def __init__(
        self,
        source: Any,
        source_quality: Any,
        definitions: Dict[int, str],
    ):
        self._source = source
        self._source_quality = source_quality
        self._definitions = definitions

    def qrels_path(self):
        return self._source.path()

    def qrels_iter(self):
        with self._source.stream() as file, \
                self._source_quality.stream() as file_quality:
            with TextIOWrapper(file) as lines, \
                    TextIOWrapper(file_quality) as lines_quality:
                lines = (
                    line.rstrip()
                    for line in lines
                    if line != "\n"  # Ignore blank lines.
                )
                lines_quality = (
                    line.rstrip()
                    for line in lines_quality
                    if line != "\n"  # Ignore blank lines.
                )
                # Lockstep iteration: line i of each file describes the
                # same judgment.
                zipped_lines = zip(lines, lines_quality)
                for zipped_line in zipped_lines:
                    line, line_quality = zipped_line
                    cols = line.split()
                    if len(cols) != 4:
                        raise ValueError(
                            f"Expected 4 relevance columns "
                            f"but got {len(cols)}."
                        )
                    qid, it, did, score = cols
                    cols_quality = line_quality.split()
                    if len(cols_quality) != 4:
                        raise ValueError(
                            f"Expected 4 quality columns "
                            f"but got {len(cols_quality)}."
                        )
                    qid_quality, it_quality, did_quality, score_quality \
                        = cols_quality
                    if qid_quality != qid:
                        raise ValueError(
                            f"Quality query {qid_quality} does not match "
                            f"relevance query {qid}."
                        )
                    if did_quality != did:
                        raise ValueError(
                            f"Quality document {did_quality} does not match "
                            f"relevance document {did}."
                        )
                    if it_quality != it:
                        raise ValueError(
                            f"Quality iteration {it_quality} does not match "
                            f"relevance iteration {it}."
                        )
                    yield ToucheQualityQrel(
                        query_id=qid,
                        doc_id=did,
                        relevance=int(score),
                        quality=int(score_quality),
                        iteration=it,
                    )

    def qrels_cls(self):
        return ToucheQualityQrel

    def qrels_defs(self):
        return self._definitions


class ToucheQualityCoherenceQrels(BaseQrels):
    # Joins relevance, quality, and coherence qrels files in lockstep;
    # each triple of lines must agree on query, document, and iteration.
    _source: Any
    _source_quality: Any
    _source_coherence: Any
    _definitions: Dict[int, str]

    def __init__(
        self,
        source: Any,
        source_quality: Any,
        source_coherence: Any,
        definitions: Dict[int, str],
    ):
        self._source = source
        self._source_quality = source_quality
        self._source_coherence = source_coherence
        self._definitions = definitions

    def qrels_path(self):
        return self._source.path()

    def qrels_iter(self):
        with self._source.stream() as file, \
                self._source_quality.stream() as file_quality, \
                self._source_coherence.stream() as file_coherence:
            with TextIOWrapper(file) as lines, \
                    TextIOWrapper(file_quality) as lines_quality, \
                    TextIOWrapper(file_coherence) as lines_coherence:
                lines = (
                    line.rstrip()
                    for line in lines
                    if line != "\n"  # Ignore blank lines.
                )
                lines_quality = (
                    line.rstrip()
                    for line in lines_quality
                    if line != "\n"  # Ignore blank lines.
                )
                lines_coherence = (
                    line.rstrip()
                    for line in lines_coherence
                    if line != "\n"  # Ignore blank lines.
                )
                zipped_lines = zip(lines, lines_quality, lines_coherence)
                for zipped_line in zipped_lines:
                    line, line_quality, line_coherence = zipped_line
                    cols = line.split()
                    if len(cols) != 4:
                        raise ValueError(
                            f"Expected 4 relevance columns "
                            f"but got {len(cols)}."
                        )
                    qid, it, did, score = cols
                    cols_quality = line_quality.split()
                    if len(cols_quality) != 4:
                        raise ValueError(
                            f"Expected 4 quality columns "
                            f"but got {len(cols_quality)}."
                        )
                    qid_quality, it_quality, did_quality, score_quality \
                        = cols_quality
                    if qid_quality != qid:
                        raise ValueError(
                            f"Quality query {qid_quality} does not match "
                            f"relevance query {qid}."
                        )
                    if did_quality != did:
                        raise ValueError(
                            f"Quality document {did_quality} does not match "
                            f"relevance document {did}."
class ToucheQualityComparativeStanceQrels(BaseQrels):
    """Qrels joining three line-aligned files: relevance, quality, and
    comparative stance. Each auxiliary line must repeat the relevance
    line's qid/iteration/docid."""
    _source: Any
    _source_quality: Any
    _source_stance: Any
    _definitions: Dict[int, str]

    def __init__(
            self,
            source: Any,
            source_quality: Any,
            source_stance: Any,
            definitions: Dict[int, str],
    ):
        self._source = source
        self._source_quality = source_quality
        self._source_stance = source_stance
        self._definitions = definitions

    def qrels_path(self):
        return self._source.path()

    @staticmethod
    def _parse_cols(line, kind):
        # Split one qrels line, enforcing the 4-column format.
        cols = line.split()
        if len(cols) != 4:
            raise ValueError(
                f"Expected 4 {kind} columns "
                f"but got {len(cols)}."
            )
        return cols

    @staticmethod
    def _check_aligned(kind, qid, it, did, qid2, it2, did2):
        # Auxiliary files must be line-aligned with the relevance file.
        if qid2 != qid:
            raise ValueError(
                f"{kind} query {qid2} does not match "
                f"relevance query {qid}."
            )
        if did2 != did:
            raise ValueError(
                f"{kind} document {did2} does not match "
                f"relevance document {did}."
            )
        if it2 != it:
            raise ValueError(
                f"{kind} iteration {it2} does not match "
                f"relevance iteration {it}."
            )

    def qrels_iter(self):
        def _clean(ls):
            # Strip trailing whitespace; skip blank lines.
            return (line.rstrip() for line in ls if line != "\n")

        with self._source.stream() as file, \
                self._source_quality.stream() as file_quality, \
                self._source_stance.stream() as file_stance:
            with TextIOWrapper(file) as lines, \
                    TextIOWrapper(file_quality) as lines_quality, \
                    TextIOWrapper(file_stance) as lines_stance:
                zipped_lines = zip(
                    _clean(lines),
                    _clean(lines_quality),
                    _clean(lines_stance),
                )
                for line, line_quality, line_stance in zipped_lines:
                    qid, it, did, score = self._parse_cols(line, "relevance")
                    qid_q, it_q, did_q, score_q = \
                        self._parse_cols(line_quality, "quality")
                    self._check_aligned(
                        "Quality", qid, it, did, qid_q, it_q, did_q)
                    qid_s, it_s, did_s, score_s = \
                        self._parse_cols(line_stance, "stance")
                    self._check_aligned(
                        "Stance", qid, it, did, qid_s, it_s, did_s)
                    yield ToucheQualityComparativeStanceQrel(
                        query_id=qid,
                        doc_id=did,
                        relevance=int(score),
                        quality=int(score_q),
                        stance=ToucheComparativeStance(score_s),
                        iteration=it,
                    )

    def qrels_cls(self):
        return ToucheQualityComparativeStanceQrel

    def qrels_defs(self):
        return self._definitions
) yield ToucheQualityComparativeStanceQrel( query_id=qid, doc_id=did, relevance=int(score), quality=int(score_quality), stance=ToucheComparativeStance(score_stance), iteration=it, ) def qrels_cls(self): return ToucheQualityComparativeStanceQrel def qrels_defs(self): return self._definitions class ToucheControversialStanceQrels(BaseQrels): _source: Any _definitions: Dict[int, str] def __init__(self, source: Any, definitions: Dict[int, str]): self._source = source self._definitions = definitions def qrels_path(self): return self._source.path() def qrels_iter(self): with self._source.stream() as file: with TextIOWrapper(file) as lines: lines = ( line.rstrip() for line in lines if line != "\n" # Ignore blank lines. ) for line in lines: cols = line.split() if len(cols) != 4: raise ValueError( f"Expected 4 relevance and stance columns " f"but got {len(cols)}." ) qid, stance, did, score = cols yield ToucheControversialStanceQrel( query_id=qid, doc_id=did, relevance=int(score), stance=ToucheControversialStance(stance), ) def qrels_cls(self): return TrecQrel def qrels_defs(self): return self._definitions class TouchePassageDocs(BaseDocs): _source: Cache _namespace: Optional[str] _language: Optional[str] _count_hint: Optional[int] def __init__( self, cache: Cache, namespace: Optional[str] = None, language: Optional[str] = None, count_hint: Optional[int] = None, ): self._source = cache self._namespace = namespace self._language = language self._count_hint = count_hint def docs_path(self): return self._source.path() @use_docstore def docs_iter(self): with self._source.stream() as file: with TextIOWrapper(file) as lines: for line in lines: json = loads(line) yield TouchePassageDoc( doc_id=json["id"], text=json["contents"], chatnoir_url=json["chatNoirUrl"], ) def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f"{self.docs_path()}.pklz4", init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, 
index_fields=["doc_id"], count_hint=self._count_hint, options=options ) def docs_count(self): return self._count_hint def docs_cls(self): return TouchePassageDoc def docs_namespace(self): return self._namespace def docs_lang(self): return self._language ================================================ FILE: ir_datasets/formats/touche_image.py ================================================ from io import TextIOWrapper from itertools import takewhile from json import loads from re import compile from typing import NamedTuple, Optional, Dict, List, Tuple from zipfile import ZipFile from ir_datasets.formats import BaseDocs from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS from ir_datasets.util import Cache, use_docstore class ToucheImageRanking(NamedTuple): query_id: str query: str rank: int # 1-indexed class ToucheImageNode(NamedTuple): xpath: str visible: bool id: Optional[str] classes: List[str] position: Tuple[float, float, float, float] # left top right bottom text: Optional[str] css: Dict[str, str] class ToucheImagePage(NamedTuple): page_id: str url: str rankings: List[ToucheImageRanking] dom_html: bytes xpaths: List[str] nodes: List[ToucheImageNode] # screenshot_png: bytes text: str # warc_gz: bytes class ToucheImageDoc(NamedTuple): doc_id: str png: bytes webp: bytes url: str phash: str # https://www.phash.org/ pages: List[ToucheImagePage] _PATTERN_IMAGE = compile("images/I[0-9a-f]{2}/I[0-9a-f]{16}/") _PATTERN_PAGE = compile( "images/I[0-9a-f]{2}/I[0-9a-f]{16}/pages/P[0-9a-f]{16}/" ) class ToucheImageDocs(BaseDocs): _source: Cache _source_nodes: Cache _source_png: Cache _namespace: Optional[str] _language: Optional[str] _count_hint: Optional[int] def __init__( self, source: Cache, source_nodes: Cache, source_png: Cache, namespace: Optional[str] = None, language: Optional[str] = None, count_hint: Optional[int] = None, ): self._source = source self._source_nodes = source_nodes self._source_png = source_png self._namespace = namespace 
self._language = language self._count_hint = count_hint def docs_path(self): return self._source.path() @use_docstore def docs_iter(self): with self._source.stream() as file, \ self._source_nodes.stream() as file_nodes, \ self._source_png.stream() as file_png: with ZipFile(file) as zip_file, \ ZipFile(file_nodes) as zip_file_nodes, \ ZipFile(file_png) as zip_file_png: paths = { *zip_file.namelist(), *zip_file_nodes.namelist(), *zip_file_png.namelist(), } paths = sorted(paths) image_paths = [ path for path in paths if _PATTERN_IMAGE.fullmatch(path) ] def _parse_node(json: dict) -> ToucheImageNode: position_json = json["position"] if isinstance(position_json, str): position_json = loads(position_json) return ToucheImageNode( xpath=json["xPath"], visible=bool(json["visible"]), id=( json["id"] if "id" in json else None ), classes=( json["classes"] if "classes" in json else [] ), position=( float(position_json[0]), float(position_json[1]), float(position_json[2]), float(position_json[3]), ), text=( json["text"] if "text" in json else None ), css=json["css"] if "css" in json else {}, ) def _parse_page(page_path: str) -> ToucheImagePage: with zip_file.open( f"{page_path}rankings.jsonl" ) as rankings_file: with TextIOWrapper(rankings_file) as lines: rankings_json = ( loads(line) for line in lines ) rankings = [ ToucheImageRanking( query_id=json["topic"], query=json["query"], rank=int(json["rank"]), ) for json in rankings_json ] with zip_file_nodes.open( f"{page_path}snapshot/nodes.jsonl" ) as nodes_file: with TextIOWrapper(nodes_file) as lines: nodes_json = ( loads(line) for line in lines ) nodes = [ _parse_node(json) for json in nodes_json ] with zip_file.open( f"{page_path}snapshot/image-xpath.txt" ) as xpaths_file: with TextIOWrapper(xpaths_file) as lines: xpaths = [ line for line in lines ] return ToucheImagePage( page_id=page_path.split("/")[-2], url=zip_file.read( f"{page_path}page-url.txt" ).decode().strip(), rankings=rankings, dom_html=zip_file.read( 
f"{page_path}snapshot/dom.html" ), xpaths=xpaths, nodes=nodes, # Mentioned in the dataset description # but not included in the Zenodo dataset. # screenshot_png=zip_file_screenshots.read( # f"{page_path}snapshot/screenshot.png" # ), text=zip_file.read( f"{page_path}snapshot/text.txt" ).decode(), # Mentioned in the dataset description # but not included in the Zenodo dataset. # warc_gz=zip_file_archives.read( # f"{page_path}snapshot/web-archive.warc.gz" # ), ) for index, image_path in enumerate(image_paths): page_paths = list(takewhile( lambda path: path.startswith(image_path), paths[index:], )) page_paths = [ path for path in page_paths if _PATTERN_PAGE.fullmatch(path) ] pages: List[ToucheImagePage] = [ _parse_page(page_path) for page_path in page_paths ] yield ToucheImageDoc( doc_id=image_path.split("/")[-2], png=zip_file_png.read(f"{image_path}image.png"), webp=zip_file.read(f"{image_path}image.webp"), url=zip_file.read( f"{image_path}image-url.txt" ).decode().strip(), phash=zip_file.read( f"{image_path}image-phash.txt" ).decode().strip(), pages=pages, ) def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f"{self.docs_path()}.pklz4", init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=["doc_id"], count_hint=self._count_hint, options=options ) def docs_count(self): return self._count_hint def docs_cls(self): return ToucheImageDoc def docs_namespace(self): return self._namespace def docs_lang(self): return self._language ================================================ FILE: ir_datasets/formats/trec.py ================================================ import io import codecs import tarfile import re import gzip from glob import glob as fnglob import xml.etree.ElementTree as ET from fnmatch import fnmatch from pathlib import Path from typing import NamedTuple import ir_datasets from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS from .base import 
class TrecDoc(NamedTuple):
    doc_id: str
    text: str
    marked_up_doc: str
    def default_text(self):
        """
text
        """
        return self.text

class TitleUrlTextDoc(NamedTuple):
    doc_id: str
    title: str
    url: str
    text: str
    def default_text(self):
        """
title and text
        """
        return f'{self.title} {self.text}'

class TrecParsedDoc(NamedTuple):
    doc_id: str
    title: str
    body: str
    marked_up_doc: bytes
    def default_text(self):
        """
title and body
        """
        return f'{self.title} {self.body}'

class TrecQuery(NamedTuple):
    query_id: str
    title: str
    description: str
    narrative: str
    def default_text(self):
        """
title
        """
        return self.title

class TrecSubtopic(NamedTuple):
    number: str
    text: str
    type: str

class TrecQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    iteration: str

class TrecSubQrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    subtopic_id: str

class TrecPrel(NamedTuple):
    query_id: str
    doc_id: str
    relevance: int
    method: int
    iprob: float

# Default content tags from Anserini's TrecCollection
CONTENT_TAGS = 'TEXT HEADLINE TITLE HL HEAD TTL DD DATE LP LEADPARA'.split()


class TrecDocs(BaseDocs):
    """Docs handler for classic TREC SGML-style collections.

    Supports directory trees (optionally narrowed with path_globs), tar.gz
    streams, single (possibly gzip/compress-encoded) files, and four
    parsers: 'BS4' (BeautifulSoup markup), 'text' (plain concatenation),
    'tut' (title/url/text), and 'sax' (streaming SAX-style parse).
    """
    def __init__(self, docs_dlc, encoding=None, path_globs=None, content_tags=CONTENT_TAGS, parser='BS4', namespace=None, lang=None, expected_file_count=None, docstore_size_hint=None, count_hint=None, docstore_path=None):
        self._docs_dlc = docs_dlc
        self._encoding = encoding
        self._path_globs = path_globs
        self._content_tags = content_tags
        # Parser choice also fixes the doc type yielded by docs_iter.
        self._parser = {
            'BS4': self._parser_bs,
            'text': self._parser_text,
            'tut': self._parser_tut,
            'sax': self._parser_sax,
        }[parser]
        self._doc = {
            'BS4': TrecDoc,
            'text': GenericDoc,
            'tut': TitleUrlTextDoc,
            'sax': TrecParsedDoc,
        }[parser]
        self._docs_namespace = namespace
        self._docs_lang = lang
        self._expected_file_count = expected_file_count
        self._docstore_size_hint = docstore_size_hint
        self._count_hint = count_hint
        self._docstore_path = docstore_path
        if expected_file_count is not None:
            assert self._path_globs is not None, "expected_file_count only supported with path_globs"

    def docs_path(self, force=True):
        return self._docs_dlc.path(force)

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        if Path(self._docs_dlc.path()).is_dir():
            if self._path_globs:
                file_count = 0
                for glob in sorted(self._path_globs):
                    glob_path = str(Path(self._docs_dlc.path())/glob)
                    # IMPORTANT: cannot use Path().glob() here because the recusive ** will not follow symlinks.
                    # Need to use glob.glob instead with recursive=True flag.
                    for path in sorted(fnglob(glob_path, recursive=True)):
                        file_count += 1
                        yield from self._docs_iter(path)
                if self._expected_file_count is not None:
                    if file_count != self._expected_file_count:
                        raise RuntimeError(f'found {file_count} files of the expected {self._expected_file_count} matching the following: {self._path_globs} under {self._docs_dlc.path()}. Make sure that directories are linked such that these globs match the correct number of files.')
            else:
                yield from self._docs_iter(self._docs_dlc.path())
        else:
            if self._path_globs:
                file_count = 0
                # tarfile, find globs, open in streaming mode (r|)
                with self._docs_dlc.stream() as stream:
                    with tarfile.open(fileobj=stream, mode='r|gz') as tarf:
                        for block in tarf:
                            if any(fnmatch(block.name, g) for g in self._path_globs):
                                file = tarf.extractfile(block)
                                if block.name.endswith('.gz'):
                                    file = gzip.GzipFile(fileobj=file)
                                yield from self._parser(file)
                                file_count += 1
                if self._expected_file_count is not None:
                    if file_count != self._expected_file_count:
                        raise RuntimeError(f'found {file_count} files of the expected {self._expected_file_count} matching the following: {self._path_globs} under {self._docs_dlc.path()}. Make sure that directories are linked such that these globs match the correct number of files.')
            else:
                with self._docs_dlc.stream() as f:
                    yield from self._parser(f)

    def _docs_iter(self, path):
        # Dispatch a single path: gz file, unix-compress file, plain file,
        # or a directory (recursed).
        if Path(path).is_file():
            path_suffix = Path(path).suffix.lower()
            if path_suffix == '.gz':
                with gzip.open(path, 'rb') as f:
                    yield from self._parser(f)
            elif path_suffix in ['.z', '.0z', '.1z', '.2z']:
                # unix "compress" command encoding
                unlzw3 = ir_datasets.lazy_libs.unlzw3()
                with io.BytesIO(unlzw3.unlzw(Path(path))) as f:
                    yield from self._parser(f)
            else:
                with open(path, 'rb') as f:
                    yield from self._parser(f)
        elif Path(path).is_dir():
            # FIX: path may be a str (it comes from glob.glob); calling
            # .iterdir() on it raised AttributeError. Wrap in Path first.
            for child in Path(path).iterdir():
                yield from self._docs_iter(child)

    def _parser_bs(self, stream):
        # Markup parser: gathers lines between content tags and strips the
        # markup with BeautifulSoup at each </DOC>.
        BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
        f = codecs.getreader(self._encoding or 'utf8')(stream, errors='replace')
        doc_id, doc_markup = None, ''
        in_tag = False
        for line in f:
            if line.startswith('<DOCNO>'):
                doc_id = line.replace('<DOCNO>', '').replace('</DOCNO>\n', '').strip()
            elif line == '</DOC>\n':
                soup = BeautifulSoup(f'<OUTER>\n{doc_markup}\n</OUTER>', 'lxml')
                text = soup.get_text()
                yield TrecDoc(doc_id, text, doc_markup)
                doc_id, doc_markup = None, ''
            else:
                # in_tag is used as a nesting counter (False == 0).
                if in_tag:
                    doc_markup += line
                    if line.startswith('</'):
                        if any(line.startswith(f'</{tag}>') for tag in self._content_tags):
                            in_tag -= 1
                if line.startswith('<'):
                    if any(line.startswith(f'<{tag}>') for tag in self._content_tags):
                        in_tag += 1
                        if in_tag == 1:
                            doc_markup += line

    def _parser_text(self, stream):
        # Plain-text parser: concatenates lines inside content tags.
        f = codecs.getreader(self._encoding or 'utf8')(stream, errors='replace')
        doc_id, doc_text = None, ''
        in_tag = False
        for line in f:
            if line.startswith('<DOCNO>'):
                doc_id = line.replace('<DOCNO>', '').replace('</DOCNO>\n', '').strip()
            elif line == '</DOC>\n':
                yield GenericDoc(doc_id, doc_text)
                doc_id, doc_text = None, ''
            else:
                if line.startswith('</'):
                    if any(line.startswith(f'</{tag}>') for tag in self._content_tags):
                        in_tag = False
                if in_tag:
                    doc_text += line
                if line.startswith('<'):
                    if any(line.startswith(f'<{tag}>') for tag in self._content_tags):
                        in_tag = True

    def _parser_tut(self, stream):
        # Title/URL/Text parser.
        # FIX: the tag literals below ('<TITLE>'/'</TITLE>', '<URL>'/'</URL>',
        # '</DOC>', '<TEXT>'/'</TEXT>') had been stripped from the source,
        # leaving always-true startswith('') tests; restored.
        f = codecs.getreader(self._encoding or 'utf8')(stream, errors='replace')
        doc_id, doc_title, doc_url, doc_text = None, None, None, ''
        in_tag = False
        for line in f:
            if line.startswith('<DOCNO>'):
                doc_id = line.replace('<DOCNO>', '').replace('</DOCNO>\n', '').strip()
            if line.startswith('<TITLE>'):
                doc_title = line.replace('<TITLE>', '').replace('</TITLE>\n', '').strip()
            if line.startswith('<URL>'):
                doc_url = line.replace('<URL>', '').replace('</URL>\n', '').strip()
            elif line == '</DOC>\n':
                yield TitleUrlTextDoc(doc_id, doc_title, doc_url, doc_text)
                doc_id, doc_title, doc_url, doc_text = None, None, None, ''
            else:
                if line.startswith('</TEXT>'):
                    in_tag = False
                if in_tag:
                    doc_text += line
                if line.startswith('<TEXT>'):
                    in_tag = True

    def _parser_sax(self, stream):
        # Streaming parser: buffers bytes until a full </DOC> record is
        # available, then hands it to the SAX-based HTML parser.
        field_defs = []
        field_defs.append({'docno'})
        field_defs.append({'headline', 'title', 'h3', 'h4'})
        field_defs.append({c.lower() for c in CONTENT_TAGS} - field_defs[-1])
        buffer = bytearray()
        while True:
            # FIX: the b'</DOC>\n' delimiter had been stripped from the
            # source (the surviving idx+7 == len(b'</DOC>\n') confirms it).
            if b'</DOC>\n' not in buffer:
                chunk = stream.read1()
                if chunk == b'':
                    break
                buffer.extend(chunk)
            else:
                idx = buffer.index(b'</DOC>\n')
                full_doc = bytes(buffer[:idx+7])
                doc_id, title, body = ir_datasets.util.html_parsing.sax_html_parser(full_doc, force_encoding=self._encoding or 'utf8', fields=field_defs)
                yield TrecParsedDoc(doc_id, title, body, full_doc.strip())
                del buffer[:idx+7]

    def docs_cls(self):
        return self._doc

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        if self._docstore_path is not None:
            ds_path = self._docstore_path
        else:
            ds_path = f'{self.docs_path(force=False)}.pklz4'
        return PickleLz4FullStore(
            path=ds_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            size_hint=self._docstore_size_hint,
            count_hint=self._count_hint,
            options=options
        )

    def docs_count(self):
        # Only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return self._docs_namespace

    def docs_lang(self):
        return self._docs_lang
self._docs_namespace def docs_lang(self): return self._docs_lang DEFAULT_QTYPE_MAP = { ' *(Number:)?': 'query_id', ' *(Topic:)?': 'title', '<desc> *(Description:)?': 'description', '<narr> *(Narrative:)?': 'narrative' } class TrecQueries(BaseQueries): def __init__(self, queries_dlc, qtype=TrecQuery, qtype_map=None, encoding=None, namespace=None, lang=None, remove_tags=('',)): self._queries_dlc = queries_dlc self._qtype = qtype self._qtype_map = qtype_map or DEFAULT_QTYPE_MAP self._encoding = encoding self._queries_namespace = namespace self._queries_lang = lang self._remove_tags = remove_tags def queries_path(self): return self._queries_dlc.path() def queries_iter(self): fields, reading = {}, None with self._queries_dlc.stream() as f: f = codecs.getreader(self._encoding or 'utf8')(f) for line in f: if line.startswith(''): assert len(fields) == len(self._qtype._fields), fields for tag in self._remove_tags: fields = {k: v.replace(tag, '') for k, v in fields.items()} yield self._qtype(*(fields[f].strip() for f in self._qtype._fields)) fields, reading = {}, None match_any = False for tag, target in self._qtype_map.items(): match = re.match(tag, line) if match: fields[target] = line[match.end():] reading = target match_any = True break if not match_any and reading and not line.startswith('<'): fields[reading] += line def queries_cls(self): return self._qtype def queries_namespace(self): return self._queries_namespace def queries_lang(self): return self._queries_lang class TrecXmlQueries(BaseQueries): def __init__(self, queries_dlc, qtype=TrecQuery, qtype_map=None, encoding=None, subtopics_key='subtopics', namespace=None, lang=None): self._queries_dlc = queries_dlc self._qtype = qtype self._qtype_map = qtype_map or {f: f for f in qtype._fields} self._encoding = encoding self._subtopics_key = subtopics_key self._queries_namespace = namespace self._queries_lang = lang def queries_path(self): return self._queries_dlc.path() def queries_iter(self): with 
self._queries_dlc.stream() as f: f = codecs.getreader(self._encoding or 'utf8')(f) for topic_el in ET.fromstring(f.read()): item = [None for _ in self._qtype._fields] if 'number' in topic_el.attrib: item[self._qtype._fields.index('query_id')] = topic_el.attrib['number'] subtopics = [] for attr in topic_el.attrib: if attr in self._qtype_map: text = topic_el.attrib[attr] field = self._qtype_map[attr] item[self._qtype._fields.index(field)] = text if topic_el.tag in self._qtype_map: text = ''.join(topic_el.itertext()) field = self._qtype_map[topic_el.tag] item[self._qtype._fields.index(field)] = text for field_el in topic_el: if field_el.tag in self._qtype_map: text = ''.join(field_el.itertext()) field = self._qtype_map[field_el.tag] item[self._qtype._fields.index(field)] = text if field_el.tag == 'subtopic': text = ''.join(field_el.itertext()) subtopics.append(TrecSubtopic(field_el.attrib['number'], text, field_el.attrib['type'])) if self._subtopics_key in self._qtype._fields: item[self._qtype._fields.index('subtopics')] = tuple(subtopics) qid_field = self._qtype._fields.index('query_id') item[qid_field] = item[qid_field].strip() # remove whitespace from query_ids yield self._qtype(*item) def queries_cls(self): return self._qtype def queries_namespace(self): return self._queries_namespace def queries_lang(self): return self._queries_lang class TrecColonQueries(BaseQueries): def __init__(self, queries_dlc, encoding=None, namespace=None, lang=None): self._queries_dlc = queries_dlc self._encoding = encoding self._queries_namespace = namespace self._queries_lang = lang def queries_iter(self): with self._queries_dlc.stream() as f: f = codecs.getreader(self._encoding or 'utf8')(f) for line in f: query_id, text = line.split(':', 1) text = text.rstrip('\n') yield GenericQuery(query_id, text) def queries_path(self): return self._queries_dlc.path() def queries_cls(self): return GenericQuery def queries_namespace(self): return self._queries_namespace def queries_lang(self): 
return self._queries_lang class TrecQrels(BaseQrels): def __init__(self, qrels_dlc, qrels_defs, format_3col=False): self._qrels_dlc = qrels_dlc self._qrels_defs = qrels_defs self._format_3col = format_3col def qrels_path(self): return self._qrels_dlc.path() def qrels_iter(self): if isinstance(self._qrels_dlc, list): for dlc in self._qrels_dlc: yield from self._qrels_internal_iter(dlc) else: yield from self._qrels_internal_iter(self._qrels_dlc) def _qrels_internal_iter(self, dlc): with dlc.stream() as f: f = codecs.getreader('utf8')(f) for line in f: if line == '\n': continue # ignore blank lines cols = line.rstrip().split() if self._format_3col: if len(cols) != 3: raise RuntimeError(f'expected 3 columns, got {len(cols)}') qid, did, score = cols it = 'Q0' else: if len(cols) != 4: raise RuntimeError(f'expected 4 columns, got {len(cols)}') qid, it, did, score = cols yield TrecQrel(qid, did, int(score), it) def qrels_cls(self): return TrecQrel def qrels_defs(self): return self._qrels_defs class TrecPrels(TrecQrels): def qrels_iter(self): with self._qrels_dlc.stream() as f: f = codecs.getreader('utf8')(f) for line in f: if line == '\n': continue # ignore blank lines cols = line.rstrip().split() if len(cols) != 5: raise RuntimeError(f'expected 5 columns, got {len(cols)}') qid, did, rel, method, iprob = cols yield TrecPrel(qid, did, int(rel), int(method), float(iprob)) def qrels_cls(self): return TrecPrel class TrecSubQrels(BaseQrels): def __init__(self, qrels_dlc, qrels_defs): self._qrels_dlc = qrels_dlc self._qrels_defs = qrels_defs def qrels_path(self): return self._qrels_dlc.path() def qrels_iter(self): if isinstance(self._qrels_dlc, list): for dlc in self._qrels_dlc: yield from self._qrels_internal_iter(dlc) else: yield from self._qrels_internal_iter(self._qrels_dlc) def _qrels_internal_iter(self, dlc): with dlc.stream() as f: f = codecs.getreader('utf8')(f) for line in f: if line == '\n': continue # ignore blank lines cols = line.rstrip().split() if len(cols) != 4: 
raise RuntimeError(f'expected 4 columns, got {len(cols)}') qid, sid, did, score = cols yield TrecSubQrel(qid, did, int(score), sid) def qrels_cls(self): return TrecSubQrel def qrels_defs(self): return self._qrels_defs class TrecScoredDocs(BaseScoredDocs): def __init__(self, scoreddocs_dlc, negate_score=False): self._scoreddocs_dlc = scoreddocs_dlc self._negate_score = negate_score def scoreddocs_path(self): return self._scoreddocs_dlc.path() def scoreddocs_iter(self): with self._scoreddocs_dlc.stream() as f: f = codecs.getreader('utf8')(f) for line in f: cols = line.rstrip().split() if len(cols) == 6: qid, _, did, _, score, _ = cols # TREC-style (qid iteration did score rank score runtag elif len(cols) == 2: qid, did, score = *cols, '0' # MS MARCO-style (qid did -- only) elif len(cols) == 3: qid, did, score = cols # MMARCO-style (qid did score) score = float(score) if self._negate_score: score = -score yield GenericScoredDoc(qid, did, score) ================================================ FILE: ir_datasets/formats/tsv.py ================================================ import contextlib from typing import Tuple import io import ir_datasets from .base import GenericDoc, GenericQuery, GenericDocPair, BaseDocs, BaseQueries, BaseDocPairs from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS class FileLineIter: def __init__(self, dlc, start=None, stop=None, step=1): self.dlc = dlc self.stream_idx = 0 self.stream = None self.pos = -1 self.start = start self.stop = stop self.step = step self.ctxt = contextlib.ExitStack() def __next__(self): if self.stop is not None and self.start >= self.stop: self.ctxt.close() raise StopIteration if self.stream is None: if isinstance(self.dlc, list): self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) else: self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream())) line = '' while self.pos < self.start: line = self.stream.readline() if line != '\n': self.pos 
+= 1 if line == '': if isinstance(self.dlc, list): self.stream_idx += 1 if self.stream_idx < len(self.dlc): self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) line = self.stream.readline() else: raise StopIteration() else: raise StopIteration() self.start += self.step return line def __iter__(self): return self def __del__(self): self.ctxt.close() def __getitem__(self, key): if not isinstance(key, slice): raise TypeError('key must be slice') start, stop, step = self.start, self.stop, self.step if key.start is not None: if not isinstance(key.start, int): raise TypeError('start must be int') if key.start < 0: if stop is None: raise ValueError('start cannot be negative with unknown size') start = stop + key.start else: start = start + key.start if key.stop is not None: if not isinstance(key.stop, int): raise TypeError('stop must be int') if key.stop < 0: if stop is None: raise ValueError('stop cannot be negative with unknown size') stop = stop + (key.stop + 1) else: stop = key.stop if key.step is not None: if not isinstance(key.step, int): raise TypeError('step must be int') if key.step <= 0: raise ValueError('step must be a positive') step = self.step * key.step return FileLineIter(self.dlc, start, stop, step) class TsvIter: def __init__(self, cls, line_iter): self.cls = cls self.line_iter = line_iter def __iter__(self): return self def __next__(self): line = next(self.line_iter) cols = line.rstrip('\n').split('\t') num_cols = len(self.cls._fields) last_field = self.cls.__annotations__[self.cls._fields[-1]] if hasattr(self.cls, '__annotations__') else None if last_field == Tuple[str, ...]: if len(cols) < len(self.cls._fields) - 1: raise RuntimeError(f'expected at least {len(self.cls._fields)-1} fields, got {len(cols)}') if len(cols) == len(self.cls._fields) - 1: cols += ((),) else: cols[len(self.cls._fields)-1] = tuple(cols[len(self.cls._fields)-1:]) cols = cols[:len(self.cls._fields)] else: if len(cols) != 
len(self.cls._fields): raise RuntimeError(f'expected {len(self.cls._fields)} fields, got {len(cols)}') return self.cls(*cols) def __getitem__(self, key): return TsvIter(self.cls, self.line_iter[key]) class _TsvBase: def __init__(self, dlc, cls, datatype, skip_first_line=False): super().__init__() self._dlc = dlc self._cls = cls self._datatype = datatype self._skip_first_line = skip_first_line def _path(self, force=True): return self._dlc.path(force) def _iter(self): stop = None if hasattr(self, f'{self._datatype}_count'): stop = getattr(self, f'{self._datatype}_count')() start = 1 if self._skip_first_line else 0 return TsvIter(self._cls, FileLineIter(self._dlc, start=start, stop=stop, step=1)) class TsvDocs(_TsvBase, BaseDocs): def __init__(self, docs_dlc, doc_cls=GenericDoc, doc_store_index_fields=None, namespace=None, lang=None, skip_first_line=False, docstore_size_hint=None, count_hint=None): super().__init__(docs_dlc, doc_cls, "docs", skip_first_line=skip_first_line) self._doc_store_index_fields = doc_store_index_fields self._docs_namespace = namespace self._docs_lang = lang self._docstore_size_hint = docstore_size_hint self._count_hint = count_hint def docs_path(self, force=True): return self._path(force) @ir_datasets.util.use_docstore def docs_iter(self): return self._iter() def docs_cls(self): return self._cls def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS): fields = (self._doc_store_index_fields or ['doc_id']) return PickleLz4FullStore( path=f'{self.docs_path(force=False)}.pklz4', init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=fields, size_hint=self._docstore_size_hint, count_hint=self._count_hint, options=options ) def docs_namespace(self): return self._docs_namespace def docs_count(self): if self.docs_store().built(): return self.docs_store().count() return None def docs_lang(self): return self._docs_lang class TsvQueries(_TsvBase, BaseQueries): def __init__(self, queries_dlc, 
class WarcDoc(NamedTuple):
    # One WARC "response" record: raw HTTP headers/body plus metadata
    # pulled from the WARC record headers.
    doc_id: str
    url: str
    date: str
    http_headers: bytes
    body: bytes
    body_content_type: str

    def default_text(self):
        """Best-effort plain-text rendering of the HTML body (title + body fields)."""
        try:
            return ir_datasets.util.sax_html_parser(self.body, headers=self.http_headers, fields=[{'title', 'body'}])[0]
        except UnicodeDecodeError:
            _logger.info(f'UnicodeDecodeError when parsing doc_id={self.doc_id}')
            return ''


class WarcDocs(BaseDocs):
    """Abstract docs collection backed by (gzipped) WARC files.

    Subclasses describe the on-disk layout by implementing the
    ``_docs_*`` hooks below.
    """

    def __init__(self, id_header='WARC-TREC-ID', warc_cw09=False, lang=None):
        super().__init__()
        self.id_header = id_header
        self.warc_cw09 = warc_cw09
        self._docs_lang = lang

    def docs_iter(self):
        return ir_datasets.indices.WarcIter(self, slice(0, self.docs_count()))

    def _docs_warc_lib(self):
        # ClueWeb09 records need the patched warc reader.
        return ir_datasets.lazy_libs.warc_clueweb09() if self.warc_cw09 else ir_datasets.lazy_libs.warc()

    @staticmethod
    def _extract_content_type(http_headers):
        # Pull the media type (sans ";"-separated parameters) out of the raw headers.
        match = re.search(b'Content-Type:(.*)', http_headers, flags=re.IGNORECASE)
        if not match:
            return ''
        try:
            return match.group(1).decode().strip().split(';')[0]
        except UnicodeDecodeError:
            return ''

    def _docs_ctxt_iter_warc(self, warcf):
        """Yield WarcDoc records from an open WARC stream (or a path to a .gz file)."""
        warc = self._docs_warc_lib()
        if isinstance(warcf, str):
            warcf = gzip.open(warcf, 'rb')
        reader = warc.WARCFile(fileobj=warcf)
        for record in reader:
            if record.type != 'response':
                continue
            payload = record.payload.read()
            # HTTP headers are separated from the body by the first blank line.
            parts = re.split(b'\r?\n\r?\n', payload, maxsplit=1)
            if len(parts) == 1:
                http_headers, body = parts[0], b''
            else:
                http_headers, body = parts
            yield WarcDoc(
                record[self.id_header],
                record['WARC-Target-URI'],
                record['WARC-Date'],
                http_headers,
                body,
                self._extract_content_type(http_headers))

    # --- hooks that subclasses must provide -----------------------------
    def docs_path(self, force=True):
        raise NotImplementedError

    def _docs_iter_source_files(self):
        raise NotImplementedError

    def _docs_id_to_source_file(self, doc_id):
        # For Warc Docstore lookups
        raise NotImplementedError

    def _docs_warc_file_counts(self):
        raise NotImplementedError

    def _docs_source_file_to_checkpoint(self, source_file):
        # For Warc Docstore lookups; None disables checkpoint-based seeking.
        return None

    def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTIONS):
        store = ir_datasets.indices.ClueWebWarcDocstore(self)
        return ir_datasets.indices.CacheDocstore(store, f'{self.docs_path(force=False)}.cache', options=options)

    def docs_cls(self):
        return WarcDoc

    def docs_count(self):
        return sum(self._docs_warc_file_counts().values())

    def docs_lang(self):
        return self._docs_lang
class FileAccess(Enum):
    """How the on-disk document content should be opened."""
    FILE = 0
    MMAP = 1
    MEMORY = 2


@dataclass()
class DocstoreOptions:
    #: How to access the document content
    file_access: FileAccess = FileAccess.FILE


DEFAULT_DOCSTORE_OPTIONS = DocstoreOptions()


class Docstore:
    """Random-access lookup of namedtuple-style docs by id (or one field of them)."""

    def __init__(self, doc_cls, id_field='doc_id', options: DocstoreOptions = DEFAULT_DOCSTORE_OPTIONS):
        self._doc_cls = doc_cls
        self._id_field = id_field
        self._id_field_idx = doc_cls._fields.index(id_field)
        self._options = options

    def get(self, doc_id, field=None):
        """Return the doc (or one field of it) for ``doc_id``; raise KeyError if absent."""
        found = self.get_many([doc_id], field)
        if not found:
            raise KeyError(f'doc_id={doc_id} not found')
        return found[doc_id]

    def get_many(self, doc_ids, field=None):
        """Map each *found* doc_id to its doc (or the requested field); missing ids are omitted."""
        field_idx = None if field is None else self._doc_cls._fields.index(field)
        out = {}
        for doc in self.get_many_iter(doc_ids):
            key = getattr(doc, self._id_field)
            out[key] = doc if field_idx is None else doc[field_idx]
        return out

    def get_many_iter(self, doc_ids):
        # Subclasses yield the docs they can find, in any order.
        raise NotImplementedError()

    def clear_cache(self):
        pass
class CacheDocstore(Docstore):
    """Docstore wrapper that caches fetched docs in a local lookup.

    Reads are served from ``cache`` when possible; misses fall back on
    ``full_store`` and are written to the cache as they stream by.
    """

    def __init__(self, full_store, path, cache_cls=Lz4PickleLookup, options=DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(full_store._doc_cls, full_store._id_field, options=options)
        self.full_store = full_store
        self._path = path
        self.cache = cache_cls(path, self._doc_cls, self._id_field, [self._id_field], file_access=options.file_access)

    def get_many_iter(self, doc_ids):
        missing = set(doc_ids)
        for cached_doc in self.cache[doc_ids]:
            yield cached_doc
            missing.discard(cached_doc[self._id_field_idx])
        if missing:
            # Cache misses: fetch from the full store and record them as we go.
            with self.cache.transaction() as trans:
                for fetched in self.full_store.get_many_iter(missing):
                    yield fetched
                    trans.add(fetched)

    def clear_cache(self):
        self.cache.clear()
        self.full_store.clear_cache()
class WarcIndexFile:
    """Reader/writer for an lz4-framed checkpoint index over a gzipped WARC file.

    Each fixed-size record holds: a doc_id (``doc_id_size`` bytes), the doc's
    ordinal index in the file, a zlib decompressor state (32KB dictionary +
    bit/byte), a delta-encoded compressed-stream position, and an output
    offset.  Positions are stored as deltas from the previous record and
    re-accumulated in ``self.pos`` on read.
    """
    def __init__(self, fileobj, mode, doc_id_size=25):
        lz4 = ir_datasets.lazy_libs.lz4_frame()
        self.fileobj = lz4.frame.open(fileobj, mode, compression_level=lz4.frame.COMPRESSIONLEVEL_MAX)
        self.doc_id_size = doc_id_size
        self.pos = 0  # running absolute position (reader side)

    def write(self, doc_id, doc_idx, state, pos, out_offset):
        # NOTE(review): the record layout requires len(zdict) == 32KB (read()
        # assumes it); the assert only checks the doc_id width — verify callers.
        zdict, bits, byte = state
        assert len(doc_id.encode()) == self.doc_id_size
        out = doc_id.encode() + \
            doc_idx.to_bytes(4, 'little') + \
            pos.to_bytes(4, 'little') + \
            bits.to_bytes(1, 'little') + \
            byte.to_bytes(1, 'little') + \
            zdict + \
            out_offset.to_bytes(4, 'little')
        self.fileobj.write(out)
        self.fileobj.flush()

    def read(self):
        """Read one checkpoint record; raises EOFError at end of file."""
        ldid = self.doc_id_size
        chunk = self.fileobj.read(ldid + 4 + 4 + 1 + 1 + (32 * 1024) + 4)
        if not chunk:
            raise EOFError()
        chunk = io.BytesIO(chunk)
        doc_id = chunk.read(ldid).decode()
        doc_idx = int.from_bytes(chunk.read(4), 'little')
        # stored position is a delta from the previous checkpoint
        pos = self.pos + int.from_bytes(chunk.read(4), 'little')
        bits = int.from_bytes(chunk.read(1), 'little')
        byte = int.from_bytes(chunk.read(1), 'little')
        zdict = chunk.read(32 * 1024)
        out_offset = int.from_bytes(chunk.read(4), 'little')
        state = (zdict, bits, byte)
        self.pos = pos
        return (doc_id, doc_idx, state, pos, out_offset)

    def peek_doc_id(self):
        # Empty string when at end of file.
        return self.fileobj.peek(self.doc_id_size)[:self.doc_id_size].decode()

    def peek_doc_idx(self):
        # None when at end of file.
        int_bytes = self.fileobj.peek(self.doc_id_size + 4)[self.doc_id_size:self.doc_id_size+4]
        if int_bytes == b'':
            return None
        return int.from_bytes(int_bytes, 'little')

    def __bool__(self):
        # True while more records remain.
        # TODO: a better way? Does lz4 handle peeks okay?
        return self.fileobj.peek(1) != b''

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        self.fileobj.close()


class ClueWebWarcIndex:
    """Builds and queries checkpoint indexes that allow seeking into gzipped WARCs."""
    def __init__(self, source_path, index_path, id_field='WARC-TREC-ID', warc_cw09=False):
        self.source_path = source_path
        self.index_path = index_path
        self.zlib_state = ir_datasets.lazy_libs.zlib_state()
        self.id_field = id_field
        self.warc_cw09 = warc_cw09

    def build(self, checkpoint_freq=8*1024*1024):
        """Scan the source WARC and write a checkpoint roughly every ``checkpoint_freq`` compressed bytes."""
        warc = ir_datasets.lazy_libs.warc_clueweb09() if self.warc_cw09 else ir_datasets.lazy_libs.warc()
        next_checkpoint = None
        last_chekpoint_pos = 0
        with self.zlib_state.GzipStateFile(self.source_path, keep_last_state=True) as f, \
             warc.WARCFile(fileobj=f) as f_warc, \
             ir_datasets.util.finialized_file(self.index_path, 'wb') as f_tmp, \
             WarcIndexFile(f_tmp, 'wb') as f_chk:
            doc_idx = 0
            for doc in f_warc:
                if doc.type == 'warcinfo':
                    continue
                if next_checkpoint:
                    # A checkpoint captured on the previous iteration is keyed
                    # by the FIRST doc that starts after it.
                    state, pos, out_offset = next_checkpoint
                    f_chk.write(doc[self.id_field], doc_idx, state, pos - last_chekpoint_pos, out_offset)
                    last_chekpoint_pos = pos
                    next_checkpoint = None
                if f.last_state_pos and (f.last_state_pos >= last_chekpoint_pos + checkpoint_freq):
                    doc.payload.read()  # advance the reader to the end of the current file
                    next_checkpoint = (f.last_state, f.last_state_pos, f.output_pos - f.last_state_output_pos + 4)  # +4 for \r\n\r\n
                    if next_checkpoint[2] < 0:
                        next_checkpoint = None  # split part way through the header... Skip this doc (will checkpoint in next iteration)
                doc_idx += 1

    def built(self):
        return os.path.exists(self.index_path)

    def get_many_iter(self, doc_ids, docs_obj):
        """Yield matching docs; ``doc_ids`` are deduped and processed in sorted
        (lexicographic) order so the file is read forward only."""
        doc_ids = sorted(set(doc_ids))
        with ExitStack() as stack:
            f = stack.enter_context(self.zlib_state.GzipStateFile(self.source_path))
            f_chk = stack.enter_context(WarcIndexFile(self.index_path, 'rb'))
            state, pos, out_offset = None, None, None
            while doc_ids:
                next_doc_id = f_chk.peek_doc_id()
                # Scan this checkpoint's block only when a wanted id precedes
                # the next checkpoint (or no checkpoints remain).
                if doc_ids[0] < next_doc_id or next_doc_id == '':
                    if state is not None:
                        # resume decompression at the checkpointed state
                        f.zseek(pos, state)
                        f.read(out_offset)
                    doc_iter = docs_obj._docs_ctxt_iter_warc(f)
                    for doc in doc_iter:
                        brk = False
                        while doc.doc_id >= doc_ids[0]:
                            if doc.doc_id == doc_ids[0]:
                                yield doc
                            doc_ids = doc_ids[1:]  # pop -- either not found or found
                            if not doc_ids or (next_doc_id != '' and doc_ids[0] >= next_doc_id):
                                brk = True
                                break
                        if brk:
                            break
                if doc_ids and f_chk:
                    doc_id, doc_idx, state, pos, out_offset = f_chk.read()


class ClueWebWarcDocstore(Docstore):
    """Docstore over a WarcDocs collection; groups requested ids by source file
    and uses a checkpoint index (when available) to avoid full-file scans."""
    def __init__(self, warc_docs, options=None):
        from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS
        options = options or DEFAULT_DOCSTORE_OPTIONS
        super().__init__(warc_docs.docs_cls(), 'doc_id', options=options)
        self.warc_docs = warc_docs

    def get_many_iter(self, doc_ids):
        result = {}  # NOTE(review): unused — left in place; consider removing
        files_to_search = {}
        for doc_id in doc_ids:
            source_file = self.warc_docs._docs_id_to_source_file(doc_id)
            if source_file is not None:
                if source_file not in files_to_search:
                    files_to_search[source_file] = []
                files_to_search[source_file].append(doc_id)
        for source_file, doc_ids in files_to_search.items():
            doc_ids = sorted(doc_ids)
            checkpoint_file = self.warc_docs._docs_source_file_to_checkpoint(source_file)
            if checkpoint_file:
                index = ClueWebWarcIndex(source_file, checkpoint_file)
                yield from index.get_many_iter(doc_ids, self.warc_docs)
            else:
                # No checkpoints: linear scan of this WARC file.
                for doc in self.warc_docs._docs_ctxt_iter_warc(source_file):
                    if doc_ids[0] == doc.doc_id:
                        yield doc
                        doc_ids = doc_ids[1:]
                    if not doc_ids:
                        break  # file finished
class WarcIter:
    """Sliced iterator over all docs of a WarcDocs collection.

    Walks the source files in order, using per-file doc counts to skip whole
    files and (when available) checkpoint indexes to zseek within a file,
    falling back on a linear read-ahead otherwise.
    """
    def __init__(self, warc_docs, slice):
        self.next_index = 0  # absolute index of the next doc current_file will yield
        self.warc_docs = warc_docs
        self.slice = slice
        self.current_file = None          # generator of WarcDocs over current source
        self.current_file_source = None   # underlying GzipStateFile
        self.current_file_start_idx = 0   # absolute index of first doc in current file
        self.current_file_end_idx = 0     # absolute index one past last doc in current file
        self.current_chk = None           # WarcIndexFile for current file, if any
        self.file_iter = warc_docs._docs_iter_source_files()

    def __next__(self):
        if self.slice.start >= self.slice.stop:
            raise StopIteration
        # Advance until current_file is positioned exactly at slice.start.
        while self.next_index != self.slice.start or self.current_file is None or self.current_file_end_idx <= self.slice.start:
            if self.current_file is None or self.current_file_end_idx <= self.slice.start:
                # First iteration or no docs remaining in this file
                if self.current_file is not None:
                    self.current_file.close()
                    self.current_file_source.close()
                    self.current_file = None
                    self.current_file_source = None
                if self.current_chk:
                    self.current_chk.close()
                    self.current_chk = None
                # jump ahead to the file that contains the desired index
                first = True
                while first or self.current_file_end_idx < self.slice.start:
                    source_file = next(self.file_iter)
                    self.next_index = self.current_file_end_idx
                    self.current_file_start_idx = self.current_file_end_idx
                    self.current_file_end_idx = self.current_file_start_idx + self.warc_docs._docs_warc_file_counts()[source_file]
                    first = False
                self.current_file_source = ir_datasets.lazy_libs.zlib_state().GzipStateFile(source_file)
                self.current_file = self.warc_docs._docs_ctxt_iter_warc(self.current_file_source)
                checkpoint_file = self.warc_docs._docs_source_file_to_checkpoint(source_file)
                if checkpoint_file:
                    self.current_chk = WarcIndexFile(checkpoint_file, 'rb')
            elif self.current_chk and self.current_file_start_idx + self.current_chk.peek_doc_idx() < self.slice.start:
                # We have a checkpoint file and the next block starts after this checkpoint.
                # So we can use zseek to jump ahead to it.
                # (current_chk truthiness guards peek_doc_idx() from returning None here)
                while self.current_chk and self.current_file_start_idx + self.current_chk.peek_doc_idx() < self.slice.start:
                    doc_id, doc_idx, state, pos, out_offset = self.current_chk.read()
                self.current_file_source.zseek(pos, state)
                self.current_file_source.read(out_offset)
                self.next_index = self.current_file_start_idx + doc_idx
            else:
                # No checkpoint file available or as far as we can get with checkpoint; do slow read ahead
                for _ in zip(range(self.slice.start - self.next_index), self.current_file):
                    # The zip here will stop at after either as many docs we must advance, or however
                    # many docs remain in the file. In the latter case, we'll just drop out into the
                    # next iteration of the while loop and pick up the next file.
                    self.next_index += 1
        result = next(self.current_file)
        self.next_index += 1
        # advance the slice window by one step
        self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
        return result

    def close(self):
        """Release the open source file, doc generator, and checkpoint reader."""
        if self.current_file_source is not None:
            self.current_file_source.close()
            self.current_file_source = None
        if self.current_file is not None:
            self.current_file.close()
            self.current_file = None
        if self.current_chk:
            self.current_chk.close()
            self.current_chk = None
        self.file_iter = None

    def __iter__(self):
        return self

    def __del__(self):
        self.close()

    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step] — compose the sub-slice with the current window
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return WarcIter(self.warc_docs, new_slice)
        elif isinstance(key, int):
            # it[index] — a one-element slice, consumed immediately
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = WarcIter(self.warc_docs, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError((self.slice, slice(key, key+1), new_slice))
        raise TypeError('key must be int or slice')
class ZPickleKeyValueStore:
    """Key-value store over a zlib-compressed pickle file ('bin') plus a
    NumpyPosIndex ('idx') mapping keys to byte offsets.

    NOTE(review): the companion writer (ZPickleDocStoreTransaction.add) pickles
    only ``tuple(zip(fields._fields, fields))`` — i.e. (field, value) pairs with
    no leading key element — yet ``__getitem__`` checks ``content[0] != key``
    and skips ``content[0]`` when building the result. These appear mutually
    inconsistent; this class may be legacy/unused — verify before relying on it.
    """
    def __init__(self, path, value_encoder=None):
        # NOTE(review): value_encoder is accepted but never stored/used here.
        self._path = path
        self._idx = None
        self._bin = None

    def built(self):
        return len(self) > 0

    def idx(self):
        # lazily open the key -> offset index
        if self._idx is None:
            self._idx = NumpyPosIndex(os.path.join(self._path, 'idx'))
        return self._idx

    def bin(self):
        # lazily open the compressed record file
        if self._bin is None:
            self._bin = open(os.path.join(self._path, 'bin'), 'rb')
        return self._bin

    def purge(self):
        """Close any open handles so they are re-opened fresh on next access."""
        if self._idx:
            self._idx.close()
            self._idx = None
        if self._bin:
            self._bin.close()
            self._bin = None

    @contextmanager
    def transaction(self):
        """Open a write transaction (commits on clean exit, discards on error)."""
        os.makedirs(self._path, exist_ok=True)
        with ZPickleDocStoreTransaction(self) as trans:
            yield trans

    def __getitem__(self, value):
        # value is either a key, or a (key, field) pair
        if isinstance(value, tuple) and len(value) == 2:
            key, field = value
        else:
            # assume key and all fields
            key, field = value, Ellipsis
        binf = self.bin()
        binf.seek(self.idx().get(key))
        content_length = int.from_bytes(binf.read(4), 'little')
        content = binf.read(content_length)
        content = zlib.decompress(content)
        content = pickle.loads(content)
        if content[0] != key:
            raise KeyError(f'key={key} not found')
        if field is Ellipsis:
            return dict(content[1:])
        for f, val in content[1:]:
            if field == f:
                return val
        raise KeyError(f'field={field} not found for key={key}')

    def path(self, force=True):
        return self._path

    def __iter__(self):
        # iterates documents
        binf = self.bin()
        binf.seek(0)
        while binf.read(1):  # peek
            binf.seek(-1)  # un-peek
            # NOTE(review): seek(-1) with default whence=SEEK_SET is invalid for
            # a regular file object — this looks like it should be seek(-1, 1);
            # confirm against the original module.
            content_length = int.from_bytes(binf.read(4), 'little')
            content = binf.read(content_length)
            content = zlib.decompress(content)
            content = pickle.loads(content)
            yield content[0], dict(content[1:])

    def __len__(self):
        # number of keys
        return len(self.idx())
return self._idx def tsv(self): if self._tsv is None: self._tsv = open(os.path.join(self._path, 'tsv'), 'rt') return self._tsv def purge(self): if self._idx: self._idx.close() self._idx = None if self._tsv: self._tsv.close() self._tsv = None @contextmanager def transaction(self): os.makedirs(self._path, exist_ok=True) with IndexedTsvDocStoreTransaction(self) as trans: yield trans def __getitem__(self, value): if isinstance(value, tuple) and len(value) == 2: key, field = value else: # assume key and all fields key = value field = ... record = {} tsv = self.tsv() tsv.seek(self.idx().get(key)) for line in tsv: cols = line.rstrip().split('\t') if len(cols) == 1: if cols[0] != key: break # end of doc else: continue # key verified l_field, l_text = cols if field is Ellipsis: if self._value_encoder == 'json': l_text = json.loads(l_text) record[l_field] = l_text else: if l_field == field: if self._value_encoder == 'json': l_text = json.loads(l_text) return l_text if field is Ellipsis: if not record: raise KeyError(f'key={key} not found') return record raise KeyError(f'key={key} field={field} not found') def path(self, force=True): return self._path def __iter__(self): # iterates documents tsv = self.tsv() tsv.seek(0) key = None doc = None for line in tsv: cols = line.rstrip().split('\t') if len(cols) == 1: if doc is not None: yield key, doc key = cols[0] doc = {} else: if self._value_encoder == 'json': cols[1] = json.loads(cols[1]) doc[cols[0]] = cols[1] if doc is not None: yield key, doc def __len__(self): # number of keys return len(self.idx()) class IndexedTsvDocStoreTransaction: def __init__(self, docstore): self.docstore = docstore self.path = self.docstore.path() self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) self.tsv = open(os.path.join(self.path, 'tsv'), 'wt') def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): if not exc_val: self.commit() else: self.discard() def commit(self): self.idx.commit() self.tsv.flush() 
class ZPickleDocStoreTransaction:
    """Write-side transaction for ZPickleKeyValueStore.

    Appends zlib-compressed pickled records (length-prefixed) to ``bin`` and
    records each record's byte offset in ``idx``.  Commits on clean context
    exit; discards the whole store directory on error.
    """

    def __init__(self, docstore):
        self.docstore = docstore
        self.path = docstore.path()
        self.idx = NumpyPosIndex(os.path.join(self.path, 'idx'))
        self.bin = open(os.path.join(self.path, 'bin'), 'wb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_val:
            self.discard()
        else:
            self.commit()

    def commit(self):
        # Persist the index, then make sure the record bytes hit disk.
        self.idx.commit()
        self.bin.flush()
        self.bin.close()

    def discard(self):
        # Roll back by removing the (partially written) store directory.
        shutil.rmtree(self.path)

    def add(self, key, fields):
        """Append ``fields`` (a namedtuple) under ``key``."""
        self.idx.add(key, self.bin.tell())
        payload = zlib.compress(pickle.dumps(tuple(zip(type(fields)._fields, fields))))
        self.bin.write(len(payload).to_bytes(4, 'little'))
        self.bin.write(payload)
class NumpyPosIndex:
    """Sorted key -> position index persisted as numpy memmaps.

    Writes go through an in-memory dict (``add``), then ``commit`` freezes the
    sorted keys into ``<path>.did`` / positions into ``<path>.pos`` plus a
    small ``<path>.meta`` text file holding the key width and count.
    """

    def __init__(self, path):
        self.path = path
        self.data = None      # pending writes: {doc_id: position}
        self.mmap1 = None     # memmap of encoded doc_ids
        self.mmap2 = None     # memmap of int64 positions
        self.doccount = None
        self.didlen = None
        self.np = ir_datasets.lazy_libs.numpy()

    def add(self, did, idx):
        """Stage a doc_id -> position mapping (persisted on commit)."""
        if self.data is None:
            self.data = {}
        self.data[did] = idx

    def commit(self):
        """Flush staged mappings to the memmap-backed files."""
        didlen = max(len(x) for x in self.data)
        ordered = sorted(self.data.items())
        # Use zero-terminated bytes here (S) rather than unicode type (U) because U includes a ton
        # of extra padding (for longer unicode formats), which can inflate the size of the index greatly.
        key_arr = self.np.array([k.encode('utf8') for k, _ in ordered], dtype=f'S{didlen}')
        pos_arr = self.np.array([p for _, p in ordered], dtype='int64')
        out_keys = self.np.memmap(f'{self.path}.did', dtype=key_arr.dtype, mode='w+', shape=key_arr.shape)
        out_keys[:] = key_arr[:]
        del out_keys
        out_poss = self.np.memmap(f'{self.path}.pos', dtype=pos_arr.dtype, mode='w+', shape=pos_arr.shape)
        out_poss[:] = pos_arr[:]
        del out_poss
        with ir_datasets.util.finialized_file(f'{self.path}.meta', 'wt') as f:
            f.write(f'{didlen} {len(self.data)}')
        self.data = None

    def _lazy_load(self):
        # Open the memmaps on first read, using the meta file for shapes.
        if self.mmap1 is None:
            with open(f'{self.path}.meta', 'rt') as f:
                didlen, doccount = f.read().split()
            self.didlen, self.doccount = int(didlen), int(doccount)
            self.mmap1 = self.np.memmap(f'{self.path}.did', dtype=f'S{self.didlen}', mode='r', shape=(self.doccount,))
            self.mmap2 = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,))

    def get(self, did):
        """Return the stored position for ``did``, or 0 when not found.

        NOTE(review): 0 is also a legitimate position, so a miss is
        indistinguishable from "first record" — callers must verify the key.
        """
        self._lazy_load()
        encoded = did.encode('utf8')
        loc = self.np.searchsorted(self.mmap1, encoded)
        if self.mmap1[loc] == encoded:
            return self.mmap2[loc]
        return 0

    def close(self):
        if self.mmap1 is not None:
            del self.mmap1
            self.mmap1 = None
        if self.mmap2 is not None:
            del self.mmap2
            self.mmap2 = None
        self.data = None

    def __iter__(self):
        # iterates keys in sorted order
        self._lazy_load()
        for i in range(len(self)):
            yield self.mmap1[i].decode('utf8')

    def __len__(self):
        # number of keys
        self._lazy_load()
        return self.doccount


def dir_size(path):
    """Recursively sum the sizes (bytes) of all regular files under ``path``."""
    total = 0
    for entry in os.scandir(path):
        if entry.is_file():
            total += entry.stat().st_size
        elif entry.is_dir():
            total += dir_size(entry.path)
    return total
class IndexedTsvDocstore:
    """Docstore facade over an IndexedTsvKeyValueStore (or compatible store),
    materializing namedtuple ``doc_cls`` instances from stored field dicts."""
    file_ext = 'itsv'

    def __init__(self, path, doc_cls, value_encoder='json', id_field='doc_id', store=IndexedTsvKeyValueStore):
        self._path = path
        self._doc_cls = doc_cls
        self._id_field = id_field
        self._id_field_idx = doc_cls._fields.index(id_field)
        self._store = store(path, value_encoder=value_encoder)

    def built(self):
        return os.path.exists(self._path)

    def purge(self):
        self._store.purge()

    def build(self, documents):
        """Write all ``documents`` in a single store transaction."""
        with self._store.transaction() as trans:
            for doc in documents:
                trans.add(doc[self._id_field_idx], doc)

    def get(self, did, field=None):
        """Return the doc for ``did`` (or just one field of it); raises KeyError if absent."""
        if field is not None:
            return self._store[did, field]
        result = self._store[did]
        return self._doc_cls(*(result[f] for f in self._doc_cls._fields))

    def get_many(self, dids, field=None):
        """Map each *found* did to its doc (or field); missing ids are skipped.

        Fix: the underlying store raises KeyError for a missing key (see
        IndexedTsvKeyValueStore.__getitem__), which the previous
        ``except ValueError`` did not catch — missing ids crashed the lookup
        instead of being omitted.  ValueError is still caught for
        backward compatibility with stores that raise it.
        """
        result = {}
        for did in dids:
            try:
                result[did] = self.get(did, field)
            except (KeyError, ValueError):
                pass
        return result

    def num_docs(self):
        return len(self._store)

    def docids(self):
        return iter(self._store.idx())

    def iter_docs(self):
        for did, fields in iter(self._store):
            yield self._doc_cls(*(fields[f] for f in self._doc_cls._fields))

    def path(self, force=True):
        return self._path

    def file_size(self):
        return dir_size(self._path)
def _read_next(f, data_cls):
    """Read one length-prefixed, lz4-block-compressed pickled record from ``f``."""
    lz4 = ir_datasets.lazy_libs.lz4_block()
    size = int.from_bytes(f.read(4), "little")
    raw = lz4.block.decompress(f.read(size))
    return data_cls(*pickle.loads(raw))


def _skip_next(f):
    """Advance ``f`` past the next length-prefixed record without decoding it."""
    size = int.from_bytes(f.read(4), "little")
    f.seek(size, io.SEEK_CUR)


def _write_next(f, record):
    """Append ``record`` (any iterable of fields) as a length-prefixed lz4 block."""
    lz4 = ir_datasets.lazy_libs.lz4_block()
    payload = lz4.block.compress(pickle.dumps(tuple(record)), store_size=True)
    f.write(len(payload).to_bytes(4, "little"))
    f.write(payload)


def safe_str(s):
    """Strip ``s`` down to characters that are safe in a file name (alnum + underscore)."""
    return "".join(c for c in s if c.isalnum() or c == "_")


class Lz4PickleIter:
    """Sliced sequential iterator over the records of an Lz4PickleLookup."""

    def __init__(self, lookup, slice):
        self.next_index = 0
        self.lookup = lookup
        self.slice = slice
        self.bin = None
        self.pos_idx = None

    def __next__(self):
        if self.slice.start >= self.slice.stop:
            raise StopIteration
        if self.bin is None:
            self.bin = self.lookup.bin()
        # Fast -- lookup keeps track of position of each index
        if self.pos_idx is None:
            self.pos_idx = self.lookup.pos()
        # get the new position in the bin
        new_pos = self.pos_idx[self.slice.start][0]
        if new_pos == -1:
            # start of the iter (self.next_index == self.slice.start == 0)
            new_pos = 0
        # this seek is smart -- if already in buffer, skips to that point
        self.bin.seek(new_pos)
        self.next_index = self.slice.start
        record = _read_next(self.bin, self.lookup._doc_cls)
        self.next_index += 1
        self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
        return record

    def __iter__(self):
        return self

    def __del__(self):
        # Intentionally a no-op: handles are owned by the lookup, not this iterator.
        pass

    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step]
            return Lz4PickleIter(self.lookup, ir_datasets.util.apply_sub_slice(self.slice, key))
        elif isinstance(key, int):
            # it[index]
            sub = ir_datasets.util.slice_idx(self.slice, key)
            try:
                return next(Lz4PickleIter(self.lookup, sub))
            except StopIteration as e:
                raise IndexError(e)
        raise TypeError("key must be int or slice")
class Lz4PickleLookup:
    """Random-access lookup over an append-only file of lz4-pickled records.

    Layout inside ``path``: ``bin`` (the records), ``bin.pos`` (record
    positions, NumpyPosIndex), ``bin.meta`` (space-joined field names, used to
    detect schema drift), and one ``idx.<field>`` NumpySortedIndex per indexed
    field.  ``file_access`` selects plain file / mmap / fully-in-memory reads.
    """
    def __init__(
        self,
        path,
        doc_cls,
        key_field,
        index_fields,
        key_field_prefix=None,
        file_access=FileAccess.FILE,
    ):
        self._path = path
        self._key_field = key_field
        self._key_idx = doc_cls._fields.index(key_field)
        self._index_fields = list(index_fields)
        self._doc_cls = doc_cls
        self._bin = None
        self._bin_path = os.path.join(self._path, "bin")
        self._pos = None
        self._pos_path = os.path.join(self._path, "bin.pos")
        self._idx = None
        self._idx_path = os.path.join(self._path, f"idx.{safe_str(self._key_field)}")
        self._key_field_prefix = key_field_prefix
        self._meta_path = os.path.join(self._path, "bin.meta")
        self._file_access = file_access
        # check that the fields match
        meta_info = " ".join(doc_cls._fields)
        if os.path.exists(self._meta_path):
            with open(self._meta_path, "rt") as f:
                existing_meta = f.read()
                assert (
                    existing_meta == meta_info
                ), f"fields do not match; you may need to re-build this store {path}"

    def bin(self):
        """Lazily open the record file using the configured access mode."""
        if self._bin is None:
            if self._file_access == FileAccess.FILE:
                _logger.info(f"Opening {self._bin_path} with direct file access")
                self._bin = open(self._bin_path, "rb")
            elif self._file_access == FileAccess.MEMORY:
                _logger.info(f"Opening {self._bin_path} in memory")
                with open(self._bin_path, "rb") as f:
                    data = bytearray(f.read())  # mutable buffer
                self._bin = io.BytesIO(data)  # Use the same buffer
            elif self._file_access == FileAccess.MMAP:
                _logger.info(f"Opening {self._bin_path} with MMAP")
                f = open(self._bin_path, "rb")
                try:
                    self._bin = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
                finally:
                    f.close()  # mapping stays valid after this
            else:
                assert False, f"File access {self._file_access} not supported / {FileAccess.FILE}"
        return self._bin

    def pos(self):
        # lazily open the record-position index
        if self._pos is None:
            self._pos = NumpyPosIndex(self._pos_path, file_access=self._file_access)
        return self._pos

    def idx(self):
        # lazily open the key-field index
        if self._idx is None:
            self._idx = NumpySortedIndex(self._idx_path, file_access=self._file_access)
        return self._idx

    def close(self):
        """Close all open handles; they re-open lazily on next use."""
        if self._idx:
            self._idx.close()
            self._idx = None
        if self._pos:
            self._pos.close()
            self._pos = None
        if self._bin:
            self._bin.close()
            self._bin = None

    def clear(self):
        """Delete the store's files on disk."""
        self.close()
        if os.path.exists(self._bin_path):
            os.remove(self._bin_path)
        if os.path.exists(self._pos_path):
            os.remove(self._pos_path)
        NumpySortedIndex(self._idx_path).clear()

    def __del__(self):
        self.close()

    @contextmanager
    def transaction(self):
        """Open an append transaction; writes the meta (schema) file on first use."""
        if not os.path.exists(self._path):
            os.makedirs(self._path, exist_ok=True)
        if not os.path.exists(self._meta_path):
            meta_info = " ".join(self._doc_cls._fields)
            with open(self._meta_path, "wt") as f:
                f.write(meta_info)
        with Lz4PickleTransaction(self) as trans:
            yield trans

    def __getitem__(self, values):
        """Generator over the records matching ``values`` (a key or iterable of keys)."""
        if isinstance(values, str):
            values = (values,)
        # for removing long doc_id prefixes
        if self._key_field_prefix:
            values = [
                v[len(self._key_field_prefix) :]
                for v in values
                if v.startswith(self._key_field_prefix)
            ]
        poss = self.idx()[values]
        poss = sorted(poss)  # go though the file in increasing order-- better for HDDs
        binf = None
        for pos in poss:
            if pos == -1:
                continue  # not found
            if binf is None:
                binf = self.bin()
            binf.seek(pos)
            yield _read_next(binf, self._doc_cls)

    def path(self, force=True):
        return self._path

    def __iter__(self):
        return Lz4PickleIter(self, slice(0, len(self), 1))

    def __len__(self):
        # number of keys
        return len(self.pos())
class Lz4PickleTransaction:
    """Append transaction for Lz4PickleLookup.

    Holds an exclusive ``fcntl`` lock on the bin file for the transaction's
    lifetime (where fcntl is available); commits indexes on clean exit and
    truncates appended bytes on error.
    """
    def __init__(self, lookup):
        self.lookup = lookup
        self.path = self.lookup.path()
        self.bin = None
        self.pos = None
        self.idxs = None
        self.start_pos = None

    def __enter__(self):
        self.bin = open(self.lookup._bin_path, "ab")
        if fcntl:
            fcntl.lockf(self.bin, fcntl.LOCK_EX)
        self.start_pos = self.bin.tell()  # for rolling back
        # NOTE(review): this NumpyPosIndex (numpy_sorted_index module) takes a
        # single position in add(), unlike the same-named class in
        # indexed_tsv_docstore — keep the two distinct.
        self.pos = NumpyPosIndex(self.lookup._pos_path)
        self.idxs = []
        for index_field in self.lookup._index_fields:
            idx_path = os.path.join(self.lookup._path, f"idx.{safe_str(index_field)}")
            self.idxs.append(NumpySortedIndex(idx_path))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # idxs is None when __enter__ never completed (or after commit/rollback)
        if self.idxs is not None:
            if not exc_val:
                self.commit()
            else:
                self.rollback()

    def commit(self):
        """Persist indexes, flush records, release the lock."""
        self.pos.commit()
        self.pos = None
        for idx in self.idxs:
            idx.commit()
        self.idxs = None
        self.bin.flush()
        if fcntl:
            fcntl.lockf(self.bin, fcntl.LOCK_UN)
        self.bin.close()
        self.bin = None

    def rollback(self):
        """Undo this transaction's appends and drop the staged index entries."""
        self.bin.truncate(self.start_pos)  # remove appended content
        if fcntl:
            fcntl.lockf(self.bin, fcntl.LOCK_UN)
        self.bin.close()
        self.bin = None
        self.pos.close()
        self.pos = None
        for idx in self.idxs:
            idx.close()
        self.idxs = None

    def add(self, record):
        """Append ``record``, registering its position in every index field."""
        bin_pos = self.bin.tell()
        self.pos.add(bin_pos)
        for idx, field in zip(self.idxs, self.lookup._index_fields):
            value = getattr(record, field)
            # remove long doc_id prefixes to cut down on storage
            if field == self.lookup._key_field and self.lookup._key_field_prefix:
                assert value.startswith(self.lookup._key_field_prefix)
                value = value[len(self.lookup._key_field_prefix) :]
            idx.add(value, bin_pos)
        _write_next(self.bin, record)
class NumpySortedIndex:
    """On-disk sorted key -> byte-offset index backed by numpy arrays.

    Keys are stored as fixed-width zero-terminated bytes in ``<path>.key`` and
    matching int64 offsets in ``<path>.pos``; ``<path>.meta`` records the key
    width and entry count. Lookups binary-search a memory map (or an in-memory
    copy when ``file_access == FileAccess.MEMORY``).
    """

    def __init__(self, path, file_access=FileAccess.MMAP):
        self.path = path
        self.transaction = None  # pending {str key: int offset} adds
        self.mmap_keys = None
        self.mmap_poss = None
        self.doccount = None
        self.keylen = None
        self.np = None  # numpy module, loaded lazily to keep import cost low
        self.file_access = file_access

    def add(self, key, idx):
        """Queue ``key -> idx``; takes effect on commit()."""
        if self.transaction is None:
            self.transaction = {}
        self.transaction[key] = idx

    def commit(self):
        """Merge pending adds with any existing entries and rewrite the files."""
        self._lazy_load()
        if self.transaction is None:
            return
        transaction = sorted(self.transaction.items())
        transaction = [(x[0].encode('utf8'), x[1]) for x in transaction]
        if self._exists():
            # keep existing entries unless this transaction overwrote them
            transaction += [(k, p) for k, p in zip(self.mmap_keys, self.mmap_poss) if (k.decode() not in self.transaction)]
            transaction = sorted(transaction)
        keys = [x[0] for x in transaction]
        poss = [x[1] for x in transaction]
        self.keylen = max(len(k) for k in keys)
        self.doccount = len(keys)
        # Use zero-terminated bytes here (S) rather than unicode type (U) because U includes a ton
        # of extra padding (for longer unicode formats), which can inflate the size of the index greatly.
        keys = self.np.array(keys, dtype=f'S{self.keylen}')
        poss = self.np.array(poss, dtype='int64')
        self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=keys.dtype, mode='w+', shape=keys.shape)
        self.mmap_keys[:] = keys[:]
        self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype=poss.dtype, mode='w+', shape=poss.shape)
        self.mmap_poss[:] = poss[:]
        with ir_datasets.util.finialized_file(f'{self.path}.meta', 'wt') as f:
            f.write(f'{self.keylen} {self.doccount}')
        self.transaction = None

    def _exists(self):
        return os.path.exists(f'{self.path}.key')

    def _lazy_load(self):
        """Import numpy and map the index files on first use."""
        if self.np is None:
            self.np = ir_datasets.lazy_libs.numpy()
        if self.mmap_keys is None and self._exists():
            with open(f'{self.path}.meta', 'rt') as f:
                self.keylen, self.doccount = f.read().split()
                self.keylen, self.doccount = int(self.keylen), int(self.doccount)
            if self.file_access == FileAccess.MEMORY:
                self.mmap_keys = self.np.fromfile(f'{self.path}.key', dtype=f'S{self.keylen}', count=self.doccount)
                self.mmap_poss = self.np.fromfile(f'{self.path}.pos', dtype='int64', count=self.doccount)
            else:
                self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,))
                self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,))

    def __getitem__(self, keys):
        """Return byte offsets for ``keys`` (str or iterable of str); -1 marks misses."""
        self._lazy_load()
        if isinstance(keys, str):
            keys = (keys,)
        if not self._exists():
            return [-1 for _ in keys]
        keys = self.np.array([key.encode('utf8') for key in keys], dtype=f'S{self.keylen}')
        locs = self.np.searchsorted(self.mmap_keys, keys)
        locs[locs >= self.mmap_keys.shape[0]] = self.mmap_keys.shape[0] - 1  # could be placed AFTER existing keys
        mask = self.mmap_keys[locs] == keys
        return ((self.mmap_poss[locs] * mask) + (~mask * -1)).tolist()

    def close(self):
        # Release the memory maps. (Removed a stray `self.data = None` -- no
        # other code in this class reads or writes `self.data`, so it was dead.)
        if self.mmap_keys is not None:
            del self.mmap_keys
            self.mmap_keys = None
        if self.mmap_poss is not None:
            del self.mmap_poss
            self.mmap_poss = None

    def clear(self):
        """Close and delete all on-disk files for this index."""
        self.close()
        for file in ['meta', 'key', 'pos']:
            path = f'{self.path}.{file}'
            if os.path.exists(path):
                os.remove(path)

    def __del__(self):
        self.close()

    def __iter__(self):
        # iterates keys
        self._lazy_load()
        if self._exists():
            for i in range(len(self)):
                yield self.mmap_keys[i].decode('utf8')

    def __len__(self):
        # number of keys
        self._lazy_load()
        if self._exists():
            return self.doccount
        return 0
class NumpyPosIndex:
    """Append-only array of int64 byte offsets (ordinal -> offset), memmapped from disk."""

    def __init__(self, path, file_access=FileAccess.MMAP):
        self.path = path
        self.file_access = file_access
        self.transaction = None  # pending list of offsets
        self.mmap = None
        self.np = None  # numpy module, loaded lazily

    def add(self, idx):
        """Queue an offset; takes effect on commit()."""
        if self.transaction is None:
            self.transaction = []
        self.transaction.append(idx)

    def commit(self):
        """Append queued offsets to the file (extending it if it already exists)."""
        self._lazy_load()
        if self.transaction is None:
            return
        if self.mmap is not None:
            del self.mmap
            self.mmap = None
        if os.path.exists(self.path):
            current_count = os.stat(self.path).st_size // 8  # 8 bytes per int64
            mmap = self.np.memmap(self.path, dtype='int64', mode='r+', shape=(current_count + len(self.transaction),))
        else:
            mmap = self.np.memmap(self.path, dtype='int64', mode='w+', shape=(len(self.transaction),))
        mmap[-len(self.transaction):] = self.np.array(self.transaction, dtype='int64')
        del mmap  # flushes the memmap to disk
        self.transaction = None

    def _exists(self):
        return os.path.exists(self.path)

    def _lazy_load(self):
        """Import numpy and map the offsets file on first use."""
        if self.np is None:
            self.np = ir_datasets.lazy_libs.numpy()
        if self.mmap is None and self._exists():
            current_count = os.stat(self.path).st_size // 8
            if self.file_access == FileAccess.MEMORY:
                self.mmap = self.np.fromfile(self.path, dtype='int64', count=current_count)
            else:
                self.mmap = self.np.memmap(self.path, dtype='int64', mode='r', shape=(current_count,))

    def __getitem__(self, idxs):
        """Return offsets for the given ordinals; -1 for out-of-range ordinals."""
        self._lazy_load()
        if isinstance(idxs, int):
            idxs = (idxs,)
        if not self._exists():
            return [-1 for _ in idxs]
        idxs = self.np.array(idxs)
        # BUG FIX: was `idxs > 0 & idxs < self.mmap.shape[0]`. `&` binds tighter
        # than comparisons, so that parsed as the chained comparison
        # `idxs > (0 & idxs) < shape`, which raises ValueError for arrays of
        # length > 1. Also use >= 0: ordinal 0 is a valid position.
        mask = (idxs >= 0) & (idxs < self.mmap.shape[0])
        # Clip before gathering so out-of-range ordinals don't raise IndexError;
        # the mask maps them to -1 afterwards.
        safe_idxs = self.np.clip(idxs, 0, self.mmap.shape[0] - 1)
        return ((self.mmap[safe_idxs] * mask) + (~mask * -1)).tolist()

    def close(self):
        if self.mmap is not None:
            del self.mmap
            self.mmap = None

    def clear(self):
        """Close and delete the on-disk file for this index."""
        self.close()
        if os.path.exists(self.path):
            os.remove(self.path)

    def __del__(self):
        self.close()

    def __iter__(self):
        # iterates offsets in insertion order
        self._lazy_load()
        if self._exists():
            for i in range(len(self)):
                yield self.mmap[i]

    def __len__(self):
        # number of stored offsets
        self._lazy_load()
        if self._exists():
            return self.mmap.shape[0]
        return 0
class ZPickleDocStoreTransaction:
    """Write transaction for a ZPickleKeyValueStore: builds the idx/bin pair."""

    def __init__(self, docstore):
        self.docstore = docstore
        self.path = self.docstore.path()
        self.idx = NumpyPosIndex(os.path.join(self.path, 'idx'))
        self.bin = open(os.path.join(self.path, 'bin'), 'wb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # commit on clean exit, discard everything on error
        if exc_val:
            self.discard()
        else:
            self.commit()

    def commit(self):
        self.idx.commit()
        self.bin.flush()
        self.bin.close()

    def discard(self):
        # abandon the partially-written store entirely
        shutil.rmtree(self.path)

    def add(self, key, fields):
        """Append one record: 4-byte little-endian length + zlib-compressed pickle."""
        self.idx.add(key, self.bin.tell())
        payload = zlib.compress(pickle.dumps(tuple(zip(type(fields)._fields, fields))))
        self.bin.write(len(payload).to_bytes(4, 'little'))
        self.bin.write(payload)


class ZPickleDocStore:
    """Docstore of zlib-compressed pickled records, addressed by a doc id field."""

    file_ext = 'zpkl'

    def __init__(self, path, doc_cls, id_field='doc_id'):
        self._path = path
        self._doc_cls = doc_cls
        self._id_field = id_field
        self._id_field_idx = doc_cls._fields.index(id_field)
        self._store = ZPickleKeyValueStore(path, self._id_field_idx, self._doc_cls)

    def built(self):
        return os.path.exists(self._path)

    def purge(self):
        self._store.purge()

    def build(self, documents):
        """Write all *documents* into the store in one transaction."""
        with self._store.transaction() as trans:
            for doc in documents:
                trans.add(doc[self._id_field_idx], doc)

    def get(self, did, field=None):
        return self._store[did] if field is None else self._store[did, field]

    def get_many(self, dids, field=None):
        """Return a dict of did -> doc (or field value) for the ids that resolve."""
        result = {}
        for did in dids:
            try:
                result[did] = self.get(did, field)
            except ValueError:
                # NOTE(review): the key-value store raises KeyError for missing
                # ids, which would propagate past this handler -- confirm that
                # ValueError is really the intended exception to swallow here.
                pass
        return result

    def num_docs(self):
        return len(self._store)

    def docids(self):
        return iter(self._store.idx())

    def __iter__(self):
        return iter(self._store)

    def path(self, force=True):
        return self._path
# These libraries can add a bunch of overhead when imported -- which is bad for command line
# utilities. This file loads them lazily if they are needed.
_cache = {}


def _lazy(key, module, requirement=None, extra=None, top=False):
    """Import *module* once and memoize it in ``_cache`` under *key*.

    When *requirement* is given, an ImportError is re-raised with the standard
    "pip install ir_datasets[extra]" hint. When *top* is set, the top-level
    package is cached instead of the imported submodule (e.g. ``lz4`` rather
    than ``lz4.block``), matching how callers use these handles.
    """
    if key not in _cache:
        import importlib
        try:
            mod = importlib.import_module(module)
        except ImportError as ie:
            if requirement is None:
                raise
            raise ImportError(f"This dataset requires {requirement}. Run 'pip install ir_datasets[{extra}]' to install dependencies for this dataset") from ie
        if top:
            mod = importlib.import_module(module.partition('.')[0])
        _cache[key] = mod
    return _cache[key]


def numpy():
    return _lazy('numpy', 'numpy')


def tqdm():
    return _lazy('tqdm', 'tqdm')


def requests():
    return _lazy('requests', 'requests')


def bs4():
    return _lazy('bs4', 'bs4', 'beautifulsoup4', 'beautifulsoup4')


def inscriptis():
    return _lazy('inscriptis', 'inscriptis', 'inscriptis', 'inscriptis')


def yaml():
    return _lazy('yaml', 'yaml')


def json():
    return _lazy('json', 'json')


def trec_car():
    # importing trec_car.read_data, but the top-level package is what callers use
    return _lazy('trec_car', 'trec_car.read_data', 'trec-car-tools', 'car', top=True)


def warc():
    return _lazy('warc', 'warc', 'warc', 'warc')


def warc_clueweb09():
    return _lazy('warc_clueweb09', 'warc3_wet_clueweb09', 'warc', 'warc')


def lz4_block():
    # callers access .block off the returned top-level lz4 package
    return _lazy('lz4_block', 'lz4.block', top=True)


def lz4_frame():
    # callers access .frame off the returned top-level lz4 package
    return _lazy('lz4_frame', 'lz4.frame', top=True)


def zlib_state():
    return _lazy('zlib_state', 'zlib_state', 'zlib-state', 'zlib-state')


def xml_etree():
    return _lazy('xml_etree', 'xml.etree.ElementTree')


def lxml_html():
    return _lazy('lxml_html', 'lxml.html')


def ijson():
    return _lazy('ijson', 'ijson', 'ijson', 'ijson')


def pyautocorpus():
    return _lazy('pyautocorpus', 'pyautocorpus', 'pyautocorpus', 'pyautocorpus')


def unlzw3():
    return _lazy('unlzw3', 'unlzw3', 'unlzw3', 'unlzw3')


def pyarrow_parquet():
    return _lazy('pyarrow_parquet', 'pyarrow.parquet', 'pyarrow', 'pyarrow')
def log(self, level, text, **kwargs): self.logger().log(LOGGER_LEVELS.get(level, 50), text, **kwargs) def pbar(self, it, *args, **kwargs): level = kwargs.pop('level', 'INFO') quiet = kwargs.pop('quiet', False) if 'ncols' not in kwargs: kwargs['ncols'] = 80 if 'desc' in kwargs: if not quiet: self.log(level, '[starting] {desc}'.format(**kwargs)) if 'leave' not in kwargs: kwargs['leave'] = False if 'total' not in kwargs and hasattr(it, '__len__'): kwargs['total'] = len(it) elif 'total' not in kwargs and hasattr(it, '__length_hint__'): kwargs['total'] = operator.length_hint(it) if 'smoothing' not in kwargs: kwargs['smoothing'] = 0. # disable smoothing by default; mean over entire life of pbar pbar = ir_datasets.lazy_libs.tqdm().tqdm(it, *args, **kwargs) try: yield from pbar except: if not quiet: pbar.bar_format = '{desc}: [{elapsed}] [{n_fmt}{unit}] [{rate_fmt}]' self.log(level, '[error] ' + str(pbar)) raise else: if not quiet: pbar.bar_format = '{desc}: [{elapsed}] [{n_fmt}{unit}] [{rate_fmt}]' self.log(level, '[finished] ' + str(pbar)) @contextmanager def pbar_raw(self, *args, **kwargs): level = kwargs.pop('level', 'INFO') quiet = kwargs.pop('quiet', False) if 'total' not in kwargs and 'total_from' in kwargs: total_from = kwargs.pop('total_from') if hasattr(total_from, '__len__'): kwargs['total'] = len(total_from) elif hasattr(total_from, '__length_hint__'): kwargs['total'] = operator.length_hint(total_from) else: raise ValueError('total_from does not have __len__ or __length_hint__') if 'ncols' not in kwargs: kwargs['ncols'] = 80 if 'desc' in kwargs: if not quiet: self.log(level, '[starting] {desc}'.format(**kwargs)) if 'leave' not in kwargs: kwargs['leave'] = False if 'smoothing' not in kwargs: kwargs['smoothing'] = 0. 
def easy(name=None):
    """Return a ``Logger`` with the given *name*.

    Note: despite what earlier docs suggested, this does NOT inspect the
    caller's ``__name__``; when *name* is None the logger is simply named
    "ir_datasets".
    """
    return Logger(name)


def format_interval(t):
    """Format a duration *t* (seconds) for display.

    Adapted from tqdm.format_interval, but with better support for short
    durations (under 1min): seconds get two decimals and sub-second durations
    are shown in milliseconds.
    """
    mins, s = divmod(t, 60)
    h, m = divmod(int(mins), 60)
    if h:
        return '{0:d}:{1:02d}:{2:02.0f}'.format(h, m, s)
    if m:
        return '{0:02d}:{1:02.0f}'.format(m, s)
    if s >= 1:
        return '{0:.2f}s'.format(s)
    return '{0:.0f}ms'.format(s * 1000)
def tmp_path():
    """Return (creating if needed) the ir_datasets temp directory."""
    p = Path(os.environ.get('IR_DATASETS_TMP', os.path.join(tempfile.gettempdir(), 'ir_datasets')))
    if not p.exists():
        # per #107, we likely need both the exists check AND the exist_ok for occasional failures when directory is linked to NFS
        p.mkdir(parents=True, exist_ok=True)
    return p


def home_path():
    """Return (creating if needed) the ir_datasets home directory."""
    p = Path(os.environ.get('IR_DATASETS_HOME', Path.home() / '.ir_datasets'))
    if not p.exists():
        # per #107, we likely need both the exists check AND the exist_ok for occasional failures when directory is linked to NFS
        p.mkdir(parents=True, exist_ok=True)
    return p


@contextmanager
def finialized_file(path, mode):
    """Write to ``<path>.tmp`` and atomically rename to *path* on success.

    On error, the temp file is removed (best-effort) and the exception re-raised.
    """
    if path == os.devnull:
        with open(path, mode) as f:
            yield f
    else:
        try:
            with open(f'{path}.tmp', mode) as f:
                yield f
            os.replace(f'{path}.tmp', path)
        except:
            try:
                os.remove(f'{path}.tmp')
            except:
                pass  # best-effort cleanup; the original error matters more
            raise


class Lazy:
    """Thread-safe, call-once wrapper around a zero-argument function."""

    def __init__(self, fn):
        self._lock = Lock()
        self._fn = fn
        self._loaded = False
        self._result = None

    def __call__(self):
        if not self._loaded:
            with self._lock:
                if not self._loaded:  # repeat condition from above in thread-safe way
                    self._result = self._fn()
                    self._loaded = True
        return self._result

    @property
    def is_loaded(self):
        return self._loaded


def apply_sub_slice(orig_slice: slice, new_slice: slice):
    """Compose *new_slice* applied within *orig_slice*, returning an absolute slice.

    ``new_slice`` bounds may be ints (offsets within the original range;
    negatives count from the end) or floats in [0, 1] (fractions of the
    original range). Fractions and negative offsets require
    ``orig_slice.stop`` to be known.
    """
    start, stop, step = None, None, None
    orig_start = orig_slice.start or 0  # treat an unknown start as 0
    if new_slice.start is not None:
        if isinstance(new_slice.start, int):
            if new_slice.start < 0:
                if orig_slice.stop is None:
                    raise ValueError('start cannot be negative with unknown size')
                start = orig_slice.stop + new_slice.start
            else:
                start = orig_start + new_slice.start
        elif isinstance(new_slice.start, float):
            if orig_slice.stop is None:
                raise ValueError('start cannot be float with unknown size')
            # BUG FIX: this previously validated new_slice.stop instead of
            # new_slice.start, crashing (TypeError) when stop was None.
            if not (0. <= new_slice.start <= 1.):
                raise ValueError('start must be in interval [0,1] if float')
            size = orig_slice.stop - orig_start
            start = math.floor((new_slice.start * size) + orig_start)
        else:
            raise TypeError('start must be int or float')
    else:
        start = orig_slice.start
    if new_slice.stop is not None:
        if isinstance(new_slice.stop, int):
            if new_slice.stop < 0:
                if orig_slice.stop is None:
                    raise ValueError('stop cannot be negative with unknown size')
                stop = orig_slice.stop + new_slice.stop
            else:
                stop = orig_start + new_slice.stop
                if orig_slice.stop is not None:  # clamp to the original range when its end is known
                    stop = min(stop, orig_slice.stop)
        elif isinstance(new_slice.stop, float):
            if orig_slice.stop is None:
                raise ValueError('stop cannot be float with unknown size')
            if not (0. <= new_slice.stop <= 1.):
                raise ValueError('stop must be in interval [0,1] if float')
            size = orig_slice.stop - orig_start
            stop = math.floor((new_slice.stop * size) + orig_start)
        else:
            raise TypeError('stop must be int or float')
    else:
        stop = orig_slice.stop
    if new_slice.step is not None:
        if isinstance(new_slice.step, int):
            if new_slice.step <= 0:
                raise ValueError('step must be a positive')
            step = (orig_slice.step or 1) * new_slice.step
        else:
            raise TypeError('step must be int')
    else:
        step = orig_slice.step
    return slice(start, stop, step)


def slice_idx(orig_slice: slice, index: int):
    """Return a one-element slice addressing *index* within *orig_slice*."""
    if index >= 0:
        index = (orig_slice.start or 0) + index
    else:
        if orig_slice.stop is None:
            raise ValueError('index cannot be negative with unknown size')
        index = orig_slice.stop + index
    if orig_slice.stop is not None:
        return slice(index, min(index + 1, orig_slice.stop))
    return slice(index, index + 1)


class DocstoreSplitter:
    """Iterator over docs that falls back to the docstore when sliced."""

    def __init__(self, it, docs_store):
        self.it = it
        self.docs_store = docs_store

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.it)

    def __getitem__(self, key):
        # slicing is delegated to the (slice-aware) docstore iterator
        return iter(self.docs_store)[key]
def use_docstore(fn):
    # For use as an @annotation on docs_iter:
    # use docs_store if it's already built, otherwise only use it if the user
    # specifies a split (docs_it[split])
    @functools.wraps(fn)
    def wrapper(self):
        docs_store = self.docs_store()
        if docs_store.built():
            return iter(docs_store)  # iterate from the docstore -- really fast
        return DocstoreSplitter(fn(self), docs_store)  # avoid building docstore if not needed
    return wrapper


class Migrator:
    """Wraps a dataset component; before the first wrapped call, deletes
    *affected_files* and records *version* in *version_file* whenever the
    recorded version differs."""

    def __init__(self, version_file, version, affected_files, message=None, wrapped=None):
        self._wrapped = wrapped
        self._version_file = Path(version_file)
        self._version = version
        self._affected_files = affected_files
        self._message = message
        self._state = 'NOT_CHECKED'

    def __getattr__(self, attr):
        # proxy attribute access; callables get the migration check bolted on
        item = getattr(self._wrapped, attr)
        if callable(item):
            item = self._migrate(item)
        return item

    def __call__(self, wrapped):
        # used as a factory/decorator: bind the wrapped component
        return Migrator(self._version_file, self._version, self._affected_files, self._message, wrapped)

    def _migrate(self, fn):
        # optionally wrap the function to perform cleanup of affected files
        if not self._state == 'OK':
            @functools.wraps(fn)
            def wrapped(*args, **kwargs):
                if not self._state == 'OK':
                    self._version_file.parent.mkdir(parents=True, exist_ok=True)
                    if not self._version_file.exists() or self._read_version() != self._version:
                        self._state = 'IN_PROGRESS'
                        paths_to_remove = [f for f in self._affected_files if os.path.exists(f)]
                        if paths_to_remove:
                            if self._message:
                                _logger.info(self._message)
                            for file in paths_to_remove:
                                if Path(file).is_file():
                                    os.unlink(file)
                                else:
                                    shutil.rmtree(file)
                        with self._version_file.open('wt') as f:
                            f.write(self._version)
                    self._state = 'OK'
                return fn(*args, **kwargs)
            return wrapped
        return fn

    def _read_version(self):
        with self._version_file.open('rt') as f:
            return f.read()


def check_disk_free(target_path, required_size, message='Insufficient disk space: {target_path} requires {required_size_fmt} but only {free_size_fmt} is available ({missing_size_fmt} more needed)'):
    """
    Checks if there is required_size bytes available on the device associated with target_path
    (or the closest target_path parent that exists if it doesn't exist). If there isn't enough
    space, throws an error with the specified message (or generic default message). The check
    is skipped if IR_DATASETS_SKIP_DISK_FREE is true.
    """
    skip = os.environ.get('IR_DATASETS_SKIP_DISK_FREE', 'false').lower() == 'true'
    if skip:
        return
    path = Path(target_path)
    while not path.exists():  # walk up to the nearest existing parent
        path = path.parent
    _, _, free_size = shutil.disk_usage(str(path))
    if free_size < required_size:
        missing_size = required_size - free_size
        raise ValueError(message.format(
            target_path=target_path,
            required_size=required_size,
            required_size_fmt=format_file_size(required_size),
            missing_size=missing_size,
            missing_size_fmt=format_file_size(missing_size),
            free_size=free_size,
            free_size_fmt=format_file_size(free_size)))


def format_file_size(size):
    """Format a byte count with decimal (power-of-1000) units, one decimal above bytes."""
    unit = '{:.0f}B'
    units = ['{:.1f}KB', '{:.1f}MB', '{:.1f}GB', '{:.1f}TB']
    while units and size > 1000:
        size = size / 1000
        unit = units.pop(0)
    return unit.format(size)


def ws_tok(s):
    """Yield whitespace-separated tokens of *s* after mapping non-alphanumerics to spaces.

    Rewritten from a manual finditer scan (which also shadowed the builtin
    ``next``) to the equivalent str.split(); after the substitution the string
    contains only [A-Za-z0-9 ], so splitting on whitespace is identical.
    """
    s = re.sub('[^A-Za-z0-9 ]', ' ', s)
    yield from s.split()
_logger = ir_datasets.log.easy()


class IRDSDocuments(BaseDocs):
    """Document collection based on another ir_datasets one"""

    def __init__(self, ds_id: str):
        """Construct a new lazy docs

        :param ds_id: The ID of the ir_datasets collection
        """
        self._ds_id = ds_id

    @cached_property
    def docs(self):
        # resolved on first access so merely constructing this object is cheap
        return ir_datasets.load(self._ds_id)

    def docs_cls(self):
        return self.docs.docs_cls()

    def docs_lang(self):
        return self.docs.docs_lang()

    def docs_count(self):
        return self.docs.docs_count()

    def docs_iter(self):
        return self.docs.docs_iter()


class LazyDocs(IRDSDocuments):
    """Proxy for a IR dataset collection"""

    def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
        return self.docs.docs_store(field=field, options=options)


class DirectAccessDocs(Protocol):
    def __call__(self) -> Sequence:
        """Returns a sequence of documents"""
        ...


class DocsListView:
    """Read-only view over a slice of a DocsList.

    BUG FIX: the previous implementation iterated the ``slice`` object itself
    (slice objects are not iterable), called the two-parameter constructor with
    three arguments when sub-slicing, and ignored the view's offset for integer
    access. Positions are now derived with ``slice.indices(len(docs))``.
    """

    def __init__(self, docs: "DocsList", slice: slice):
        self._docs = docs
        # range() keeps the view O(1) in memory and supports further slicing
        self._indices = range(*slice.indices(len(docs)))

    def __len__(self):
        return len(self._indices)

    def __getitem__(self, key: Union[int, slice]):
        if isinstance(key, int):
            return self._docs.get(self._indices[key])
        # slicing a range yields a range, so views compose without copying
        view = DocsListView.__new__(DocsListView)
        view._docs = self._docs
        view._indices = self._indices[key]
        return view


class DocsList(ABC):
    """A document list"""

    @abstractmethod
    def get(self, ix: int):
        ...

    @abstractmethod
    def __len__(self):
        ...

    def __getitem__(self, key: Union[int, slice]):
        if isinstance(key, int):
            return self.get(key)
        return DocsListView(self, key)
def __getitem__(self, slice: Union[int, slice]): if isinstance(slice, int): return self.get(slice) return DocsListView(self, slice) class LazyDocsIter: """Lazy document iterator: materializes the list only if a specific index is requested""" def __init__(self, _get_list_fn: DirectAccessDocs, iter): self._get_list_fn = _get_list_fn self._iter = iter @cached_property def _list(self): return self._get_list_fn() def __getitem__(self, slice: Union[int, slice]): return self._list[slice] def __iter__(self): return self def __next__(self): return next(self._iter) class BaseTransformedDocs(BaseDocs): def __init__(self, docs: BaseDocs, cls, store_name, count=None): """Document collection tranformed using a transform function :param docs: The base documents :param store_name: The name of the LZ4 document store """ self._docs = docs self._cls = cls self._store_name = store_name self._count_hint = count def docs_cls(self): return self._cls def docs_lang(self): return self._docs.docs_lang() def docs_count(self): return self._count_hint or self._docs.docs_count() @lru_cache def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f"{ir_datasets.util.home_path()}/{self._store_name}.pklz4", init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=[field], count_hint=self._count_hint, options=options ) class TransformedDocs(BaseTransformedDocs): def __init__( self, docs: BaseDocs, cls, transform=None, store_name=None, count=None ): """Document collection tranformed using a transform function :param docs: The base documents :param transform: The transformation function :param store_name: if set, creates a LZ4 document store, otherwise transform on the fly, defaults to None """ super().__init__(docs, cls, store_name, count=count) self._transform = transform or self @lru_cache def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): if self._store_name is None: return 
TransformedDocstore(self._docs.docs_store(field, options=options), self._transform) return super().docs_store(options=options) def docs_iter(self): for doc in map(self._transform, self._docs.docs_iter()): if doc is not None: yield doc class TransformedDocstore(Docstore): """On the fly transform of documents""" def __init__(self, store, transform): self._store = store self._transform = transform def get_many(self, doc_ids, field=None): return { key: self._transform(doc) for key, doc in self._store.get_many(doc_ids, field) } class IterDocs(BaseDocs): """Documents based on an iterator""" def __init__( self, corpus_name, docs_iter_fn: Iterator, docs_lang="en", docs_namespace=None, docs_cls=GenericDoc, count_hint=None, ): super().__init__() self._corpus_name = corpus_name self._docs_iter_fn = docs_iter_fn self._docs_cls = docs_cls self._count_hint = count_hint self._docs_namespace = docs_namespace self._docs_lang = docs_lang def docs_count(self): if self.docs_store().built(): return self.docs_store().count() return self._count_hint def docs_iter(self): def iter(): with _logger.duration(f"processing {self._corpus_name}"): yield from self._docs_iter_fn() return LazyDocsIter(lambda: iter(self.docs_store()), iter()) def docs_cls(self): return self._docs_cls @lru_cache def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return PickleLz4FullStore( path=f"{ir_datasets.util.home_path()}/{self._corpus_name}.pklz4", init_iter_fn=self._docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=[field], count_hint=self._count_hint, options=options ) def docs_namespace(self): return self._docs_namespace def docs_lang(self): return self._docs_lang ================================================ FILE: ir_datasets/util/docs/multiple.py ================================================ from typing import List, Sequence, Tuple, Optional from functools import cached_property, lru_cache from dataclasses import dataclass import ir_datasets from 
ir_datasets.formats import BaseDocs from ir_datasets.indices.base import Docstore, DEFAULT_DOCSTORE_OPTIONS from ir_datasets.indices.lz4_pickle import PickleLz4FullStore from ir_datasets.util.docs.lazy import LazyDocsIter _logger = ir_datasets.log.easy() @dataclass(frozen=True) class PrefixedDocsSpec: prefix: str """The prefix for this document collection""" docs: BaseDocs """The base documents""" has_prefix: bool = False """Whether documents have already the prefix""" @cached_property def length(self): return len(self.prefix) class PrefixedDocstore(Docstore): def __init__(self, docs_mapping: List[PrefixedDocsSpec], field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): self._id_field = field self._stores = [ (mapping, mapping.docs.docs_store(field=field, options=options)) for mapping in docs_mapping ] def get_many(self, doc_ids: Sequence[str], field=None): assert field is None result = {} if field is None or field == self._id_field: # If field is ID field, remove the prefix for mapping, store in self._stores: if _doc_ids := [ doc_id if mapping.has_prefix else doc_id[mapping.length:] for doc_id in doc_ids if doc_id.startswith(mapping.prefix) ]: if mapping.has_prefix: result.update(store.get_many(_doc_ids)) else: for key, doc in store.get_many(_doc_ids).items(): key = f"{mapping.prefix}{key}" result[key] = doc._replace(doc_id=key) else: # Just use the field for mapping, store in self._stores: if mapping.has_prefix: result.update(store.get_many(_doc_ids)) else: for key, doc in store.get_many(doc_ids): key = f"{mapping.prefix}{key}" result[key] = doc._replace(doc_id=key) return result class PrefixedDocs(BaseDocs): """Mixes documents and use a prefix to distinguish them""" def __init__(self, store_name: Optional[str], *docs_mapping: PrefixedDocsSpec): """Each mapping = (prefix, documents, boolean indicating whether documents have already a prefix) :param store_name: if set, creates a LZ4 document store, otherwise transform on the fly. 
""" self._store_name = store_name self._docs_mapping = docs_mapping assert len(self._docs_mapping) > 0, "No document mapping provided" @cached_property def lazy_self(self): try: self._docs_cls = self._docs_mapping[0].docs.docs_cls() if not all( mapping.docs.docs_cls() == self._docs_cls for mapping in self._docs_mapping[1:] ): _logger.error( f"Differing classes for documents, got {[mapping.docs.docs_cls() for mapping in self._docs_mapping[1:]]}") raise AttributeError("Differing classes for documents") self._docs_lang = self._docs_mapping[0].docs.docs_lang() if any( mapping.docs.docs_lang() != self._docs_lang for mapping in self._docs_mapping[1:] ): self._docs_lang = None self._docs_namespace = self._docs_mapping[0].docs.docs_namespace() if any( mapping.docs.docs_namespace() != self._docs_namespace for mapping in self._docs_mapping[1:] ): self._docs_namespace = None except Exception: _logger.logger().exception("Error while computing lazy attributes") return None return self def docs_cls(self): return self.lazy_self._docs_cls def docs_namespace(self): return self.lazy_self._docs_namespace def docs_lang(self): return self.lazy_self._docs_lang def __iter__(self): return self.docs_iter() def _iter(self): for mapping in self._docs_mapping: for doc in mapping.docs.docs_iter(): if not mapping.has_prefix: doc = doc._replace(doc_id=f"{mapping.prefix}{doc.doc_id}") yield doc def docs_iter(self): # Otherwise, only build it if needed return LazyDocsIter(lambda: iter(self.docs_store()), self._iter()) @lru_cache() def docs_count(self): counts = [mapping.docs.docs_count() for mapping in self._docs_mapping] if any(count is None for count in counts): return None return sum(counts) @lru_cache def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): # If no store name, we use dynamic access if self._store_name is None: return PrefixedDocstore(self._docs_mapping, field=field, options=options) # otherwise, builds a store return PickleLz4FullStore( 
path=f"{ir_datasets.util.home_path()}/{self._store_name}.pklz4", init_iter_fn=self._iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=[field], count_hint=self.docs_count(), options=options ) ================================================ FILE: ir_datasets/util/docs/subset.py ================================================ import array import os from functools import cached_property, lru_cache from typing import Optional import ir_datasets from ir_datasets.formats import BaseDocs from ir_datasets.util import BaseDownload from ir_datasets.util.docs.lazy import DocsList, LazyDocsIter from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS _logger = ir_datasets.log.easy() class DocsSubsetList(DocsList): """List view of a document subset""" def __init__(self, main: "DocsSubset", indices: array.array): self._main = main self._indices = indices def get(self, ix: int): count = 0 for removed_ix in self._indices: if ix <= removed_ix: count += 1 else: break return self._main[ix] def __len__(self): return super().__len__() class Dupes: def __init__(self, base: BaseDownload, prefix: Optional[str] = None): self._base = base self._prefix = prefix self._prefix_len = len(prefix) if prefix else 0 self._remove_prefix = self.remove_prefix if prefix else lambda x: x def remove_prefix(self, doc_id: str): if doc_id.startswith(self._prefix): return doc_id[self._prefix_len :] @cached_property def doc_ids(self): doc_ids = set() with self._base.stream() as fp: for line in fp: if doc_id := self._remove_prefix(line.strip().decode("utf-8")): doc_ids.add(doc_id) return doc_ids def has(self, doc_id: str): return doc_id in self.doc_ids def __len__(self): return len(self.doc_ids) class ColonCommaDupes(Dupes): """Dupes with the format doc_id:dupe_1_id,dupe_2_id,... 
""" @cached_property def doc_ids(self): doc_ids = set() with self._base.stream() as fp: for line in fp: _, dupes = line.strip().decode("utf-8").split(":") for doc_id in dupes.split(","): if doc_id := self._remove_prefix(doc_id): doc_ids.add(doc_id) return doc_ids class DocsSubset(BaseDocs): """Document collection minus a set of duplicated""" def __init__(self, store_name: str, docs: BaseDocs, removed_ids: "Dupes"): self._docs = docs self._store_name = store_name self._removed_ids = removed_ids self._store = None def docs_list(self): @lru_cache() def indices(): """Stores the indices of removed documents""" indices_path = f"{ir_datasets.util.home_path()}/{self._store_name}.intarray" indices = array.array("L") if not os.path.exists(indices_path): for ix, doc in enumerate( _logger.pbar( iter(self.docs_iter()), total=self.docs_count(), desc="identifying removed documents", ) ): if self._removed_ids.has(doc.doc_id): indices.append(indices) with ir_datasets.util.finialized_file(indices_path, "wb") as fout: fout.write(indices.tobytes()) return indices else: with indices_path.open("rb") as fin: indices.frombytes(fin) return indices return DocsSubsetList(self._docs.docs_iter(), indices) def docs_cls(self): return self._docs.docs_cls() def docs_lang(self): return self._docs.docs_lang() def docs_count(self): if count := self._docs.docs_count(): return count - len(self._removed_ids) return None def docs_iter(self): return LazyDocsIter( self.docs_list, ( doc for doc in self._docs.docs_iter() if not self._removed_ids.has(doc.doc_id) ), ) def docs_namespace(self): return self._docs.docs_namespace() def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS): return self._docs.docs_store(field=field, options=options) ================================================ FILE: ir_datasets/util/download.py ================================================ import json import pkgutil import os import sys from pathlib import Path import atexit from collections import deque import 
io import shutil import tempfile import contextlib import ir_datasets from ir_datasets import util __all__ = ['Download', 'BaseDownload', 'RequestsDownload'] _logger = ir_datasets.log.easy() class BaseDownload: def stream(self): raise NotImplementedError() class GoogleCloudBucketStream(BaseDownload): def __init__(self, url, tries=None): self.uri = url.replace("https://storage.googleapis.com/", "gs://") self.tries = tries def __repr__(self): return f'GoogleCloudBucketStream({repr(self.uri)}, tries={self.tries})' @contextlib.contextmanager def stream(self): import subprocess proc = subprocess.Popen(['gsutil', 'cat', self.uri], stdout=subprocess.PIPE) with io.BufferedReader(proc.stdout, buffer_size=io.DEFAULT_BUFFER_SIZE) as stream: yield stream class GoogleDriveDownload(BaseDownload): def __init__(self, url, tries=None): self.url = url self.tries = tries def stream(self): # For Google Drive, we may get a "large file" warning that means we need to "confirm". # This just involves pulling a cookie out of the response and adding it to the URL. 
requests = ir_datasets.lazy_libs.requests() http_args = { 'url': self.url, 'stream': True, # return the response as a stream, rather than loading it all into memory 'headers': {'User-Agent': f'ir_datasets/{ir_datasets.__version__}'}, # identify itself 'timeout': float(os.environ.get('IR_DATASETS_DL_TIMEOUT', '15')), # raise error if 15 seconds pass without any data from the socket 'verify': os.environ.get('IR_DATASETS_DL_SKIP_SSL', '').lower() != 'true', # skip SSL verification if user specifies } url = self.url with _logger.duration('Google Drive verification'), requests.get(**http_args) as response: # Adapted from cookies = response.cookies for k, v in response.cookies.items(): if k.startswith("download_warning"): url += "&confirm=" + v return RequestsDownload(url, self.tries, cookies).stream() class RequestsDownload(BaseDownload): def __init__(self, url, tries=None, cookies=None, headers=None, auth=None): self.url = url self.tries = tries self.cookies = cookies self.headers = headers self.auth = auth @contextlib.contextmanager def stream(self): with io.BufferedReader(util.IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE) as stream: yield stream def __iter__(self): requests = ir_datasets.lazy_libs.requests() http_args = { 'url': self.url, 'stream': True, # return the response as a stream, rather than loading it all into memory 'headers': {'User-Agent': f'ir_datasets/{ir_datasets.__version__}'}, # identify itself 'timeout': float(os.environ.get('IR_DATASETS_DL_TIMEOUT', '15')), # raise error if 15 seconds pass without any data from the socket 'verify': os.environ.get('IR_DATASETS_DL_SKIP_SSL', '').lower() != 'true', # skip SSL verification if user specifies 'cookies': self.cookies, } # apply headers if provided if self.headers: http_args['headers'].update(self.headers) if self.auth: self._handle_auth(http_args) done = False pbar = None response = None skip = 0 default_tries = self.tries if self.tries is not None else 
int(os.environ.get('IR_DATASETS_DL_TRIES', '3')) remaining_tries = default_tries with contextlib.ExitStack() as stack: while not done: try: response = stack.enter_context(requests.get(**http_args)) if pbar is None: dlen = response.headers.get('content-length') if dlen is not None: dlen = int(dlen) fmt = '{desc}: {percentage:3.1f}%{r_bar}' if os.environ.get('IR_DATASETS_DL_DISABLE_PBAR', '').lower() == 'true': pbar_f = stack.enter_context(open(os.devnull, 'w')) # still maintain the pbar, but write to /dev/null else: pbar_f = None # defaults to stderr pbar = stack.enter_context(_logger.pbar_raw(desc=self.url, total=dlen, unit='B', unit_scale=True, bar_format=fmt, file=pbar_f)) for data in self._iter_response_data(response, http_args, skip): pbar.update(len(data)) if response.headers.get('accept-ranges') == 'bytes': # since we got more data and the server accepts range requests, reset the "tries" counter remaining_tries = default_tries yield data except requests.exceptions.RequestException as ex: remaining_tries -= 1 if remaining_tries <= 0: raise # no more tries if response is not None and response.headers.get('accept-ranges') == 'bytes': # woo hoo! We can issue a range request, so we don't need to download all the data again, # just pick up from where we left off. _logger.info(f'download error: {ex}. Retrying range "{pbar.n}-" [{remaining_tries} attempts left]') http_args['headers']['Range'] = f'bytes={pbar.n}-' skip = 0 elif pbar is not None: # The server doesn't accept range requests, so we'll need to re-download the file up to # where we got, and then start up again from there _logger.info(f'download error: {ex}. Retrying from start (skipping {pbar.n} bytes) because server doesn\'t accept range requests [{remaining_tries} attempts left]') if 'Range' in http_args['headers']: del http_args['headers']['Range'] skip = pbar.n else: # We didn't get any data, start from the start _logger.info(f'download error: {ex}. 
Retrying from start.') if 'Range' in http_args['headers']: del http_args['headers']['Range'] skip = 0 else: done = True pbar.bar_format = '{desc} [{elapsed}] [{n_fmt}] [{rate_fmt}]' def _iter_response_data(self, response, http_args, skip): with contextlib.ExitStack() as stack: skip_pbar = None if skip > 0: fmt = '{desc}: {percentage:3.1f}%{r_bar}' skip_pbar = stack.enter_context(_logger.pbar_raw(desc=f'skipping ahead to {skip}', total=skip, unit='B', unit_scale=True, bar_format=fmt)) # Some web servers (which?) annoyingly set the content-encoding to gzip when the file itself is # gzipped. This will transparently decompress the stream here, and would mean that hash verification # would fail. So instead, detect this situation and use the raw stream in that case here. Note that # we DO normally want this transparent decompression. # An example is NFCorpus: if http_args['url'].endswith('.gz') and response.headers.get('content-encoding') == 'gzip': data_iter = response.raw.stream(io.DEFAULT_BUFFER_SIZE, decode_content=False) else: data_iter = response.iter_content(chunk_size=io.DEFAULT_BUFFER_SIZE) for data in data_iter: if skip > 0: data, skipped = data[skip:], len(data[:skip]) skip -= skipped skip_pbar.update(skipped) if data: yield data def __repr__(self): return f'RequestsDownload({repr(self.url)}, tries={self.tries})' def _handle_auth(self, http_args): auth_dir = util.home_path() / 'auth' if not auth_dir.exists(): auth_dir.mkdir(parents=True, exist_ok=True) auth_path = auth_dir / self.auth if auth_path.exists(): with auth_path.open('rt') as fin: lines = fin.read().split('\n') if len(lines) < 2: raise RuntimeError(f'{str(auth_path)} in incorrect format. Set the first line as the username and the second line as the password.') uname, pwd = lines[0].strip(), lines[1].strip() http_args['auth'] = (uname, pwd) else: _logger.info('To download {url}, you need to enter a username and password. 
To avoid this message in the future, you may ' 'also set them in a file''named {auth_path}, with the first line as the username and the second line as the ' 'password.'.format(auth_path=str(auth_path), **http_args)) uname = input('enter username for {url}: '.format(**http_args)) pwd = input('enter password for {url}: '.format(**http_args)) http_args['auth'] = (uname, pwd) class LocalDownload(BaseDownload): def __init__(self, path, message=None, mkdir=True): self._path = Path(path) if mkdir: self._path.parent.mkdir(parents=True, exist_ok=True) self._message = message def path(self, force=True): if force and not self._path.exists(): if self._message: _logger.info(self._message) raise FileNotFoundError(self._path) return self._path @contextlib.contextmanager def stream(self): with self.path().open('rb') as f: yield f _ENCOUNTERD_DUAS = set() def _cleanup_tmp(file): try: os.remove(file.name) except FileNotFoundError: pass class Download: _dua_ctxt = deque([None]) def __init__(self, mirrors, cache_path=None, expected_md5=None, dua=None, stream=False, size_hint=None): self.mirrors = list(mirrors) self.expected_md5 = expected_md5 self.dua = dua or self._dua_ctxt[-1] self._cache_path = cache_path self._stream = stream self._path = None self._size_hint = size_hint def path(self, force=True): if self._path is not None: return self._path if self._cache_path is not None: download_path = self._cache_path if not force: return download_path if os.path.exists(download_path) and download_path != os.devnull: self._path = download_path return self._path else: tmpfile = tempfile.NamedTemporaryFile(delete=False, dir=util.tmp_path()) atexit.register(_cleanup_tmp, tmpfile) download_path = tmpfile.name # must force in this case, even if user asks not to if self.dua is not None and self.dua not in _ENCOUNTERD_DUAS: _logger.info(self.dua) _ENCOUNTERD_DUAS.add(self.dua) errors = [] Path(download_path).parent.mkdir(parents=True, exist_ok=True) if self._size_hint: 
util.check_disk_free(download_path, self._size_hint) for mirror in self.mirrors: try: with util.finialized_file(download_path, 'wb') as f: with mirror.stream() as stream: stream = util.HashStream(stream, self.expected_md5, algo='md5') shutil.copyfileobj(stream, f) break except Exception as e: errors.append((mirror, e)) if not isinstance(mirror, LocalDownload): _logger.warn(f'Download failed: {e}') else: if len(self.mirrors) == 1: raise errors[0][1] if len(self.mirrors) == 2 and isinstance(self.mirrors[0], LocalDownload): raise errors[1][1] raise RuntimeError('All download sources failed', errors) self._path = download_path return self._path @contextlib.contextmanager def stream(self): if self._stream: assert len(self.mirrors) == 1, "cannot stream with multiple mirrors" with self.mirrors[0].stream() as stream: stream = util.HashStream(stream, self.expected_md5, algo='md5') yield stream else: with open(self.path(), 'rb') as f: yield f @classmethod @contextlib.contextmanager def dua_ctxt(cls, dua): cls._dua_ctxt.append(dua) yield cls._dua_ctxt.pop() class _DownloadConfig: def __init__(self, file=None, base_path=None, contents=None, dua=None, parser="yaml"): self._file = file self._base_path = base_path self._contents = contents self._dua = dua self._parser = parser self.home_path = None self.download_path = None def contents(self): if self._contents is None: data = pkgutil.get_data('ir_datasets', self._file) self._contents = json.loads(data) return self._contents def context(self, key, base_path=None, dua=None): contents = self.contents() return _DownloadConfig(contents=contents[key] if key else contents, base_path=base_path or self._base_path, dua=dua or self._dua) def get_home_path(self): if self.home_path is None: self.home_path = util.home_path() return self.home_path def get_download_path(self): if self.download_path is None: self.download_path = Path(self.get_home_path()) / 'downloads' self.download_path.parent.mkdir(parents=True, exist_ok=True) return 
self.download_path def __getitem__(self, key): dlc = self.contents()[key] sources = [] cache_path = None download_args = dlc.get('download_args', {}) if 'auth' in dlc: download_args['auth'] = dlc['auth'] if 'cache_path' in dlc: if self._base_path: cache_path = os.path.join(self._base_path, dlc['cache_path']) else: cache_path = dlc['cache_path'] if 'url' in dlc: small_file_size = int(os.environ.get('IR_DATASETS_SMALL_FILE_SIZE', '5000000')) if not dlc.get('skip_local') and dlc.get('expected_md5') and not dlc.get('size_hint', small_file_size) < small_file_size: local_path = Path(self.get_download_path()) / dlc['expected_md5'] local_msg = (f'If you have a local copy of {dlc["url"]}, you can symlink it here ' f'to avoid downloading it again: {local_path}') sources.append(LocalDownload(local_path, local_msg, mkdir=False)) if dlc['url'].startswith("https://storage.googleapis.com/") and 'google.colab' in sys.modules: sources.append(GoogleCloudBucketStream(dlc['url'], **download_args)) elif dlc['url'].startswith('https://drive.google.com/'): sources.append(GoogleDriveDownload(dlc['url'], **download_args)) else: sources.append(RequestsDownload(dlc['url'], **download_args)) if dlc.get('irds_mirror') and dlc.get('expected_md5'): # this file has the irds mirror to fall back on sources.append(RequestsDownload(f'https://mirror.ir-datasets.com/{dlc["expected_md5"]}')) elif 'instructions' in dlc: if 'cache_path' in dlc: local_path = Path(cache_path) else: local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5'] sources.append(LocalDownload(local_path, dlc['instructions'].format(path=local_path))) else: raise RuntimeError('Must either provide url or instructions') return Download(sources, expected_md5=dlc.get('expected_md5'), cache_path=cache_path, dua=self._dua, stream=dlc.get('stream', False), size_hint=dlc.get('size_hint')) DownloadConfig = _DownloadConfig(file='etc/downloads.json') ================================================ FILE: ir_datasets/util/fileio.py 
================================================ import os import contextlib import shutil from pathlib import Path from fnmatch import fnmatch import tarfile import gzip import bz2 import io from zipfile import ZipFile import ir_datasets from ir_datasets import util __all__ = ['IterStream', 'Cache', 'TarExtract', 'TarExtractAll', 'RelativePath', 'GzipExtract', 'Lz4Extract', 'ZipExtract', 'ZipExtractCache', 'StringFile', 'PackageDataFile'] _logger = ir_datasets.log.easy() class IterStream(io.RawIOBase): def __init__(self, it): super().__init__() self.leftover = None self.it = it def readable(self): return True def readinto(self, b): pos = 0 try: while pos < len(b): l = len(b) - pos # We're supposed to return at most this much chunk = self.leftover or next(self.it) output, self.leftover = chunk[:l], chunk[l:] b[pos:pos+len(output)] = output pos += len(output) return pos except StopIteration: return pos # indicate EOF class Cache: def __init__(self, streamer, path): self._streamer = streamer self._path = path def verify(self): if not self._path.exists(): # stream not cached # write stream to a .tmpXX file and then move it to the # correct path when successfully downloaded. self._path.parent.mkdir(parents=True, exist_ok=True) with contextlib.ExitStack() as ctxt: tmp_idx = 0 while True: try: f = open(f'{self._path}.tmp{tmp_idx}', 'xb') # exclusive open ctxt.push(f) break # success opening file except IOError: tmp_idx += 1 if tmp_idx >= 100: # Up to 100 attempts to find a file raise try: with self._streamer.stream() as stream: shutil.copyfileobj(stream, f) f.close() # close file before move... 
Needed because of Windows shutil.move(f.name, self._path) finally: if Path(f.name).exists(): Path(f.name).unlink() @contextlib.contextmanager def stream(self): self.verify() with self._path.open('rb') as f: yield f def path(self, force=True): if force: self.verify() return self._path class TarExtract: def __init__(self, streamer, tar_path, compression='gz'): self._streamer = streamer self._tar_path = tar_path self._compression = compression @contextlib.contextmanager def stream(self): with contextlib.ExitStack() as ctxt, self._streamer.stream() as stream: # IMPORTANT: open this file in streaming mode (| in mode). This means that the # content need not be written to disk or be fully read. tarf = ctxt.enter_context(tarfile.open(fileobj=stream, mode=f'r|{self._compression or ""}')) for block in tarf: if block.name == self._tar_path: result = tarf.extractfile(block) break else: raise RuntimeError(f'{self._tar_path} not found in tar file') yield result class TarExtractAll: def __init__(self, streamer, extract_path, compression='gz', path_globs=None): self._streamer = streamer self._extract_path = extract_path self._compression = compression self._path_globs = path_globs def path(self, force=True): if force and not os.path.exists(self._extract_path): try: with self._streamer.stream() as stream, tarfile.open(fileobj=stream, mode=f'r|{self._compression or ""}') as tarf, \ _logger.duration('extracting from tar file'): if self._path_globs is None: # shortcut to extract everything tarf.extractall(self._extract_path) else: for member in tarf: if any(fnmatch(member.name, g) for g in self._path_globs): tarf.extract(member, self._extract_path) except: if os.path.exists(self._extract_path): shutil.rmtree(self._extract_path) raise return self._extract_path def stream(self): raise NotImplementedError() class RelativePath: def __init__(self, streamer, path): self._streamer = streamer self._path = path def path(self, force=True): return os.path.join(self._streamer.path(force), 
self._path) @contextlib.contextmanager def stream(self): with open(self.path(), 'rb') as f: yield f class ReTar: def __init__(self, streamer, output_file, keep_globs, compression='gz'): self._streamer = streamer self._output_file = Path(output_file) self._keep_globs = keep_globs self._compression = compression @contextlib.contextmanager def stream(self): if not self._output_file.exists(): with contextlib.ExitStack() as ctxt, self._streamer.stream() as stream: ctxt.enter_context(_logger.duration('re-taring file')) outf = ctxt.enter_context(util.finialized_file(self._output_file, 'wb')) o_tarf = ctxt.enter_context(tarfile.open(fileobj=outf, mode=f'w|{self._compression or ""}')) # IMPORTANT: open this file in streaming mode (| in mode). This means that the # content need not be written to disk or be fully read. i_tarf = ctxt.enter_context(tarfile.open(fileobj=stream, mode=f'r|{self._compression or ""}')) for block in i_tarf: if any(fnmatch(block.name, g) for g in self._keep_globs): o_tarf.addfile(block, i_tarf.extractfile(block)) _logger.info(f'extracted {block.name}') with self._output_file.open('rb') as f: yield f class GzipExtract: def __init__(self, streamer): self._streamer = streamer def __getattr__(self, attr): return getattr(self._streamer, attr) @contextlib.contextmanager def stream(self): with self._streamer.stream() as stream: yield gzip.GzipFile(fileobj=stream) class Bz2Extract: def __init__(self, streamer): self._streamer = streamer def __getattr__(self, attr): return getattr(self._streamer, attr) @contextlib.contextmanager def stream(self): with self._streamer.stream() as stream: yield bz2.BZ2File(stream) class Lz4Extract: def __init__(self, streamer): self._streamer = streamer def __getattr__(self, attr): return getattr(self._streamer, attr) @contextlib.contextmanager def stream(self): lz4 = ir_datasets.lazy_libs.lz4_frame() with self._streamer.stream() as stream: yield lz4.frame.open(stream, 'rb') class ZipExtract: def __init__(self, dlc, zip_path): 
self.dlc = dlc self.zip_path = zip_path def path(self, force=True): return self.dlc.path(force) @contextlib.contextmanager def stream(self): with contextlib.ExitStack() as ctxt: with _logger.duration('opening zip file'): zipf = ctxt.enter_context(ZipFile(self.path())) result = zipf.open(self.zip_path) yield result class ZipExtractCache: def __init__(self, dlc, extract_path): self.dlc = dlc self.extract_path = extract_path def path(self, force=True): if force and not os.path.exists(self.extract_path): try: with ZipFile(self.dlc.path()) as zipf: zipf.extractall(self.extract_path) except: if os.path.exists(self.extract_path): shutil.rmtree(self.extract_path) raise return self.extract_path def stream(self): raise NotImplementedError class StringFile: def __init__(self, contents, path='MOCK'): if isinstance(contents, str): contents = contents.encode() # to bytes self.contents = contents self._path = path def path(self, force=True): return self._path @contextlib.contextmanager def stream(self): yield io.BytesIO(self.contents) class PackageDataFile: def __init__(self, path, package='ir_datasets'): self._package = package self._path = path def path(self, force=True): return self._path @contextlib.contextmanager def stream(self): import pkgutil data = pkgutil.get_data(self._package, self._path) yield io.BytesIO(data) ================================================ FILE: ir_datasets/util/hash.py ================================================ import io import hashlib import ir_datasets __all__ = ['HashVerificationError', 'HashVerifier', 'HashStream'] _logger = ir_datasets.log.easy() class HashVerificationError(IOError): pass class HashVerifier: def __init__(self, expected, algo='md5'): self.expected = expected self.algo = algo self.hasher = None def update(self, b): self.hasher.update(b) def __enter__(self): self.hasher = hashlib.new(self.algo) return self def __exit__(self, exc_type, exc_val, exc_tb): if exc_val is None: h = self.hasher.hexdigest().lower() if 
def find_charset(text):
    """Return the first charset declared via 'charset=' in *text*, or None.

    Accepts str or bytes (str is UTF-8 encoded first). Only the FIRST
    occurrence of the literal 'charset=' is inspected; if it is not
    followed by a recognizable token, None is returned (no later
    occurrences are tried).
    """
    if text is None:
        return None
    if isinstance(text, str):
        text = text.encode()
    pos = text.find(b'charset=')
    if pos == -1:
        return None
    match = re.match(b'charset= *["\']?([a-zA-Z0-9-_]+)', text[pos:])
    if match is None:
        return None
    return match.group(1).decode(errors='ignore')
self.field_values = [[] for _ in fields] self.field_stacks = [deque() if f is not None else None for f in fields] self.ignore_tag_stack = deque() def get_values(self): return tuple(self._join_text(v) for v in self.field_values) def _join_text(self, text): res = ''.join(text) res = res.replace('\r\n', '\n').replace('\r', '\n') # CR/LF normalisation res = res.replace('\t', ' ') # tab/space normalisation res = re.sub('\n +', '\n', res) # remove spaces from start of lines res = re.sub(' +\n', '\n', res) # remove spaces from end of lines res = re.sub('\n{2,}', '\n', res) # collapse multiple empty lines res = re.sub(' {2,}', ' ', res) # collapse multiple spaces return res.strip() # remove final leading/trailing whitespace def data(self, data): if not self.ignore_tag_stack: any_match = False for vals, stack in zip(self.field_values, self.field_stacks): if (stack is None and not any_match) or stack: vals.append(data) any_match = True def start(self, tag, attrs): tag = tag.lower() for tags, stack in zip(self.fields, self.field_stacks): if tags is not None and tag in tags: stack.append(tag) if tag in self.IGNORE_TAGS: self.ignore_tag_stack.append(tag) def end(self, tag): tag = tag.lower() for stack in self.field_stacks: if stack and stack[-1] == tag: stack.pop() if self.ignore_tag_stack and self.ignore_tag_stack[-1] == tag: self.ignore_tag_stack.pop() def close(self): pass def comment(self, data): pass def doctype(self, *args): pass def pi(self, *args): pass ================================================ FILE: ir_datasets/util/metadata.py ================================================ import json from typing import Callable, Optional, Dict, Any from functools import partial import ir_datasets from .fileio import PackageDataFile class MetadataComponent: def __init__(self, dataset_id, dataset, provider=None): self._dataset_id = dataset_id self._dataset = dataset self._metadata_provider = provider if provider is not None else default_metadata_provider for etype in 
class MetadataProvider:
    """Lazily-loaded source of per-dataset, per-entity-type metadata.

    The metadata mapping is only loaded (via metadata_loader) the first time
    get_metadata is called, keeping package import cheap.
    """

    def __init__(self, metadata_loader: Callable[[], Dict[str, Any]]):
        self._metadata = None  # cache; populated lazily on first use
        self._metadata_loader = metadata_loader

    def get_metadata(self, dsid: str, entity_type: ir_datasets.EntityType) -> Dict[str, Any]:
        """Return the metadata dict for (dsid, entity_type), following _ref links.

        Raises ValueError if the _ref chain contains a cycle; previously a
        circular _ref in the metadata file caused an infinite loop here.
        """
        entity_type = ir_datasets.EntityType(entity_type)  # validate & allow strings
        if self._metadata is None:
            self._metadata = self._metadata_loader()
        result = self._metadata.get(dsid, {}).get(entity_type.value, {})
        seen_refs = set()
        while '_ref' in result:
            ref = result['_ref']
            if ref in seen_refs:
                # Guard against circular _ref chains in the metadata file.
                raise ValueError(f'circular _ref chain in metadata involving {ref!r}')
            seen_refs.add(ref)
            result = self._metadata.get(ref, {}).get(entity_type.value, {})
        return result

    @staticmethod
    def json_loader(dlc):
        """Build a zero-arg loader that parses JSON from the given file-like dlc."""
        def wrapped():
            with dlc.stream() as s:
                return json.load(s)
        return wrapped


default_metadata_provider = MetadataProvider(MetadataProvider.json_loader(PackageDataFile('etc/metadata.json')))
the given dataset's etype. This is frequently used to provide a time estimate for building docstores. It returns a lambda expression so that the metadata does not need to be loaded when the package is imported; only when the value is actually requested. """ if metadata_provier is None: metadata_provier = default_metadata_provider return lambda: metadata_provier.get_metadata(dsid, etype).get('count') ================================================ FILE: ir_datasets/util/registry.py ================================================ import os import re import ir_datasets from .metadata import MetadataComponent __all__ = 'Registry' _logger = ir_datasets.log.easy() class Registry: def __init__(self, allow_overwrite=False): self._registered = {} self._patterns = [] self._allow_overwrite = allow_overwrite def __getitem__(self, key): if key not in self._registered: for pattern, initializer in self._patterns: match = pattern.match(key) if match: dataset = initializer(key, match.groups()) self.register(key, dataset) break result = self._registered[key] if hasattr(result, 'deprecated'): if os.environ.get('IR_DATASETS_SKIP_DEPRECATED_WARNING', '').lower() != 'true': _logger.info(result.deprecated()) return result def __iter__(self): return iter(self._registered.keys()) def register(self, name, obj): from ..datasets.base import Dataset if name in self._registered: if self._allow_overwrite: _logger.warn(f"{name} already exists in this registry. 
class HtmlDocIter:
    """Iterator over docs that lazily applies HTML extraction to each item.

    Supports iteration plus integer and slice indexing, mirroring the
    behaviour of the dataset's native doc iterators.
    """

    def __init__(self, it, extractor):
        self.it = it
        self.extractor = extractor
        # Mapping pipeline (possibly parallel) over the source iterator.
        self.mapped_it = _doc_map_it(it, self.extractor)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.mapped_it)

    def __getitem__(self, key):
        if not isinstance(key, int):
            # Slices (and other non-int keys) produce a new lazy wrapper.
            return HtmlDocIter(self.it[key], self.extractor)
        ext = self.extractor
        raw_doc = self.it[key]
        return _doc_map((raw_doc, ext._field_content_type, ext._extractor, ext._dataset.docs_cls()))
class HtmlDocExtractorDocStoreWrapper(ir_datasets.indices.Docstore):
    """Docstore wrapper that applies the parent HtmlDocExtractor's HTML-to-text
    conversion to documents as they are fetched by ID."""
    def __init__(self, docstore, extractor):
        # Mirror the wrapped store's doc class and ID field so lookups behave
        # identically from the caller's perspective.
        super().__init__(docstore._doc_cls, docstore._id_field)
        self.docstore = docstore
        self.extractor = extractor
    def get_many_iter(self, doc_ids):
        # Lazily map extraction over the wrapped store's results; any
        # parallelism is handled inside _doc_map_it.
        return _doc_map_it(self.docstore.get_many_iter(doc_ids), self.extractor)
    def clear_cache(self):
        self.docstore.clear_cache()
def _doc_map(args):
    """Apply HTML-to-text extraction to a single document's HTML fields.

    args is a tuple of (doc, content/type field index pairs, extractor name,
    doc namedtuple class). Fields whose *_content_type value is an HTML type
    are replaced with extracted plain text and their content type rewritten
    to text/plain; the doc is only rebuilt if something changed.
    """
    doc, field_content_type, extractor_name, docs_cls = args
    extract_fn = {
        'bs4': bs4_extract,
        'inscriptis': inscriptis_extract,
    }[extractor_name]
    fields = list(doc)
    changed = False
    for content_idx, type_idx in field_content_type:
        if fields[type_idx] not in ('text/html', 'application/xhtml+xml'):
            continue
        fields[content_idx] = extract_fn(fields[content_idx])
        fields[type_idx] = 'text/plain'
        changed = True
    return docs_cls(*fields) if changed else doc
readme = "README.md" authors = [ {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, ] maintainers = [ {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, ] requires-python = ">=3.8" classifiers = [ "Programming Language :: Python", "Operating System :: OS Independent", "Topic :: Text Processing", "Topic :: Text Processing :: Indexing", "License :: OSI Approved :: MIT License", ] dynamic = ["version", "dependencies"] [tool.setuptools] include-package-data = true [tool.setuptools.packages.find] exclude = ["test"] [tool.setuptools.dynamic] version = {attr = "ir_datasets.__version__"} dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] car = [ "trec-car-tools>=2.5.4", ] warc = [ "warc3-wet>=0.2.3", "warc3-wet-clueweb09>=0.2.5" ] pyautocorpus = [ "pyautocorpus>=0.1.12" ] pyarrow = [ "pyarrow>=16.1.0" ] unlzw3 = [ "unlzw3>=0.2.1" ] beautifulsoup4 = [ "beautifulsoup4>=4.4.1" ] inscriptis = [ "inscriptis>=2.2.0" ] zlib-state = [ "zlib-state>=0.1.3" ] ijson = [ "ijson>=3.1.3" ] all = [ "trec-car-tools>=2.5.4", "warc3-wet>=0.2.3", "warc3-wet-clueweb09>=0.2.5", "pyarrow>=16.1.0", "pyautocorpus>=0.1.12", "unlzw3>=0.2.1", "beautifulsoup4>=4.4.1", "inscriptis>=2.2.0", "zlib-state>=0.1.3", "ijson>=3.1.3" ] [project.urls] "Homepage" = "https://ir-datasets.com/" "Documentation" = "https://project.readthedocs.io/" "Source" = "https://github.com/allenai/ir_datasets" "Issues" = "https://github.com/allenai/ir_datasets/issues" "Bug Tracker" = "https://github.com/allenai/ir_datasets/issues" [project.scripts] ir_datasets = "ir_datasets:main_cli" ================================================ FILE: requirements-test.txt ================================================ pyautocorpus>=0.1.1 ================================================ FILE: requirements.txt ================================================ lxml>=4.5.2,<6.0.0 numpy>=1.18.1 pyyaml>=5.3.1 requests>=2.22.0 tqdm>=4.38.0 lz4>=3.1.10 
================================================ FILE: test/__init__.py ================================================ ================================================ FILE: test/downloads.py ================================================ import requests import gzip import io import random import sys import json import time import datetime from contextlib import contextmanager import re import os import unittest import argparse import json import ir_datasets logger = ir_datasets.log.easy() @contextmanager def tmp_environ(**kwargs): orig_values = {} for key, value in kwargs.items(): orig_values[key] = os.environ.get(key) os.environ[key] = value try: yield finally: for key, value in kwargs.items(): orig_value = orig_values[key] if orig_value is not None: os.environ[key] = orig_value else: del os.environ[key] class TestDownloads(unittest.TestCase): dlc_filter = None output_path = None rand_delay = None # useful for being nice to servers when running tests by adding a random delay between tests output_data = [] head_precheck = False def test_downloads(self): with open('ir_datasets/etc/downloads.json') as f: data = json.load(f) try: self._test_download_iter(data) # clirmatrix - there's a ton of files, test a few random ones each time if self.dlc_filter == '^clirmatrix/': dlc = ir_datasets.datasets.clirmatrix.DownloadConfig.context('clirmatrix', ir_datasets.util.home_path()/'clirmatrix') with dlc['downloads'].stream() as s: with gzip.open(s) as g: clir_dlc = json.load(g) for top_key in clir_dlc.keys(): sub_keys = list(clir_dlc[top_key].keys()) for sub_key in random.sample(sub_keys, 10): res = self._test_download(clir_dlc[top_key][sub_key], f'clirmatrix/{top_key}/{sub_key}') if res['result'] != 'HEAD_SKIP': self.output_data.append(res) finally: if self.output_path is not None: with open(self.output_path, 'wt') as f: json.dump(self.output_data, f) def _test_download_iter(self, data, prefix=''): with tmp_environ(IR_DATASETS_DL_TRIES='10'): # give the test up to 10 
attempts to download if 'url' in data and 'expected_md5' in data: if self.dlc_filter is None or re.search(self.dlc_filter, prefix) and not data.get('skip_test', False) and not data.get('auth', False): res = self._test_download(data, prefix) if res['result'] != 'HEAD_SKIP': self.output_data.append(res) elif 'instructions' in data: pass else: for key in data.keys(): self._test_download_iter(data[key], prefix=f'{prefix}/{key}' if prefix else key) def _test_download(self, data, download_id): record = { 'name': download_id, 'url': data['url'], 'time': datetime.datetime.now().isoformat(), 'duration': None, 'result': 'IN_PROGRESS', 'fail_messagae': None, 'md5': data['expected_md5'], 'size': 0, } with self.subTest(download_id): if self.rand_delay is not None: # sleep in range of [0.5, 1.5] * rand_delay seconds time.sleep(random.uniform(self.rand_delay * 0.5, self.rand_delay * 1.5)) start = time.time() if self.head_precheck: try: requests.head(data['url'], allow_redirects=True).raise_for_status() logger.info('HEAD request for {url} successful'.format(**data)) record['result'] = 'HEAD_SKIP' return record # skip if HEAD request passes except: logger.info('HEAD request for {url} failed; verifying download'.format(**data)) try: download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'], **data.get('download_args', {}))], expected_md5=data['expected_md5'], stream=True) with download.stream() as stream: inp = stream.read(io.DEFAULT_BUFFER_SIZE) while len(inp) > 0: record['size'] += len(inp) inp = stream.read(io.DEFAULT_BUFFER_SIZE) record['duration'] = time.time() - start record['result'] = 'PASS' except KeyboardInterrupt: record['duration'] = time.time() - start record['result'] = 'USER_SKIP' self.skipTest('Test skipped by user') except Exception as ex: record['duration'] = time.time() - start has_mirror = 'irds_mirror' in data record['result'] = 'FAIL' if not has_mirror else 'FAIL_BUT_HAS_MIRROR' record['fail_messagae'] = str(ex) if not has_mirror: 
raise # only raise error if it doesn't have a mirror to avoid spurious errors from the github action return record if __name__ == '__main__': argv = sys.argv for i, arg in enumerate(argv): if arg == '--filter': TestDownloads.dlc_filter = argv[i+1] argv = argv[:i] + argv[i+2:] for i, arg in enumerate(argv): if arg == '--output': TestDownloads.output_path = argv[i+1] argv = argv[:i] + argv[i+2:] for i, arg in enumerate(argv): if arg == '--randdelay': TestDownloads.rand_delay = float(argv[i+1]) argv = argv[:i] + argv[i+2:] for i, arg in enumerate(argv): if arg == '--head_precheck': TestDownloads.head_precheck = True argv = argv[:i] + argv[i+1:] unittest.main(argv=argv) ================================================ FILE: test/dummy/docs.tsv ================================================ T1 CUT, CAP AND BALANCE. TAXED ENOUGH ALREADY! T2 Take a look at and to see these beautiful hotels. T3 US News named the Top10 best hotels in the US--and Trump Int'l Hotel & Tower NYC and Trump Int'l Hotel & Tower Chicago are on the list! T4 My interview last night with Greta on the GOP going El Fold T5 More thoughts on the debt ceiling in today's #trumpvlog... T6 On our YouTube channel- the opening of the incredible Trump Ocean Club in Panama.... T7 Derek Jeter's baseball and more in today's #trumpvlog... T8 Republicans should not negotiate against themselves again with @BarackObama in today's debt talks--First and foremost CUT,CAP and BALANCE. T9 Congress is back.TIME TO CUT, CAP AND BALANCE.There is no revenue problem.The Debt Limit cannot be raised until Obama spending is contained. T10 Perhaps this is the kind of thinking we need in Washington ... T11 Tomorrow we'll be going to Panama for the opening of our new hotel. It's a fantastic building in a fantastic location. T12 Wishing everyone a wonderful Independence Day holiday weekend, a great celebration for a great country. T13 Did my weekly phoner on Fox & Friends this morning...sounding off on issues of the day ... 
T14 I was in San Jose, CA on Saturday for a sit-down interview for the ACN national meeting which was attended by over 20,000 people. Huge! T15 I've been visiting Trump Int'l Golf Links Scotland and the course will be unmatched anywhere in the world. Spectacular! ================================================ FILE: test/dummy/qrels ================================================ 1 0 T1 0 1 0 T2 0 1 0 T4 1 1 0 T5 0 1 0 T6 0 1 0 T7 0 1 0 T8 1 1 0 T9 1 1 0 T10 0 1 0 T11 0 1 0 T12 0 1 0 T13 0 1 0 T14 0 1 0 T15 0 2 0 T1 0 2 0 T2 1 2 0 T3 1 2 0 T4 0 2 0 T5 0 2 0 T6 1 2 0 T7 0 2 0 T8 0 2 0 T9 0 2 0 T10 1 2 0 T11 1 2 0 T12 0 2 0 T13 0 2 0 T14 0 2 0 T15 1 3 0 T1 1 3 0 T2 0 3 0 T3 0 3 0 T4 0 3 0 T5 1 3 0 T6 0 3 0 T7 0 3 0 T8 1 3 0 T9 1 3 0 T10 0 3 0 T11 0 3 0 T15 0 4 0 T1 0 4 0 T2 0 4 0 T3 1 4 0 T4 1 4 0 T5 0 4 0 T6 1 4 0 T7 0 4 0 T8 0 4 0 T9 0 4 0 T11 0 4 0 T12 0 4 0 T13 1 4 0 T14 1 4 0 T15 0 ================================================ FILE: test/dummy/queries.tsv ================================================ 1 republican party 2 hospitality industry 3 government spending 4 media ================================================ FILE: test/formats/__init__.py ================================================ ================================================ FILE: test/formats/test_trec.py ================================================ import os import shutil import unittest from ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, TrecDoc, TrecDocs from ir_datasets.util import StringFile class TestTrec(unittest.TestCase): def test_qrels(self): mock_file = StringFile(''' Q0 0 D1 3 Q0 1 D2 2 Q0 0\tD3 3 Q0 1 D2 1 Q1 0 D2 1 '''.lstrip()) QREL_DEFS = {} expected_results = [ TrecQrel('Q0', 'D1', 3, '0'), TrecQrel('Q0', 'D2', 2, '1'), TrecQrel('Q0', 'D3', 3, '0'), TrecQrel('Q0', 'D2', 1, '1'), TrecQrel('Q1', 'D2', 1, '0'), ] qrels = TrecQrels(mock_file, QREL_DEFS) self.assertEqual(qrels.qrels_path(), 'MOCK') self.assertEqual(qrels.qrels_defs(), 
QREL_DEFS) self.assertEqual(list(qrels.qrels_iter()), expected_results) def test_qrels_bad_line(self): mock_file = StringFile(''' Q0 0 D1 3 Q0 1 D2 2 Q0 0\tD3 3 Q0 1 D2 1 BAD LINE Q1 0 D2 1 '''.lstrip()) QREL_DEFS = {} qrels = TrecQrels(mock_file, QREL_DEFS) with self.assertRaises(RuntimeError): list(qrels.qrels_iter()) def test_queries(self): mock_file = StringFile(''' Number: Q100A Topic: Some title <desc> Description: Descriptive text split on multiple lines <narr> Narrative: Further elaboration of the query intent split on multiple lines </top> <top> <num> 102 <title> Query 2 <desc> Q2 description <narr> Narrative: Q2 narrative </top> '''.lstrip()) expected_results = [ TrecQuery('Q100A', 'Some title', "Descriptive text\nsplit on multiple lines", 'Further elaboration of the query intent\nsplit on multiple lines'), TrecQuery('102', 'Query 2', "Q2 description", 'Q2 narrative'), ] queries = TrecQueries(mock_file) self.assertEqual(queries.queries_path(), 'MOCK') self.assertEqual(list(queries.queries_iter()), expected_results) def test_docs(self): mock_file = StringFile(''' <DOC> <DOCNO> D100A </DOCNO> <PARENT> Something </PARENT> <HT> Some text </HT> <HEADLINE> <AU> Header Text </AU> Daily Report </HEADLINE> <TEXT> Main body text on multiple lines with <F P=102> some markup </F> here. Also, some invalid <T> markup &. </TEXT> </DOC> <DOC> <DOCNO> 101 </DOCNO> <TEXT> More body text </TEXT> </DOC> '''.lstrip()) expected_results = [ TrecDoc(doc_id='D100A', text='\n\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n', marked_up_doc='<HEADLINE>\n<AU> Header Text </AU>\nDaily Report \n\n</HEADLINE>\n<TEXT>\nMain body text\non multiple lines\n\nwith <F P=102> some markup\n</F> here. Also, some invalid <T> markup &. 
\n</TEXT>\n'), TrecDoc(doc_id='101', text='\n\nMore body text\n\n', marked_up_doc='<TEXT>\nMore body text\n</TEXT>\n'), ] docs = TrecDocs(mock_file) self.assertEqual(docs.docs_path(), 'MOCK') self.assertEqual(list(docs.docs_iter()), expected_results) def tearDown(self): if os.path.exists('MOCK.pklz4'): shutil.rmtree('MOCK.pklz4') if __name__ == '__main__': unittest.main() ================================================ FILE: test/formats/test_tsv.py ================================================ import os from typing import NamedTuple, Tuple import shutil import unittest from ir_datasets.formats import TsvDocs, TsvQueries, TsvDocPairs from ir_datasets.util import StringFile class TestTsv(unittest.TestCase): def test_core(self): class data_type(NamedTuple): doc_id: str field1: str field2: str mock_file = StringFile(''' 123\tsome field\tanother field 123\t repeated entry \tshouldn't filter 456\tanother query\tsomething '''.lstrip()) expected_results = [ data_type('123', 'some field', 'another field'), data_type('123', ' repeated entry ', 'shouldn\'t filter'), data_type('456', 'another query', 'something'), ] queries = TsvQueries(mock_file, data_type) self.assertEqual(queries.queries_path(), 'MOCK') self.assertEqual(list(queries.queries_iter()), expected_results) docs = TsvDocs(mock_file, data_type) self.assertEqual(docs.docs_path(), 'MOCK') self.assertEqual(list(docs.docs_iter()), expected_results) docpairs = TsvDocPairs(mock_file, data_type) self.assertEqual(docpairs.docpairs_path(), 'MOCK') self.assertEqual(list(docpairs.docpairs_iter()), expected_results) def test_too_many_columns(self): class data_type(NamedTuple): doc_id: str field1: str field2: str mock_file = StringFile(''' 123\tsome field\tanother field 123\trepeated entry\tshouldn't filter\ttoo many columns 456\tanother query\tsomething '''.strip()) queries = TsvQueries(mock_file, data_type) with self.assertRaises(RuntimeError): list(queries.queries_iter()) docs = TsvDocs(mock_file, data_type) with 
self.assertRaises(RuntimeError): list(docs.docs_iter()) docpairs = TsvDocPairs(mock_file, data_type) with self.assertRaises(RuntimeError): list(docpairs.docpairs_iter()) def test_too_few_columns(self): class data_type(NamedTuple): doc_id: str field1: str field2: str mock_file = StringFile(''' 123\tsome field\tanother field 123\ttoo few fields 456\tanother query\tsomething '''.strip()) queries = TsvQueries(mock_file, data_type) with self.assertRaises(RuntimeError): list(queries.queries_iter()) docs = TsvDocs(mock_file, data_type) with self.assertRaises(RuntimeError): list(docs.docs_iter()) docpairs = TsvDocPairs(mock_file, data_type) with self.assertRaises(RuntimeError): list(docpairs.docpairs_iter()) def test_flex_columns(self): class data_type(NamedTuple): doc_id: str field1: str field2: Tuple[str, ...] mock_file = StringFile(''' 123\tsome field\tanother field 123\ttoo few fields 456\tanother query\tsomething 456\tanother query\tsomething\ttoo many fields\teven more '''.strip()) expected_results = [ data_type('123', 'some field', ('another field',)), data_type('123', 'too few fields', ()), data_type('456', 'another query', ('something',)), data_type('456', 'another query', ('something', 'too many fields', 'even more')), ] queries = TsvQueries(mock_file, data_type) self.assertEqual(queries.queries_path(), 'MOCK') self.assertEqual(list(queries.queries_iter()), expected_results) docs = TsvDocs(mock_file, data_type) self.assertEqual(docs.docs_path(), 'MOCK') self.assertEqual(list(docs.docs_iter()), expected_results) docpairs = TsvDocPairs(mock_file, data_type) self.assertEqual(docpairs.docpairs_path(), 'MOCK') self.assertEqual(list(docpairs.docpairs_iter()), expected_results) def tearDown(self): if os.path.exists('MOCK.pklz4'): shutil.rmtree('MOCK.pklz4') if __name__ == '__main__': unittest.main() ================================================ FILE: test/indices/__init__.py ================================================ 
================================================ FILE: test/indices/lz4_pickle.py ================================================ import tempfile import unittest import numpy as np from ir_datasets.indices import Lz4PickleLookup, FileAccess from ir_datasets.formats import GenericDoc class TestLz4PickleLookup(unittest.TestCase): def test_lz4_pickle_lookup(self): for file_access in FileAccess.__members__.values(): with tempfile.TemporaryDirectory() as d: idx = Lz4PickleLookup(d, GenericDoc, "doc_id", ['doc_id'], file_access=file_access) self.assertEqual(tuple(idx['id3', 'id2', 'id1', 'id4']), tuple()) self.assertEqual(tuple(iter(idx)), tuple()) with idx.transaction() as trans: trans.add(GenericDoc('id1', 'some text')) trans.add(GenericDoc('id2', 'some other text')) trans.add(GenericDoc('id3', 'short')) trans.add(GenericDoc('id2', 'duplicate - should overwrite')) self.assertEqual(list(idx['id1'])[0], GenericDoc('id1', 'some text')) self.assertEqual(list(idx['id2'])[0], GenericDoc('id2', 'duplicate - should overwrite')) self.assertEqual(list(idx['id3'])[0], GenericDoc('id3', 'short')) results = tuple(idx['id3', 'id2', 'id1', 'id4']) self.assertEqual(results, (GenericDoc('id1', 'some text'), GenericDoc('id3', 'short'), GenericDoc('id2', 'duplicate - should overwrite'))) with idx.transaction() as trans: trans.add(GenericDoc('id1', 'new text')) trans.add(GenericDoc('id4', 'new doc')) self.assertEqual(list(idx['id1'])[0], GenericDoc('id1', 'new text')) self.assertEqual(list(idx['id2'])[0], GenericDoc('id2', 'duplicate - should overwrite')) self.assertEqual(list(idx['id3'])[0], GenericDoc('id3', 'short')) self.assertEqual(list(idx['id4'])[0], GenericDoc('id4', 'new doc')) results = tuple(idx['id3', 'id2', 'id1', 'id4']) self.assertEqual(results, (GenericDoc('id3', 'short'), GenericDoc('id2', 'duplicate - should overwrite'), GenericDoc('id1', 'new text'), GenericDoc('id4', 'new doc'))) with idx.transaction() as trans: trans.add(GenericDoc('id1', 'newer text')) 
trans.add(GenericDoc('id4', 'newer doc')) trans.rollback() self.assertEqual(list(idx['id1'])[0], GenericDoc('id1', 'new text')) self.assertEqual(list(idx['id2'])[0], GenericDoc('id2', 'duplicate - should overwrite')) self.assertEqual(list(idx['id3'])[0], GenericDoc('id3', 'short')) self.assertEqual(list(idx['id4'])[0], GenericDoc('id4', 'new doc')) results = tuple(idx['id3', 'id2', 'id1', 'id4']) self.assertEqual(results, (GenericDoc('id3', 'short'), GenericDoc('id2', 'duplicate - should overwrite'), GenericDoc('id1', 'new text'), GenericDoc('id4', 'new doc'))) idx.close() if __name__ == '__main__': unittest.main() ================================================ FILE: test/indices/numpy_sorted.py ================================================ import tempfile import unittest import numpy as np from ir_datasets.indices import NumpySortedIndex class TestNumpySortedIndex(unittest.TestCase): def test_numpy_sorted_index(self): with tempfile.NamedTemporaryFile() as f: idx = NumpySortedIndex(f.name) values = idx['key', 'key1231', 'key', 'missing'] self.assertEqual(len(idx), 0) self.assertEqual(tuple(iter(idx)), tuple()) values = idx['key', 'key1231', 'key', 'missing'] self.assertEqual(values[0], -1) self.assertEqual(values[1], -1) self.assertEqual(values[2], -1) self.assertEqual(values[3], -1) idx.add('k', 1) idx.add('key', 3) idx.add('key4', 2) idx.add('key1231', 4) idx.add('k', 3) idx.commit() self.assertEqual(len(idx), 4) values = idx['key', 'key1231', 'key', 'missing'] self.assertEqual(values[0], 3) self.assertEqual(values[1], 4) self.assertEqual(values[2], 3) self.assertEqual(values[3], -1) idx.add('key', 5) idx.add('key4', 1) idx.add('key5', 8) idx.commit() self.assertEqual(len(idx), 5) values = idx['key', 'key1231', 'key'] self.assertEqual(values[0], 5) self.assertEqual(values[1], 4) self.assertEqual(values[2], 5) values = idx['key4'] self.assertEqual(values[0], 1) idx.close() self.assertEqual(len(idx), 5) values = idx['key4'] self.assertEqual(values[0], 1) 
class TestAntique(DatasetIntegrationTest):
    """Integration tests for the ANTIQUE datasets.

    Each test spot-checks a dataset's record count plus a few records at
    fixed positions (first, tenth, last), delegating the actual iteration
    and comparison to the DatasetIntegrationTest helpers.
    """

    def test_antique(self):
        # Full corpus: 403,666 passages.
        self._test_docs('antique', count=403_666, items={
            0: GenericDoc(doc_id="2020338_0", text="A small group of politicians believed strongly that the fact that Saddam Hussien remained in power after the first Gulf War was a signal of weakness to the rest of the world, one that invited attacks and terrorism. Shortly after taking power with George Bush in 2000 and after the attack on 9/11, they were able to use the terrorist attacks to justify war with Iraq on this basis and exaggerated threats of the development of weapons of mass destruction. The military strength of the U.S. and the brutality of Saddam's regime led them to imagine that the military and political victory would be relatively easy."),
            9: GenericDoc(doc_id="1908421_1", text="The gas mileage for some hybrids is better in the city because they spend more time using their electric motor instead of their gas engine. Most hybrids have battery packs which are charged by what is called \"generative breaking\". Instead of slowing down with traditional braking, these cars store some of the energy from braking into a battery pack. Then when the light changes and the car moves again, the energy stored in the battery can be used to move the car, or assist the engine in moving the car.. . In the city this effect can be more pronounced because of more frequent stopping and starting."),
            403665: GenericDoc(doc_id="1424320_9", text="You can finance up to 100% of the property value. The only kicker is, the terms of your loan may not be as good. Higher interest rates or origination fees may abound. A simple way to figure whether or not this will be a good idea is to calculate your potential income and your expenses and initial sunk costs. This is a gross oversimplification and I would recommend that you speak to a qualified CPA before you make this leap. As usual, I recommend being very conservative in your estimates, assume a few worst case scenarios (ie no rental income between renters) and if the numbers still come out good... then go for it! :) Good luck!. . **EDIT. . I almost forgot to give you a specific loan type!! You can ask for an 80/20 loan. Where you get a traditional loan for 80% of the value with no PMI (private mortgage insurance), and get another 20% loan for the downpayment (this will be at a higher interest rate and you should pay this one down as quickly as possible). Ask your loan officer about this. They may also be able to offer you other alternatives. Good luck! :)"),
        })

    def test_antique_train(self):
        # Train split: queries and graded qrels.
        self._test_queries('antique/train', count=2_426, items={
            0: GenericQuery(query_id='3097310', text='What causes severe swelling and pain in the knees?'),
            9: GenericQuery(query_id='992730', text='How do you transfer voicemail messages onto tape?'),
            2425: GenericQuery(query_id='4086230', text='See I have lost my voice what do I do?'),
        })
        self._test_qrels('antique/train', count=27422, items={
            0: TrecQrel(query_id='2531329', doc_id='2531329_0', relevance=4, iteration='U0'),
            9: TrecQrel(query_id='3825668', doc_id='3825668_4', relevance=4, iteration='Q0'),
            27421: TrecQrel(query_id='884731', doc_id='884731_1', relevance=3, iteration='Q0')
        })

    def test_antique_train_split200train(self):
        # Train split minus the 200 held-out validation queries.
        self._test_queries('antique/train/split200-train', count=2_226, items={
            0: GenericQuery(query_id='3097310', text='What causes severe swelling and pain in the knees?'),
            9: GenericQuery(query_id='3486120', text='Why does PAMELA ANDERSON ........NOT CARE about Children?'),
            2225: GenericQuery(query_id='4086230', text='See I have lost my voice what do I do?'),
        })
        self._test_qrels('antique/train/split200-train', count=25229, items={
            0: TrecQrel(query_id='2531329', doc_id='2531329_0', relevance=4, iteration='U0'),
            9: TrecQrel(query_id='3825668', doc_id='3825668_4', relevance=4, iteration='Q0'),
            25228: TrecQrel(query_id='884731', doc_id='884731_1', relevance=3, iteration='Q0')
        })

    def test_antique_train_split200valid(self):
        # The 200 held-out validation queries.
        self._test_queries('antique/train/split200-valid', count=200, items={
            0: GenericQuery(query_id='1907320', text='How do I get college money?'),
            9: GenericQuery(query_id='3083719', text='How do you safely wean a person off Risperidal?'),
            199: GenericQuery(query_id='2573745', text='How did African American women get the right to Vote?'),
        })
        self._test_qrels('antique/train/split200-valid', count=2193, items={
            0: TrecQrel(query_id='2550445', doc_id='2550445_0', relevance=4, iteration='U0'),
            9: TrecQrel(query_id='196651', doc_id='196651_1', relevance=4, iteration='Q0'),
            2192: TrecQrel(query_id='344029', doc_id='344029_4', relevance=4, iteration='Q0')
        })

    def test_antique_test(self):
        # Test split (includes queries later flagged as offensive).
        self._test_queries("antique/test", count=200, items={
            0: GenericQuery(query_id='3990512', text='how can we get concentration onsomething?'),
            9: GenericQuery(query_id='1783010', text='What is Blaphsemy?'),
            199: GenericQuery(query_id='1971899', text='what is masturbat***?'),
        })
        self._test_qrels('antique/test', count=6589, items={
            0: TrecQrel(query_id='1964316', doc_id='1964316_5', relevance=4, iteration='U0'),
            9: TrecQrel(query_id='1964316', doc_id='1964316_2', relevance=4, iteration='Q0'),
            6588: TrecQrel(query_id='1262692', doc_id='3699008_1', relevance=2, iteration='Q0')
        })

    def test_antique_test_nonoffensive(self):
        # Filtered test split: 176 of the 200 test queries remain.
        self._test_queries('antique/test/non-offensive', count=176, items={
            0: GenericQuery(query_id='3990512', text='how can we get concentration onsomething?'),
            9: GenericQuery(query_id='1783010', text='What is Blaphsemy?'),
            175: GenericQuery(query_id='1340574', text='Why do some people only go to church on Easter Sunday and never go again until Christmas ?')
        })
        self._test_qrels('antique/test/non-offensive', count=5752, items={
            0: TrecQrel(query_id='1964316', doc_id='1964316_5', relevance=4, iteration='U0'),
            9: TrecQrel(query_id='1964316', doc_id='1964316_2', relevance=4, iteration='Q0'),
            5751: TrecQrel(query_id='1262692', doc_id='3699008_1', relevance=2, iteration='Q0')
        })
class TestAolIa(DatasetIntegrationTest):
    """Integration tests for the AOL-IA (AOL query log + Internet Archive) dataset.

    Long page bodies are matched with re.compile patterns rather than full
    literals; flags=48 is re.DOTALL | re.UNICODE.
    """

    def test_docs(self):
        # 1,525,586 archived pages; last doc has an empty body text.
        self._test_docs('aol-ia', count=1525586, items={
            0: AolIaDoc('00002a94464d', 'Alchimie Forever skin care for men and women', re.compile("^require \\( `` include/config\\.php '' \\) ; if \\( \\$ PHPSESSID \\) session_start \\( \\$ PHPSESSID \\) ; else sessi.{328} Terms and conditions © 2005 Alchimie Forever Sàrl \\. All rights reserved \\. Design : Agence Virtuelle$", flags=48), 'http://www.alchimie-forever.com', 'https://web.archive.org/web/20060218092031/http://www.alchimie-forever.com:80/'),
            9: AolIaDoc('00007d6c3dd3', 'Pinehurst Tea Room & Caterering', re.compile('^We have had visitors \\. Welcome to Pinehurst Tea Room \\. This beautifully restored Victorian house is .{456}n please contact Lynda Dubbs at 770\\-474\\-7997 or feel free to email her at pinehursttearoom @ aol\\.com$', flags=48), 'http://www.pinehursttearoom.com', 'https://web.archive.org/web/20060209164740/http://www.pinehursttearoom.com:80/'),
            1525585: AolIaDoc('fffff6b18440', 'Golf School - Arizona Golf School , Florida Golf School , Calfornia Golf School', '', 'http://lvgolfschools.com', 'https://web.archive.org/web/20060211025934/http://www.lvgolfschools.com:80/'),
        })

    def test_queries(self):
        # ~10M unique queries.
        self._test_queries('aol-ia', count=9966939, items={
            0: GenericQuery('8c418e7c9e5993', 'rentdirect com'),
            9: GenericQuery('c8476c36af8761', 'www elaorg'),
            9966938: GenericQuery('bba88dc56436eb', 'c21curabba'),
        })

    def test_qrels(self):
        # Click-derived qrels; iteration field carries the user/session id.
        self._test_qrels('aol-ia', count=19442629, items={
            0: TrecQrel('50aa67fe786ca7', '430d8aa747a3', 1, '142'),
            9: TrecQrel('f6eff9e0848e2d', 'ecd6d884243b', 1, '217'),
            19442628: TrecQrel('14c1b5b54212ad', 'a114f6d94af0', 1, '24967361'),
        })

    def test_qlog(self):
        # Raw query-log entries; the trailing tuple holds clicked LogItems
        # (empty when the log row recorded no click).
        self._test_qlogs('aol-ia', count=36389567, items={
            0: AolQlog('142', '8c418e7c9e5993', 'rentdirect com', 'rentdirect.com', datetime.datetime(2006, 3, 1, 7, 17, 12), ()),
            6: AolQlog('142', '50aa67fe786ca7', 'westchester gov', 'westchester.gov', datetime.datetime(2006, 3, 20, 3, 55, 57), (LogItem('430d8aa747a3', '1', True),)),
            9: AolQlog('142', 'b52c96bea30646', 'dfdf', 'dfdf', datetime.datetime(2006, 3, 24, 22, 23, 14), ()),
            36389566: AolQlog('24969339', 'a03587795a216c', 'free credit report', 'free credit report', datetime.datetime(2006, 5, 31, 0, 42, 17), ()),
        })
class TestAquaint(DatasetIntegrationTest):
    """Integration tests for the AQUAINT collection and TREC Robust 2005.

    Document bodies are matched with re.compile patterns (flags=48 is
    re.DOTALL | re.UNICODE); each TrecDoc carries both the extracted text
    and the raw SGML body.
    """

    def test_docs(self):
        # 1,033,461 newswire documents (APW/NYT/XIE).
        self._test_docs('aquaint', count=1033461, items={
            0: TrecDoc('APW19980601.0003', re.compile("^\n\nDoohan calls for upgrade to 1000cc bikes \n\n\n\\\t SYDNEY, Australia \\(AP\\) _ Four\\-time world 500cc mot.{1909}r\\-stroke grand prix\nbikes there's plenty of other championships for them to race in\\.''\n \\&UR; \\(tjh\\)\n\n$", flags=48), re.compile("^<HEADLINE>\nDoohan calls for upgrade to 1000cc bikes \n</HEADLINE>\n<TEXT>\n\\\t SYDNEY, Australia \\(AP\\) _.{1942}e grand prix\nbikes there's plenty of other championships for them to race in\\.''\n \\&UR; \\(tjh\\)\n</TEXT>\n$", flags=48)),
            9: TrecDoc('APW19980601.0028', re.compile("^\n\nForeign minister again denounces nuclear tests \n\n\n\\\t CANBERRA, Australia \\(AP\\) _ Australia does no.{988} Pakistan to\nimmediately sign and ratify the comprehensive nuclear ban treaty,''\nDowner said\\. \\\t \n\n$", flags=48), re.compile("^<HEADLINE>\nForeign minister again denounces nuclear tests \n</HEADLINE>\n<TEXT>\n\\\t CANBERRA, Australi.{1021}an to\nimmediately sign and ratify the comprehensive nuclear ban treaty,''\nDowner said\\. \\\t \n</TEXT>\n$", flags=48)),
            1033460: TrecDoc('XIE20000930.0369', re.compile('^\n\n 2000\\-09\\-30 \n Argentine President Meets With Indonesian Counterpart \n\n\nArgentine Agriculture Minis.{488} on\nThursday\\.\n\n\nThe Indonesian president is scheduled to leave here for Chile on\nSaturday night\\. \n\n\n$', flags=48), re.compile('^<DOC>\n<DATE_TIME> 2000\\-09\\-30 </DATE_TIME>\n<BODY>\n<HEADLINE> Argentine President Meets With Indonesia.{599} Indonesian president is scheduled to leave here for Chile on\nSaturday night\\. \n</P>\n</TEXT>\n</BODY>\n$', flags=48)),
        })

    def test_queries(self):
        # TREC Robust 2005: 50 topics with title/description/narrative.
        self._test_queries('aquaint/trec-robust-2005', count=50, items={
            0: TrecQuery('303', 'Hubble Telescope Achievements', 'Identify positive accomplishments of the Hubble telescope since it\nwas launched in 1991.', 'Documents are relevant that show the Hubble telescope has produced\nnew data, better quality data than previously available, data that\nhas increased human knowledge of the universe, or data that has led\nto disproving previously existing theories or hypotheses. Documents\nlimited to the shortcomings of the telescope would be irrelevant.\nDetails of repairs or modifications to the telescope without\nreference to positive achievements would not be relevant.'),
            9: TrecQuery('344', 'Abuses of E-Mail', 'The availability of E-mail to many people through their\njob or school affiliation has allowed for many efficiencies\nin communications but also has provided the opportunity for\nabuses. What steps have been taken world-wide by those\nbearing the cost of E-mail to prevent excesses?', "To be relevant, a document will concern dissatisfaction by\nan entity paying for the cost of electronic mail. Particularly\nsought are items which relate to system users (such as employees)\nwho abuse the system by engaging in communications of the type\nnot related to the payer's desired use of the system."),
            49: TrecQuery('689', 'family-planning aid', 'To which countries does the U.S. provide aid to support family planning,\nand for which countries has the U.S. refused or limited support?', 'Relevant documents indicate where U.S. aid supports\nfamily planning or where such aid has been denied.\nDiscussions of why aid for family planning has been refused are\nalso relevant. Documents that mention U.S. aid to countries,\nbut not specifically for family planning are not relevant.\nDescriptions of funds for family planning in the U.S. itself are not relevant.'),
        })

    def test_qrels(self):
        # TREC Robust 2005 judgments.
        self._test_qrels('aquaint/trec-robust-2005', count=37798, items={
            0: TrecQrel('303', 'APW19980609.1531', 2, '0'),
            9: TrecQrel('303', 'APW19981117.0914', 0, '0'),
            37797: TrecQrel('689', 'XIE20000925.0055', 0, '0'),
        })
None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."), aspects=[], aspects_names="", source_id="c67482ba-2019-04-18T13:32:05Z", source_title="Debate Argument: Contraceptive Forms for High School Students | Debate.org", source_url="https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/", source_previous_argument_id=None, source_next_argument_id="c67482ba-2019-04-18T13:32:05Z-00001-000", source_domain=None, source_text=None, source_text_conclusion_start=None, source_text_conclusion_end=None, source_text_premise_start=None, source_text_premise_end=None, topic="Contraceptive Forms for High School Students", acquisition=datetime.fromisoformat( "2019-04-18T13:32:05+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=None, ), 387691: ArgsMeDoc( doc_id="671509c8-2019-04-17T11:47:34Z-00007-000", conclusion="Charter schools", premises=[ ArgsMePremise( text="Charter schools are exploited most by affable students", stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts="Charter schools are exploited most by affable students", aspects=[], aspects_names="", source_id="671509c8-2019-04-17T11:47:34Z", source_title="Debate: Charter schools - Debatepedia", source_url="http://www.debatepedia.org/en/index.php/Debate:_Charter_schools", source_previous_argument_id="671509c8-2019-04-17T11:47:34Z-00022-000", source_next_argument_id="671509c8-2019-04-17T11:47:34Z-00057-000", source_domain=None, source_text=None, source_text_conclusion_start=None, source_text_conclusion_end=None, source_text_premise_start=None, source_text_premise_end=None, topic="Charter schools", acquisition=datetime.fromisoformat("2019-04-17T11:47:34+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=None, ), }) self._test_docs("argsme/1.0-cleaned", count=382545, items={ 0: ArgsMeDoc( doc_id="c67482ba-2019-04-18T13:32:05Z-00000-000", conclusion="Contraceptive Forms 
for High School Students", premises=[ ArgsMePremise( text=compile("My opponent forfeited every round\. None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."), stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts=compile("My opponent forfeited every round\. None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."), aspects=[], aspects_names="", source_id="c67482ba-2019-04-18T13:32:05Z", source_title="Debate Argument: Contraceptive Forms for High School Students | Debate.org", source_url="https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/", source_previous_argument_id=None, source_next_argument_id="c67482ba-2019-04-18T13:32:05Z-00001-000", source_domain=None, source_text=None, source_text_conclusion_start=None, source_text_conclusion_end=None, source_text_premise_start=None, source_text_premise_end=None, topic="Contraceptive Forms for High School Students", acquisition=datetime.fromisoformat("2019-04-18T13:32:05+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=None, ), 382544: ArgsMeDoc( doc_id="671509c8-2019-04-17T11:47:34Z-00007-000", conclusion="Charter schools", premises=[ ArgsMePremise( text="Charter schools are exploited most by affable students", stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts="Charter schools are exploited most by affable students", aspects=[], aspects_names="", source_id="671509c8-2019-04-17T11:47:34Z", source_title="Debate: Charter schools - Debatepedia", source_url="http://www.debatepedia.org/en/index.php/Debate:_Charter_schools", source_previous_argument_id="671509c8-2019-04-17T11:47:34Z-00022-000", source_next_argument_id="671509c8-2019-04-17T11:47:34Z-00057-000", source_domain=None, source_text=None, source_text_conclusion_start=None, source_text_conclusion_end=None, source_text_premise_start=None, source_text_premise_end=None, topic="Charter schools", 
acquisition=datetime.fromisoformat("2019-04-17T11:47:34+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=None, ), }) self._test_docs("argsme/2020-04-01", count=387740, items={ 0: ArgsMeDoc( doc_id="Sb38112c8-A443a9828", conclusion="school", premises=[ ArgsMePremise( text="Done.", stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts="Done.", aspects=[], aspects_names="", source_id="Sb38112c8", source_title="Debate: school | Debate.org", source_url="https://www.debate.org/debates/school/3/", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debateorg, source_text=compile("DEBATES OPINIONS FORUMS POLLS Google Search My Deb.{3149}p Version © 2019 Debate\.org\. All rights reserved\. "), source_text_conclusion_start=1630, source_text_conclusion_end=1636, source_text_premise_start=2664, source_text_premise_end=2670, topic="school", acquisition=datetime.fromisoformat("2019-04-18T17:49:41+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), 387739: ArgsMeDoc( doc_id="S153cd52f-A118dded8", conclusion=compile("The recent throne speech said that the government .{27}o ensure that our communities continue to be safe\."), premises=[ ArgsMePremise( text=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."), stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."), aspects=[ ArgsMeAspect( name="Pornography", weight=3, normalized_weight=1, rank=1, ), ], aspects_names="Pornography", source_id="S153cd52f", source_title="4129762", source_url=None, source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.canadian_parliament, source_text=compile("How can Canadians trust the 
Liberals when they say.{1050}eplaced by the system changers in the opposition\. "), source_text_conclusion_start=149, source_text_conclusion_end=276, source_text_premise_start=0, source_text_premise_end=1149, topic="Child Pornography", acquisition=datetime.fromisoformat("2019-07-25T09:33:44.811404+00:00"), date=datetime.fromisoformat("1999-10-17T22:00:00+00:00"), author="Paul Forseth", author_image_url="https://www.ourcommons.ca/Parliamentarians/Images/OfficialMPPhotos/38/Forsep.JPG", author_organization="Reform", author_role="Opposition", mode=ArgsMeMode.person, ), }) self._test_docs("argsme/2020-04-01/debateorg", count=338620, items={ 0: ArgsMeDoc( doc_id="Sb38112c8-A443a9828", conclusion="school", premises=[ ArgsMePremise( text="Done.", stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts="Done.", aspects=[], aspects_names="", source_id="Sb38112c8", source_title="Debate: school | Debate.org", source_url="https://www.debate.org/debates/school/3/", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debateorg, source_text=compile("DEBATES OPINIONS FORUMS POLLS Google Search My Deb.{3149}p Version © 2019 Debate\.org\. All rights reserved\. "), source_text_conclusion_start=1630, source_text_conclusion_end=1636, source_text_premise_start=2664, source_text_premise_end=2670, topic="school", acquisition=datetime.fromisoformat("2019-04-18T17:49:41+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), 338619: ArgsMeDoc( doc_id="Sca72da7d-Adbd84fd2", conclusion="It Should be Legal in the U.S. to Occasionally Hit Someone", premises=[ ArgsMePremise( text=compile("In this debate, I will argue that occasionally hit.{2302}fe\. My opponent may begin his argument\. Good luck\."), stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts=compile("In this debate, I will argue that occasionally hit.{2302}fe\. My opponent may begin his argument\. 
Good luck\."), aspects=[ ArgsMeAspect( name="Obesity", weight=1, normalized_weight=1, rank=1, ), ], aspects_names="Obesity", source_id="Sca72da7d", source_title="Debate Topic: It Should be Legal in the U.S. to Occasionally Hit Someone | Debate.org", source_url="https://www.debate.org/debates/It-Should-be-Legal-in-the-U.S.-to-Occasionally-Hit-Someone/1/", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debateorg, source_text=compile("DEBATES OPINIONS FORUMS POLLS Google Search My Deb.{26529}p Version © 2019 Debate\.org\. All rights reserved\. "), source_text_conclusion_start=1677, source_text_conclusion_end=1735, source_text_premise_start=2391, source_text_premise_end=4794, topic="It Should be Legal in the U.S. to Occasionally Hit Someone", acquisition=datetime.fromisoformat("2019-04-18T19:21:03+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), }) self._test_docs("argsme/2020-04-01/debatepedia", count=21197, items={ 0: ArgsMeDoc( doc_id="S96f2396e-Aaf079b43", conclusion="Mine Ban Treaty (Ottawa Treaty)", premises=[ ArgsMePremise( text="Casualties in repelling N. Korean invasion would be higher w/o mines", stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts="Casualties in repelling N. Korean invasion would be higher w/o mines", aspects=[], aspects_names="", source_id="S96f2396e", source_title="Debate: Mine Ban Treaty (Ottawa Treaty) - Debatepedia", source_url="http://www.debatepedia.org/en/index.php/Debate:_Mine_Ban_Treaty_%28Ottawa_Treaty%29", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debatepedia, source_text=compile("Welcome to Debatepedia! \| About \| Help \| FAQ \| Med.{33182} Disclaimers Problem with the site\?  Edit Close \. 
"), source_text_conclusion_start=1063, source_text_conclusion_end=1094, source_text_premise_start=18501, source_text_premise_end=18569, topic="Mine Ban Treaty (Ottawa Treaty)", acquisition=datetime.fromisoformat("2019-04-17T11:47:26+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), 21196: ArgsMeDoc( doc_id="S148bb110-A119d66b0", conclusion="Environmental impact of barages is ugly.", premises=[ ArgsMePremise( text=compile("Barages are fairly massive objects, like Dams, tha.{160} and possibly reduced property values and tourism\."), stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts=compile("Barages are fairly massive objects, like Dams, tha.{160} and possibly reduced property values and tourism\."), aspects=[], aspects_names="", source_id="S148bb110", source_title="Debate: Tidal energy - Debatepedia", source_url="http://www.debatepedia.org/en/index.php/Debate:_Tidal_energy", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debatepedia, source_text=compile("Welcome to Debatepedia! \| About \| Help \| FAQ \| Med.{28127} Disclaimers Problem with the site\?  Edit Close \. "), source_text_conclusion_start=22030, source_text_conclusion_end=22070, source_text_premise_start=22070, source_text_premise_end=22331, topic="Tidal energy", acquisition=datetime.fromisoformat("2019-04-17T11:47:38+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), }) self._test_docs("argsme/2020-04-01/debatewise", count=14353, items={ 0: ArgsMeDoc( doc_id="S5920cdef-A982becb7", conclusion="placebo effect and phenylthiamine", premises=[ ArgsMePremise( text=compile("But are chocolate eaters happy\? \[\[http://news\.bbc\..{623} health !!! as it increases our colestrol in body\."), stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts=compile("But are chocolate eaters happy\? 
\[\[http://news\.bbc\..{623} health !!! as it increases our colestrol in body\."), aspects=[], aspects_names="", source_id="S5920cdef", source_title="Is chocolate good for you? - DebateWise", source_url="https://debatewise.org/debates/1904-is-chocolate-good-for-you/", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debatewise, source_text=compile("Browse Our Categories Is chocolate good for you\? L.{10743}me \| About Us \| Privacy & Contact Us Top wpDiscuz "), source_text_conclusion_start=751, source_text_conclusion_end=784, source_text_premise_start=1667, source_text_premise_end=2390, topic="Is chocolate good for you?", acquisition=datetime.fromisoformat("2019-04-19T12:46:26+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), 14352: ArgsMeDoc( doc_id="Sc47177f2-A730d8f9e", conclusion="NO", premises=[ ArgsMePremise( text=compile("‎:/ Adolf Hitler was not evil he wasn't a murderer.{244}wer Told You To Jump Off A Bridge Would You Do It\?"), stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts=compile("‎:/ Adolf Hitler was not evil he wasn't a murderer.{244}wer Told You To Jump Off A Bridge Would You Do It\?"), aspects=[ ArgsMeAspect( name="Jews", weight=2, normalized_weight=0.6666666666666666, rank=1, ), ArgsMeAspect( name="Adolf Hitler", weight=1, normalized_weight=0.3333333333333333, rank=2, ), ], aspects_names="Jews Adolf Hitler", source_id="Sc47177f2", source_title="Adolf Hitler Does Not Deserve His Reputation as Evil - DebateWise", source_url="https://debatewise.org/debates/357-adolf-hitler-does-not-deserve-his-reputation-as-evil/", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debatewise, source_text=compile("Browse Our Categories Adolf Hitler Does Not Deserv.{54510} Us \| Privacy & Contact Us Top wpDiscuz 113shares "), source_text_conclusion_start=43380, 
source_text_conclusion_end=43382, source_text_premise_start=43383, source_text_premise_end=43727, topic="Adolf Hitler Does Not Deserve His Reputation as Evil", acquisition=datetime.fromisoformat("2019-04-19T12:44:52+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), }) self._test_docs("argsme/2020-04-01/idebate", count=13522, items={ 0: ArgsMeDoc( doc_id="Sf9294c83-Af186e851", conclusion="the War in Iraq was Worth the Cost", premises=[ ArgsMePremise( text="His removal provides stability and security not only for Iraq but for the Middle East as a region", stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts="His removal provides stability and security not only for Iraq but for the Middle East as a region", aspects=[], aspects_names="", source_id="Sf9294c83", source_title="This House Believes that the War in Iraq was Worth the Cost | idebate.org", source_url="https://idebate.org/debatabase/international-middle-east-politics-terrorism-warpeace/house-believes-war-iraq-was-worth", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.idebate, source_text=compile("idebate\.org Educational and informative news and r.{17520}in with\.\.\. Login with Facebook Login with Twitter "), source_text_conclusion_start=196, source_text_conclusion_end=230, source_text_premise_start=8137, source_text_premise_end=8234, topic="the War in Iraq was Worth the Cost", acquisition=datetime.fromisoformat("2019-04-19T12:40:25+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), 13521: ArgsMeDoc( doc_id="Sd6cf79d9-Af8d9e187", conclusion="Socialism is a more secure system than the free market in Capitalism", premises=[ ArgsMePremise( text=compile("In order to avoid economic crisis there is a need .{1343}agall\. Financial Times\. 
Retrieved June 14, 2011 1\."), stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts=compile("In order to avoid economic crisis there is a need .{1343}agall\. Financial Times\. Retrieved June 14, 2011 1\."), aspects=[ ArgsMeAspect( name="Capitalism", weight=2, normalized_weight=0.6666666666666666, rank=1, ), ArgsMeAspect( name="Socialism", weight=1, normalized_weight=0.3333333333333333, rank=2, ), ], aspects_names="Capitalism Socialism", source_id="Sd6cf79d9", source_title="This House believes that capitalism is better than socialism | idebate.org", source_url="https://idebate.org/debatabase/economy-economy-general-philosophy-political-philosophy/house-believes-capitalism-better", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.idebate, source_text=compile("idebate\.org Educational and informative news and r.{29663}in with\.\.\. Login with Facebook Login with Twitter "), source_text_conclusion_start=19722, source_text_conclusion_end=19790, source_text_premise_start=21487, source_text_premise_end=22930, topic="capitalism is better than socialism", acquisition=datetime.fromisoformat("2019-04-19T12:39:56+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, ), }) self._test_docs("argsme/2020-04-01/parliamentary", count=48, items={ 0: ArgsMeDoc( doc_id="S1f6b58eb-A5c530110", conclusion=compile("I want them to know that their braids, their dread.{165} boardroom, and yes, even here on Parliament Hill\."), premises=[ ArgsMePremise( text=compile("I want them to know that their braids, their dread.{165} boardroom, and yes, even here on Parliament Hill\."), stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts=compile("I want them to know that their braids, their dread.{165} boardroom, and yes, even here on Parliament Hill\."), aspects=[ ArgsMeAspect( name="Women", weight=2, normalized_weight=0.6666666666666666, rank=1, ), ArgsMeAspect( 
name="Woman", weight=1, normalized_weight=0.3333333333333333, rank=2, ), ], aspects_names="Women Woman", source_id="S1f6b58eb", source_title="4718632", source_url=None, source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.canadian_parliament, source_text=compile("This week I have my hair in braids, much like I ha.{1066}boardroom, and yes, even here on Parliament Hill\. "), source_text_conclusion_start=900, source_text_conclusion_end=1165, source_text_premise_start=0, source_text_premise_end=1165, topic="Body Shaming", acquisition=datetime.fromisoformat("2019-07-25T09:33:44.814585+00:00"), date=datetime.fromisoformat("2017-09-19T22:00:00+00:00"), author="Celina Caesar-Chavannes", author_image_url="https://www.ourcommons.ca/Parliamentarians/Images/OfficialMPPhotos/42/CaesarChavannesCelina_Lib.jpg", author_organization="Liberal", author_role="Government", mode=ArgsMeMode.person, ), 47: ArgsMeDoc( doc_id="S153cd52f-A118dded8", conclusion=compile("The recent throne speech said that the government .{27}o ensure that our communities continue to be safe\."), premises=[ ArgsMePremise( text=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."), stance=ArgsMeStance.CON, annotations=[], ), ], premises_texts=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."), aspects=[ ArgsMeAspect( name="Pornography", weight=3, normalized_weight=1, rank=1, ), ], aspects_names="Pornography", source_id="S153cd52f", source_title="4129762", source_url=None, source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.canadian_parliament, source_text=compile("How can Canadians trust the Liberals when they say.{1050}eplaced by the system changers in the opposition\. 
"), source_text_conclusion_start=149, source_text_conclusion_end=276, source_text_premise_start=0, source_text_premise_end=1149, topic="Child Pornography", acquisition=datetime.fromisoformat("2019-07-25T09:33:44.811404+00:00"), date=datetime.fromisoformat("1999-10-17T22:00:00+00:00"), author="Paul Forseth", author_image_url="https://www.ourcommons.ca/Parliamentarians/Images/OfficialMPPhotos/38/Forsep.JPG", author_organization="Reform", author_role="Opposition", mode=ArgsMeMode.person, ), }) self._test_docs("argsme/2020-04-01/processed", count=365408, items={ 0: ArgsMeProcessedDoc( doc_id="Sf9294c83-Af186e851", conclusion="the War in Iraq was Worth the Cost", premises=[ ArgsMePremise( text=compile("His removal provides stability and .{21} Iraq but for the Middle East as a region"), stance=ArgsMeStance.PRO, annotations=[], ), ], premises_texts=compile("His removal provides stability and .{21} Iraq but for the Middle East as a region"), aspects=[], aspects_names="", source_id="Sf9294c83", source_title="This House Believes that the War in Iraq was Worth the Cost | idebate.org", source_url="https://idebate.org/debatabase/international-middle-east-politics-terrorism-warpeace/house-believes-war-iraq-was-worth", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.idebate, source_text=compile("idebate.org Educational and informative news and .{17542} Facebook Login with Twitter "), source_text_conclusion_start=196, source_text_conclusion_end=230, source_text_premise_start=8137, source_text_premise_end=8234, topic="the War in Iraq was Worth the Cost", acquisition=datetime.fromisoformat("2019-04-19T12:40:25+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, sentences=[ ArgsMeSentence( id="Sf9294c83-Af186e851__PREMISE__1", text="His removal provides stability and security not only for Iraq but for the Middle East as a region" ), ArgsMeSentence( 
id="Sf9294c83-Af186e851__CONC__1", text="the War in Iraq was Worth the Cost" ) ], ), 365407: ArgsMeProcessedDoc( doc_id="S148bb110-A119d66b0", conclusion="Environmental impact of barages is ugly.", premises=[ ArgsMePremise( text=compile("Barages are fairly massive objects, like .{190} property values and tourism."), stance=ArgsMeStance.PRO, annotations=[] ) ], premises_texts=compile("Barages are fairly massive objects, like .{190} property values and tourism."), aspects=[], aspects_names="", source_id="S148bb110", source_title="Debate: Tidal energy - Debatepedia", source_url="http://www.debatepedia.org/en/index.php/Debate:_Tidal_energy", source_previous_argument_id=None, source_next_argument_id=None, source_domain=ArgsMeSourceDomain.debatepedia, source_text=compile("Welcome to Debatepedia! \| About \| Help \| FAQ \| Media Kit Personal .{28131} with the site\?\xa0 Edit Close . "), source_text_conclusion_start=22030, source_text_conclusion_end=22070, source_text_premise_start=22070, source_text_premise_end=22331, topic="Tidal energy", acquisition=datetime.fromisoformat("2019-04-17T11:47:38+00:00"), date=None, author=None, author_image_url=None, author_organization=None, author_role=None, mode=ArgsMeMode.discussion, sentences=[ ArgsMeSentence( id="S148bb110-A119d66b0__PREMISE__1", text="Barages are fairly massive objects, like Dams, that obstruct the natural flow of water and can, subsequently, have harmful environmental impacts." ), ArgsMeSentence( id="S148bb110-A119d66b0__PREMISE__2", text="These effects can be very ugly, causing frustration among locals and possibly reduced property values and tourism." ), ArgsMeSentence( id="S148bb110-A119d66b0__CONC__1", text="Environmental impact of barages is ugly." 
import re
import unittest
import ir_datasets

_logger = ir_datasets.log.easy()


class DatasetIntegrationTest(unittest.TestCase):
    """Shared helpers for dataset integration tests.

    Subclasses call the ``_test_*`` methods with a dataset identifier (or an
    already-loaded dataset object), an optional expected record count, and a
    mapping of record index -> expected namedtuple. Expected field values may
    be compiled regex patterns, in which case they are matched with
    ``assertRegex`` instead of compared for equality (useful for long
    documents). The ``_build_test_*`` methods iterate a dataset and log a
    ready-to-paste ``self._test_*(...)`` invocation, replacing overly long
    string fields with regex patterns.
    """

    def _dataset(self, dataset_name):
        # Accept either a dataset id (str) or an already-loaded dataset object.
        if isinstance(dataset_name, str):
            return ir_datasets.load(dataset_name)
        return dataset_name

    def _test_docs(self, dataset_name, count=None, items=None, test_docstore=True, test_iter_split=True):
        """Check docs_iter() contents, plus (optionally) fancy-slicing and docstore lookups.

        Fix: previously ``dict(items)`` ran before the ``items or {}`` guard,
        so calling with ``items=None`` raised TypeError despite the default.
        """
        orig_items = dict(items or {})
        with self.subTest('docs', dataset=dataset_name):
            dataset = self._dataset(dataset_name)
            expected_count = count
            items = dict(items or {})  # copy: entries are deleted as they are matched
            count = 0
            for i, doc in enumerate(_logger.pbar(dataset.docs_iter(), f'{dataset_name} docs', unit='doc')):
                count += 1
                if i in items:
                    self._assert_namedtuple(doc, items[i])
                    del items[i]
                    if expected_count is None and len(items) == 0:
                        break # no point in going further
            if expected_count is not None:
                self.assertEqual(expected_count, count)
            self.assertEqual({}, items)
        if test_iter_split:
            with self.subTest('docs_iter split', dataset=dataset_name):
                it = dataset.docs_iter()
                with _logger.duration('doc lookups by index'):
                    for idx, doc in orig_items.items():
                        # both slice-then-next and direct indexing must agree
                        self._assert_namedtuple(next(it[idx:idx+1]), doc)
                        self._assert_namedtuple(it[idx], doc)
        if test_docstore:
            with self.subTest('docs_store', dataset=dataset_name):
                doc_store = dataset.docs_store()
                with _logger.duration('doc lookups by doc_id'):
                    for doc in orig_items.values():
                        ret_doc = doc_store.get(doc.doc_id)
                        self._assert_namedtuple(doc, ret_doc)

    def _test_iter(self, iter_attr, kind, unit, dataset_name, count=None, items=None):
        """Shared template behind _test_queries/_test_qrels/_test_qlogs/_test_docpairs/_test_scoreddocs.

        iter_attr: name of the dataset iterator method (e.g. 'queries_iter').
        kind: plural label used for the subtest and progress bar.
        unit: singular unit label for the progress bar.
        """
        with self.subTest(kind, dataset=dataset_name):
            dataset = self._dataset(dataset_name)
            expected_count = count
            items = dict(items or {})  # copy: entries are deleted as they are matched
            count = 0
            records = getattr(dataset, iter_attr)()
            for i, record in enumerate(_logger.pbar(records, f'{dataset_name} {kind}', unit=unit)):
                count += 1
                if i in items:
                    self._assert_namedtuple(record, items[i])
                    del items[i]
                    if expected_count is None and len(items) == 0:
                        break # no point in going further
            if expected_count is not None:
                self.assertEqual(expected_count, count)
            self.assertEqual(0, len(items))

    def _test_queries(self, dataset_name, count=None, items=None):
        self._test_iter('queries_iter', 'queries', 'query', dataset_name, count, items)

    def _test_qrels(self, dataset_name, count=None, items=None):
        self._test_iter('qrels_iter', 'qrels', 'qrel', dataset_name, count, items)

    def _test_qlogs(self, dataset_name, count=None, items=None):
        self._test_iter('qlogs_iter', 'qlogs', 'qlog', dataset_name, count, items)

    def _test_docpairs(self, dataset_name, count=None, items=None):
        self._test_iter('docpairs_iter', 'docpairs', 'docpair', dataset_name, count, items)

    def _test_scoreddocs(self, dataset_name, count=None, items=None):
        self._test_iter('scoreddocs_iter', 'scoreddocs', 'scoreddoc', dataset_name, count, items)

    def _build_test_docs(self, dataset_name, include_count=True, include_idxs=(0, 9)):
        """Iterate docs and log a ready-to-paste _test_docs(...) invocation.

        When include_count is False, iteration stops early (after index 1000
        or the last requested index, whichever is larger) and the count kwarg
        is omitted from the generated call.
        """
        items = {}
        count = 0
        dataset = self._dataset(dataset_name)
        for i, doc in enumerate(_logger.pbar(dataset.docs_iter(), f'{dataset_name} docs', unit='doc')):
            count += 1
            if i in include_idxs:
                items[i] = doc
            if not include_count and ((include_idxs[-1] < 1000 and i == 1000) or (include_idxs[-1] >= 1000 and i == include_idxs[-1])):
                break
        items[count-1] = doc  # always include the last record seen
        items = {k: self._replace_regex_namedtuple(v) for k, v in items.items()}
        count = f', count={count}' if include_count else ''
        _logger.info(f'''
self._test_docs({repr(dataset_name)}{count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_queries(self, dataset_name):
        """Iterate queries and log a ready-to-paste _test_queries(...) invocation."""
        items = {}
        count = 0
        dataset = self._dataset(dataset_name)
        for i, query in enumerate(_logger.pbar(dataset.queries_iter(), f'{dataset_name} queries', unit='query')):
            count += 1
            if i in (0, 9):
                items[i] = query
        items[count-1] = query
        _logger.info(f'''
self._test_queries({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_qrels(self, dataset_name):
        """Iterate qrels and log a ready-to-paste _test_qrels(...) invocation."""
        items = {}
        count = 0
        dataset = self._dataset(dataset_name)
        for i, qrel in enumerate(_logger.pbar(dataset.qrels_iter(), f'{dataset_name} qrels', unit='qrel')):
            count += 1
            if i in (0, 9):
                items[i] = qrel
        items[count-1] = qrel
        _logger.info(f'''
self._test_qrels({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_scoreddocs(self, dataset_name):
        """Iterate scoreddocs and log a ready-to-paste _test_scoreddocs(...) invocation."""
        items = {}
        count = 0
        dataset = self._dataset(dataset_name)
        for i, scoreddoc in enumerate(_logger.pbar(dataset.scoreddocs_iter(), f'{dataset_name} scoreddocs', unit='scoreddoc')):
            count += 1
            if i in (0, 9):
                items[i] = scoreddoc
        items[count-1] = scoreddoc
        _logger.info(f'''
self._test_scoreddocs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_docpairs(self, dataset_name):
        """Iterate docpairs and log a ready-to-paste _test_docpairs(...) invocation."""
        items = {}
        count = 0
        # now also accepts already-loaded dataset objects, like the other _build_test_* helpers
        dataset = self._dataset(dataset_name)
        for i, docpair in enumerate(_logger.pbar(dataset.docpairs_iter(), f'{dataset_name} docpairs', unit='docpair')):
            count += 1
            if i in (0, 9):
                items[i] = docpair
        items[count-1] = docpair
        _logger.info(f'''
self._test_docpairs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_qlogs(self, dataset_name):
        """Iterate qlogs and log a ready-to-paste _test_qlogs(...) invocation."""
        items = {}
        count = 0
        # now also accepts already-loaded dataset objects, like the other _build_test_* helpers
        dataset = self._dataset(dataset_name)
        for i, qlog in enumerate(_logger.pbar(dataset.qlogs_iter(), f'{dataset_name} qlogs', unit='qlogs')):
            count += 1
            if i in (0, 9):
                items[i] = qlog
        items[count-1] = qlog
        _logger.info(f'''
self._test_qlogs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
''')

    def _assert_namedtuple(self, a, b):
        """Deep-compare two (named)tuples/lists, matching compiled regexes against strings."""
        # needed because python <= 3.6 doesn't expose re.Pattern class
        Pattern = re.Pattern if hasattr(re, 'Pattern') else type(re.compile(''))
        self.assertEqual(type(a).__name__, type(b).__name__)
        if hasattr(type(a), '_fields') or hasattr(type(b), '_fields'):
            self.assertEqual(type(a)._fields, type(b)._fields)
        # Fix: zip() silently truncates on mismatched lengths, which let extra
        # trailing elements (e.g. additional premises) go unchecked.
        self.assertEqual(len(a), len(b))
        for v_a, v_b in zip(a, b):
            # support compiled regex for matching (e.g., for long documents)
            if isinstance(v_b, Pattern):
                self.assertRegex(v_a, v_b)
            elif isinstance(v_a, Pattern):
                self.assertRegex(v_b, v_a)
            elif isinstance(v_a, tuple) and isinstance(v_b, tuple):
                self._assert_namedtuple(v_a, v_b)
            elif isinstance(v_a, list) and isinstance(v_b, list):
                self._assert_namedtuple(v_a, v_b)
            else:
                self.assertEqual(v_a, v_b)

    def _replace_regex_namedtuple(self, tup, maxlen=200):
        """Return a copy of tup where long str/bytes fields become anchored regexes.

        A value longer than maxlen is replaced by a DOTALL pattern that pins
        the first and last maxlen//2 characters and the exact elided length;
        nested tuples/lists of tuples are processed recursively.
        """
        result = []
        for value in tup:
            if isinstance(value, str) and len(value) > maxlen:
                count = len(value) - maxlen
                pattern = '^' + re.escape(value[:maxlen//2]) + (r'.{%i}' % count) + re.escape(value[-(maxlen//2):]) + '$'
                result.append(re.compile(pattern, re.DOTALL))
            elif isinstance(value, bytes) and len(value) > maxlen:
                count = len(value) - maxlen
                pattern = b'^' + re.escape(value[:maxlen//2]) + (b'.{%i}' % count) + re.escape(value[-(maxlen//2):]) + b'$'
                result.append(re.compile(pattern, re.DOTALL))
            elif isinstance(value, tuple) and len(value) > 0 and isinstance(value[0], tuple):
                result.append(tuple(self._replace_regex_namedtuple(t) for t in value))
            elif isinstance(value, list) and len(value) > 0 and isinstance(value[0], tuple):
                result.append(list(self._replace_regex_namedtuple(t) for t in value))
            else:
                result.append(value)
        return type(tup)(*result)

    def _repr_namedtuples(self, items):
        """Render an index->namedtuple dict as python source (for _build_test_* logging)."""
        result = '{\n'
        for key, value in items.items():
            result += f'    {repr(key)}: {self._repr_namedtuple(value)},\n'
        result += '}'
        return result

    def _repr_namedtuple(self, value):
        """Render a single namedtuple as python source, preserving regex fields as re.compile(...)."""
        result = f'{type(value).__name__}('
        for item in value:
            if isinstance(item, re.Pattern):
                if isinstance(item.pattern, str):
                    pattern = item.pattern.replace('\\ ', ' ').replace('\\\n', '\n') # don't want these escaped
                else:
                    pattern = item.pattern.replace(b'\\ ', b' ').replace(b'\\\n', b'\n') # don't want these escaped
                result += f're.compile({repr(pattern)}, flags={item.flags}), '
            elif isinstance(item, list) and len(item) > 0 and isinstance(item[0], tuple) and hasattr(item[0], '_fields'):
                result += '[' + ', '.join(self._repr_namedtuple(i) for i in item) + '], '
            elif isinstance(item, tuple) and len(item) > 0 and isinstance(item[0], tuple) and hasattr(item[0], '_fields'):
                result += '(' + ', '.join(self._repr_namedtuple(i) for i in item) + ',), '
            else:
                result += f'{repr(item)}, '
        result = result[:-2] + ')'
        return result
Yet since the invention of farming there is no l.{283} over to farming we have get our food from the most efficient sources, which means being vegetarian\\.$', flags=48), 'animals environment general health health general weight philosophy ethics'), 8673: BeirTitleDoc('validation-society-fyhwscdcj-con02b', re.compile('^Many of the organisations that run child sponsorship schemes are dedicated to improving all of these.{594} encourage the sponsoring of children to build for a better future alongside other charity projects\\.$', flags=48), ''), }) self._test_docs('beir/climate-fever', count=5416593, items={ 0: BeirTitleDoc('1928_in_association_football', 'The following are the football ( soccer ) events of the year 1928 throughout the world .', '1928 in association football'), 9: BeirTitleDoc('1998_All-Ireland_Senior_Hurling_Championship', re.compile('^The All\\-Ireland Senior Hurling Championship of 1998 \\( known for sponsorship reasons as the Guinness .{91} \\. Offaly won the championship , beating Kilkenny 2\\-16 to 1\\-13 in the final at Croke Park , Dublin \\.$', flags=48), '1998 All-Ireland Senior Hurling Championship'), 5416592: BeirTitleDoc('NW_Rota-1', re.compile('^NW Rota\\-1 is a seamount in the Mariana Islands, northwest of Rota, which was discovered through its .{1135}many animals, although the unstable environment from the frequent eruptions limits animal diversity\\.$', flags=48), 'NW Rota-1'), }) self._test_docs('beir/dbpedia-entity', count=4635922, items={ 0: BeirTitleUrlDoc('<dbpedia:Animalia_(book)>', re.compile("^Animalia is an illustrated children's book by Graeme Base\\. 
It was originally published in 1986, foll.{136}al numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket\\.$", flags=48), 'Animalia (book)', '<http://dbpedia.org/resource/Animalia_(book)>'), 9: BeirTitleUrlDoc('<dbpedia:Alkane>', re.compile('^In organic chemistry, an alkane, or paraffin \\(a historical name that also has other meanings\\), is a .{191}cal formula CnH2n\\+2\\. For example, Methane is CH4, in which n=1 \\(n being the number of Carbon atoms\\)\\.$', flags=48), 'Alkane', '<http://dbpedia.org/resource/Alkane>'), 4635921: BeirTitleUrlDoc('<dbpedia:Frankfurt>', re.compile('^Frankfurt am Main \\(German pronunciation: \\[ˈfʁaŋkfʊɐ̯t am ˈmaɪ̯n\\] \\) is the largest city in the German.{400}t of the European Union in 2013, the geographic centre of the EU is about 40 km \\(25 mi\\) to the east\\.$', flags=48), 'Frankfurt', '<http://dbpedia.org/resource/Frankfurt>'), }) self._test_docs('beir/fever', count=5416568, items={ 0: BeirTitleDoc('1928_in_association_football', 'The following are the football ( soccer ) events of the year 1928 throughout the world .', '1928 in association football'), 9: BeirTitleDoc('1998_All-Ireland_Senior_Hurling_Championship', re.compile('^The All\\-Ireland Senior Hurling Championship of 1998 \\( known for sponsorship reasons as the Guinness .{91} \\. 
Offaly won the championship , beating Kilkenny 2\\-16 to 1\\-13 in the final at Croke Park , Dublin \\.$', flags=48), '1998 All-Ireland Senior Hurling Championship'), 5416567: BeirTitleDoc('Raúl_Castro', re.compile('^Raúl Modesto Castro Ruz \\(; American Spanish: \\[raˈul moˈðesto ˈkastɾo ˈrus\\]; born 3 June 1931\\) is a C.{1534}ighth Congress of the Communist Party of Cuba, which is scheduled to take place 16 to 19 April 2021\\.$', flags=48), 'Raúl Castro'), }) self._test_docs('beir/fiqa', count=57638, items={ 0: GenericDoc('3', re.compile("^I'm not saying I don't like the idea of on\\-the\\-job training too, but you can't expect the company to.{260}g out with thousands in student debt and then complaining that they aren't qualified to do anything\\.$", flags=48)), 9: GenericDoc('138', re.compile('^So you asked him in 2010 how he was gong to compete with DVD rental distributors like Netflix \\(which.{103}y were going to continue to compete as a DVD rental distributor just like the mentioned competitors\\?$', flags=48)), 57637: GenericDoc('599987', re.compile("^Giving the government more control over the distribution of goods and services, even more than it ha.{165}ply aren't competitive\\. https://www\\.thelocal\\.dk/20170829/denmarks\\-government\\-announces\\-new\\-tax\\-plan$", flags=48)), }) self._test_docs('beir/hotpotqa', count=5233329, items={ 0: BeirTitleUrlDoc('12', re.compile('^Anarchism is a political philosophy that advocates self\\-governed societies based on voluntary instit.{149}ierarchical free associations\\. Anarchism holds the state to be undesirable, unnecessary and harmful\\.$', flags=48), 'Anarchism', 'https://en.wikipedia.org/wiki?curid=12'), 9: BeirTitleUrlDoc('316', re.compile('^The Academy Award for Best Production Design recognizes achievement for art direction in film\\. The c.{280} the award is shared with the set decorator\\(s\\)\\. 
It is awarded to the best interior design in a film\\.$', flags=48), 'Academy Award for Best Production Design', 'https://en.wikipedia.org/wiki?curid=316'), 5233328: BeirTitleUrlDoc('55408517', "Wilfrid Tatham (12 December 1898 – 26 July 1978) was a British hurdler. He competed in the men's 400 metres hurdles at the 1924 Summer Olympics.", 'Wilfrid Tatham', 'https://en.wikipedia.org/wiki?curid=55408517'), }) # NOTE: Beir doesn't handle the encoding properly, so it differs from msmarco-passage. However, we do not correct here so that these benchmarks are identical with the Beir suite self._test_docs('beir/msmarco', count=8841823, items={ 0: GenericDoc('0', re.compile('^The presence of communication amid scientific minds was equally important to the success of the Manh.{125}nd engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated\\.$', flags=48)), 9: GenericDoc('9', re.compile("^One of the main reasons Hanford was selected as a site for the Manhattan Project's B Reactor was its.{13} the Columbia River, the largest river flowing into the Pacific Ocean from the North American coast\\.$", flags=48)), 99: GenericDoc('99', re.compile("^\\(1841 \\- 1904\\) Contrary to legend, AntonÃ\xadn DvoÅ\x99ák \\(September 8, 1841 \\- May 1, 1904\\) was not born i.{120} in the way of his son's pursuit of a musical career, he and his wife positively encouraged the boy\\.$", flags=48)), # Antonín Dvořák 243: GenericDoc('243', re.compile('^John Maynard Keynes, 1st Baron Keynes, CB, FBA \\(/Ë\x88keɪnz/ KAYNZ; 5 June 1883 â\x80\x93 21 April 1946\\), wa.{46}y changed the theory and practice of modern macroeconomics and the economic policies of governments\\.$', flags=48)), # /ˈkeɪnz/ 1004772: GenericDoc('1004772', re.compile('^Jordan B Peterson added, Jason Belich ð\x9f\x87ºð\x9f\x87¸ @JasonBelich\\. 
Replying to @JasonBelich @jordanbpeters.{24}for anybody with the authority to deploy code to slip a bit of code to enforce a grey list of sorts\\.$', flags=48)), # 🇺🇸 1032614: GenericDoc('1032614', re.compile('^The CLP Group \\(Chinese: ä¸\xadé\x9b»é\x9b\x86å\x9c\x98\\) and its holding company, CLP Holdings Ltd \\(SEHK: 0002\\) \\(Chines.{290}any Syndicate, its core business remains the generation, transmission, and retailing of electricity\\.$', flags=48)), # 中電集團 1038932: GenericDoc('1038932', re.compile('^Insulin\\-naïve with type 1 diabetes: Initially â\x85\x93â\x80\x93½ of total daily insulin dose\\. Give remainder .{115}tially 0\\.2 Units/kg once daily\\. May need to adjust dose of other co\\-administered antidiabetic drugs\\.$', flags=48)), # naïve ⅓–½ 8841822: GenericDoc('8841822', re.compile('^View full size image\\. Behind the scenes of the dazzling light shows that spectators ooh and ahh at o.{266}h special chemicals, mainly metal salts and metal oxides, which react to produce an array of colors\\.$', flags=48)), }) self._test_docs('beir/nfcorpus', count=3633, items={ 0: BeirTitleUrlDoc('MED-10', re.compile('^Recent studies have suggested that statins, an established drug group in the prevention of cardiovas.{1524}evaluated further in a clinical trial testing statins’ effect on survival in breast cancer patients\\.$', flags=48), 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland', 'http://www.ncbi.nlm.nih.gov/pubmed/25329299'), 9: BeirTitleUrlDoc('MED-335', re.compile('^OBJECTIVE: Meat and milk products are important sources of dietary phosphorus \\(P\\) and protein\\. The u.{1495}s\\. Copyright © 2012 National Kidney Foundation, Inc\\. Published by Elsevier Inc\\. 
All rights reserved\\.$', flags=48), 'Differences among total and in vitro digestible phosphorus content of meat and milk products.', 'http://www.ncbi.nlm.nih.gov/pubmed/21978846'), 3632: BeirTitleUrlDoc('MED-961', re.compile('^BACKGROUND: Current unitage for the calciferols suggests that equimolar quantities of vitamins D\\(2\\) .{1382}cy and lower cost, D3 should be the preferred treatment option when correcting vitamin D deficiency\\.$', flags=48), 'Vitamin D(3) is more potent than vitamin D(2) in humans.', 'http://www.ncbi.nlm.nih.gov/pubmed/21177785'), }) self._test_docs('beir/nq', count=2681468, items={ 0: BeirTitleDoc('doc0', re.compile('^In accounting, minority interest \\(or non\\-controlling interest\\) is the portion of a subsidiary corpor.{151}of outstanding shares, or the corporation would generally cease to be a subsidiary of the parent\\.\\[1\\]$', flags=48), 'Minority interest'), 9: BeirTitleDoc('doc9', re.compile("^Hermann is rushed to Chicago Med after being stabbed at Molly's\\. After losing a lot a blood, it is d.{172}lli grows more concerned about Chili's erratic behavior\\. 
Mouch considers finally proposing to Platt\\.$", flags=48), 'Chicago Fire (season 4)'), 2681467: BeirTitleDoc('doc2681467', 'Rookies in italics', '1990 New England Patriots season'), }) self._test_docs('beir/quora', count=522931, items={ 0: GenericDoc('1', 'What is the step by step guide to invest in share market in india?'), 9: GenericDoc('10', 'Which fish would survive in salt water?'), 522930: GenericDoc('537933', 'What is it like to have sex with your cousin?'), }) self._test_docs('beir/scidocs', count=25657, items={ 0: BeirSciDoc('632589828c8b9fca2c3a59e97451fde8fa7d188d', re.compile('^An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks usin.{1388}pared to both GA and PSO in these recurrent networks design problems, demonstrating its superiority\\.$', flags=48), 'A hybrid of genetic algorithm and particle swarm optimization for recurrent network design', ['1725986'], 2004, ['93e1026dd5244e45f6f9ec9e35e9de327b48e4b0', '870cb11115c8679c7e34f4f2ed5f469badedee37', '7ee0b2517cbda449d73bacf83c9bb2c96e816da7', '97ca96b2a60b097bc8e331e526a62c6ce3bb001c', 'f7d4fcd561eda6ce19df70e02b506e3201aa4aa7', '772f83c311649ad3ca2baf1c7c4de4610315a077', '0719495764d98886d2436c5f5a6f992104887160', 'a1aa248db86001ea5b68fcf22fa4dc01016442f8', 'a1877adad3b8ca7ca1d4d2344578235754b365b8', '8aedb834e973a3b69d9dae951cb47227f9296503', '1e5048d87fd4c34f121433e1183d3715217f4ab4', 'b1c411363aded4f1098572f8d15941337310ca15', '05bd67f3c33d711f5e8e1f95b0b82bab45a34095', 'f59f50a53d81f418359205c814f098be5fa7655a', '8cc9fa42cb88f0307da562bb7a8104cb2ed4474c', 'c26229b43496b2fe0fa6a81da69928b378092d4d', 'fe49526fef68e26217022fc56e043b278aee8446', 'c471da1875ad3e038469880b5f8321fb15364502', 'a2f65aae36fee93adf4e32589816b386bd0121cf', '97d58db3c8d08ba6b28fcb7b87031222b077669a', '3bb96f380b213d3b597722bf6ce184ff01299e14', '2450a56cfa19bb75fdca9bb80326502cf999f503', 'aacb4c8cbb3ebeba8045169333d9915954bfc9e0', '21bf7734d99d9967a92f23ded5c97a8638defabb', 
'6c80c53474a48d3a9bfdab25c6771cdc32fc754e', '1e4aebb032a75b186f6bc80d3ec72ce415d2c509', '278a2bcb2bfdf735f33bcd3423f75160fa349816', '5a6cf5c1cf29b080ed49707961c760bf4f68031f', 'cd22b27e2f094ac899b3f4795db0fd59d90ec4ef', 'ee05187997dcb548b86ab25e25a19a2eaeae46f8', 'd9a2c54ec3aaaea66cef9a664b704a056498d958', '41a5d7d783e7776715543a80f1dea31c2a6a416d', 'c91d076423d20939df90447c17f7995ad48af5c2', '115ab3aa4915185549dcb488a432934bc6e9602a', 'd3c27966e7ff87ea64f8e7644964d5d210bb4bd0', '239eb7a57f4dbf67da36d0c0ab2bc9ed7b2da740', 'b738cd6aeb90fcc4acae1811adb7bb569b198f26', '6cbc15829a4c16189f1871b7fdb5ca850555ec5f', '9c244015d82b2911bcfa74ca68555db4660bda49', 'd2a2add50f11f8c5be0db504509e1acfad435817', '0f7606f0f386e860db2b6ef97f4c71f4ae205646', '4e5da9e9bc3695609fea69ce04f147c7096ade8d', '5c01686a41c31a6b7a9077edb323ed88cf158a98', 'bafa852ed1764321494cdbe4cad97d022cbf24de', '2c7434dc50df0b4adb11e52fb6c3c1dd816dee88', 'a2d7c5237e7e0f3ed63d04a10ecd33a2e289c0c6', '644bcc8870ca92db212ab96640c98b26cf4708b0', 'bc7355ebb81756a284aa3489edca2da2f67e8be2', 'e7508004e13b0f2d3d0c3b07f4f967b38561096e', '1143e42cd4fcd8f2564834138c99555cfbff20fc', '2838302385c5f2338212e81962485c7bfb52bb15', '302e4b35e9c55c367de957d99c53567cd4f9af40', 'a468e406d170f802dafba994dbe9950c244c7320', 'd600ce2e7676b6d5d97be126014faceca3650408', 'ca10ca753aff094b91f51785dbe7b387e1c50275', '5d05a1ec8ae2cf34ce2ffd9efa07c6e5d39136ca', '1ce9cb252c3c3c4083cdd8c51f24ee1eb3a7cb17', '8f75355add4f9520fc4ffcf525419c5a299814db', '23a400d9b5a70223bf15cf0438d3408d8923ef1e', '7cbb016c73eb05f5a2b3996af687a0a2681fed97', 'c36efedbc8c0aaafaa32e42d93dfe6352c1f99ab', 'b9190fa1b349435a0532c7cb6a29bb26a7f7c78c', '3e4e7eab7bf967c2fce4c4af41e212f9aa26af87', 'fdf8da6cad2e443280845663f2fd5211fa2d5316', '9fcfc1ea5a4171cc7d1e6de3931999242f8a0ecf', '4339003eae685c293426398e801ee7e79f5416e2', '7c55fe6aa32bab5f61cd16df4d8156a0aad47742', 'bbd0c02b60a5737f49b019606f1d8adfc8eb4706', 'bad5e3c5fc9da04968790f1aa4166aa570511454', 
'7e720d2daa6a72b02f04091b14ffa74b5f9d6755', 'b406feee35d476c4aa516bff4ecc5c4c6ff6c353', 'fdece23a929504045b87fac8ff2c490110d1d624', 'cd887424b25d9036cbfb817fa57a3c509297a82b', 'f0b9dc64f6df004d3f776031050317f0a7fb1bdc', 'd71e30919f03df92d76bd3bb8d113f1df6b03710', '93336ce843d4f83e8c22e02494880398c210908c', '3e4be8785a1ad34c60356385a5a7417f7f2d6699', 'bcb7894325606c9765810563e89ad4fc275ae010', 'da10f1c9b5a6253ffbb9c8e933993b773e49a188', '106b8c7a4053d77ccca319a7dd4b054f60cb4026', 'e233b5c4b8ea9e6daebee83b956bffbdca2d08c2', '59066ff5d305249658150618dbebe7ab21ae82ea', '3e769b6dd0b0fb3393dc894752b0afffd8d2d064', 'd127160840debb1a7edf38ac5cf02914fb2f8a59', 'b8485fe3b70588697cad5f46726ce18ca8afb77e', 'f0ebee9b612d517fc3831cd45abd503539e25085', '381323297bd2017330fb53c2d81b2802cd7caf88', 'd6e99b3ba3d2d8c00c50a154a3e5171f99ae2c85', '03576d44433ecb04e7d87b526fe8238a4ae6d15f', '09b48c8502fa63183b39cd725ecebb634f83c037', 'b2409618be98dca139d2b1a326c9f47e279bd600', '8d0d940113d281ae522a72662fef3d6c40f9d6cb', 'eeca94cc6cffe49537533ee37fd7ba5d18e70386', 'ef8ff6394a9e463acfd4ce2784c68e2f92a55e17', '14d05578d0b6b71c2f023b71e9fe71a6b44430da', 'ae8440cdf5ecbb8dfd00df723f92c34d21fcabb3', 'cf6a72d1eb2011d5a2b57d58c4ad9bd3751c3443', '4c3c3bf48d48b3ceee50a9b463b80ab9834aea68', '7720e805627d1f5d6c992fcdb0f2bbc31e133284', '2e440ff1540094c0608c200eacfd5573b424391e', '919955f6b198260e4d889a8f6ac55feac3f20ac7', '88f0d71a63552a7cee72f7e8e18588d0776f6d8f', 'b4f7477fb596e4933323f153fb1e287bef8b1335', 'b52c1f8077090c660ab4a22f47e7f8483b4bb7cf', '7486b9ceadd64e496470e75b315eac543aec7f2e', '5d068f5e3dd9813c3b108f6eb08dc436a134a218', 'f36bb6f93f3045d3bae924411ec7a07be0a49c6f', '3dbb1997727ad478b1e2a6c8c27386cc92fccf9d', 'eec9b96d796a8a4b00af1b3e2ced301dd4607312', '5ee347218ba58940df02c35fd7fcf1795ff477a7', 'd5a8509199984393f728f4268acfee97a8ba4ff6', 'c1220fd757136150ee5f55e83b12cdcb4302048b', '4c422a7012858a200af4967b11da8f9a457ecad8', '44e6a642e05f9c11f2df02133d9b57d2a4b30d50', 
'02a4293cb083dd54a0d685dd4fa25ce165850557', '6222bfa3e282948e250a8688027079380012b2cb', 'ee666f494724f4ef3949a5532d59b207bb42de6b', '14e21725a5e25978b25ec4e27dc190d2e2ec542e', 'e37b6d3840d037f5536404fe65c81373c2574d66', 'bdce2c4a25826d3242e04f993af16c10845fc78e', '92ec3f0e7d0cd30a68b901878b3d27a752a6705f', '85c45753caf576297cc43dd285ca4d78f230dde1', '1c8bc4fe215841be7290956ea96502a7e494e76e', 'b0574087d5b0b4d4ed4819bdd590a1b7c2024802', 'aca085cef4b6c042886cb608bba803036b704000', '4da844e414c5ddc177ecd4bcd73ed975ea8cfd23', '924c6135ca4faa043fb8e0edce850630863a0302', 'b72c6b29224871d4ced87289a225b1ed47cbc6fe', 'f42b9bb8e50573c66e9b9166102989dabc76fe7a', 'e4a2496ebd30c6f2b882be322eb2aba9de3cd15b', '294ca24667d7c88231492ab3bc9167a4ad958456', 'ec658577c3612814cf1a2a0f7d55457f42e788e2', '368eea6972bf8140324e5c22684cb60b52a7c35c', '14e9aa7473b18aa729315939cb5b1e427275cd00', '9316a26093d195273702effdd9502a077f1b4dcd', '5bc936d39907e99068ba1d07f3fa8883ffa0ca74', 'a1f4ea8b9f875567e71f71f00bb3b0168642c91d', '21229ea3dc420e19b3b3b57345dcc2a5d6bea98b', '04257c2d45c025d1bac07119af163dad7b1aebd1', '27d6cff7c45020b673790d94f92a58ae1880027b', '634cbffc7aac82a1a7843db2d9c1bc5a351f6849', '866696dd758d404291195c651e254e428b6a4be8', 'd24ecc520e2d4bd7980dc5bce547791688a71f72', 'd8869d381387be11924a04b4525f9a408ef37cdb', '77db5dbfb33c0919f60c49bb5ddb99861ffef474', '0b5da93fe58dfc004d3569998e00f979766af658', 'a90b2fba001ea50302b8ac1023a06ffcbfd8f7bc', 'c5ab88898f33f2a37e4a6ab7563f562264c47854', 'd5708326c5e682d933ae1539b424aee93d6b7188', 'b2d146286843822549fa46ef0e5263ff5b8ef436', 'd66c46ae198aec982bb9e98762b39c9bb11bf6ca', '1f7c71361c065d2a0586be3868260a81122648d3', 'd76e965a8a5fb9139e306ea5052896f5816358a9', '9bdac06d5b9a9804f3f5ba029aaf3a974ce831c7', '284f78d2b7fb96868e1ce7cd4ed02321c450ef68', 'fe7fdc4dbd9d45487ab6c69caeb6182a69ca2019', 'eb21adaf5017ddc593cf9a3f252adf31ee240645', '1aa9501d7ec7084f8500864b3ff808100c8045be', '250019746203612925abbe02d83589c3738d3982', 
'60127d1a428037c1835292ddaa3dbca95fd12ab7', '518158093bc0234a8a7c9657cc03b79483c21a76', '41251b17e995217b2417585e2a44cdc07789a0f4', '014951e19c98ce58a2607fc12df411bacb982d3e', 'aeb6458d44cd2fef802fa5a9f59f87d62e0c02c0', '1c7b5df940825c71c97efa97e309bedd89562635', '4fb91b679f5e987ca6c3a9948f0985a52e9014e4', 'faff869e53b2f379ab23afdc01292a300132edba', '3fa12b2e36163600f62bc8fcb4946ad734cbdc00', 'ee12be5cc21a34d28f5a98b68d0ccc5c416caa12', 'ba91b468f06275fb4b7882421efcd8070aaeec07', 'e8309b9b44361c03231a62bf59d2587185a7e81c', 'eb7f5ffb624b9e82a825b27e1a3c7d23e2527a35', '468a2bb7a12fdd1dcef61a3194070d7d9a644fdb', '6983da35d34d9456bc6184e36d92426c5a117e97', '969b3eb194dbff25b137f29cb9a015dc38b6a2ac', '321e868c86d2a3c86003ac7aebe374c1eab25b81', 'a301b95dccd5a8fc6362e475af50182bf6a1caa7', '9396fe8096d937bbd482cdafe29e5c4e1751fd06', 'a9ba36a7b0bc90a9b85574c1c597805f05771e6a', '7184e02e6ccfe08ca3ef7c3b16be2023b6be0e24', '25917acd4d1e96faae4398452eeca743dd64b2d8', 'd0fae819552bd425bfc3d780429bbb7b8d7a4d0b', '410316db0cdb9001e76daf3e2a27ccd3c6156042', 'ce49a2c822c9a27efc00cc7aae022e8d1aefa982', '78d2f29d9b5af247250e04ae1e686b0c6886b2b1', '94ea5678154d34b270133ade5265fe21a551e2cf', '38e58fce0b460951ee28a2162fcfa7d2847f4ce6', '7783feae9f5f2abd5bf1584b98a5707c519d4769', 'bb7ecf6bfe1776320f4e7d68c56435573aa5eef4', '3f7eb16d88d60db473f703fb8137972293b6eaee', '55f923c75fb5344d4df8b3fd12e16bcc49db7372', 'ece8d11971adecfb15c81ccf4e0c5b2b48d10649', '0d616bc6963f3241bb2c417b4a584c0dcaee7125', '396fc63019a3dd0e550f6eecb0bfd1c34601aa22', '592623484c2d4a482b4841100eb2aaaf6fc85ead', '15667d08f9a2c4bceb5d6e8d3a368dcbdde75bd5', 'c30bd16c400bee7a51d7ce9aad20560d01e28ba0', 'b5a5534e0d3104a634f61e47773801649bb277d5', 'cca7f73fea0ad9d96622509f5428ed8410421948', '17b3bc528440771b104d3df884dffa417f61000f', 'fcea8027dfc0ca7b8871084ad55c46f09615df22', '410ed30d7eb5fbf18764d15089d15c2f68896727', 'f4ff2c3a64c8e094d46bd4ed89e9b02897b9937f', 'dcf92a2d6e5d96678031f313b8a78b6d8e4fdd3e', 
'50c079e60bbd843d32e46cd1b9aa7f64daa5b8ac', 'aa9d4403443abe31b7c828ee6df0b21c155f3dfd', '9d7ea82ffc353f5be53e245981bdf6c0e8e93839', '6098353c87dea12e0a5881f66ccf738face30d7a', 'b91b62138e674d8e35edd564d785550775d2c745', '0e31b1eca8f03d8b0cf561fb0f76835f7ee7f91a', '6b1ee3d9df1356725cfc04a6316725415e925fab', '1b1de1c77f7aa95eaeafe3bd0e8fc681c13e5a49', 'cb2db48e636a4d871185d832074f68449f424a59', 'fee8292e18978a34260e4a500ccdef9f1994a536', 'f276c78c60e04ba9b0329be8446faea57366a236', '41a0d41f46f90c017b539acd752674970b54ae09', 'a6c0d67613e238e73ebfcb1f9b90d7e248a89a45', '91df33c9139df01b8b42b9650d8690b6bcb76bb8', '269ff07c14a212d4f3f7711fcb1ccf5ce1b9450f', '2aa083a360f5a1d8b598b63d098f8c1b19e428af', '4d6c669dbfd103083f96d2e8a893a31738ab235e', '79262d5d32c1c7de6839dee1d848121d92d96c37', '6b8d37fd28d1dbde8cab08e032eb3830e994f8de', 'bc4e5593972e8cf713535b0fdb6acb1f5cdf93b5', 'cc041ff04e40f09cec667065ac30b9ac3ce2f3a7', '1c312bff25f99daf788c9b8a6db902baeb3dd5f7', '5762d52d8230068883364b93dffbca73c809e49d', '8a19a2bcdcb16929da444ae71631e258fe0b4bfb', 'ea5a76e30fab975e2549bc798bfd2c9bada3e33d', '9a872f7533e423a59aacdb54fe6139fdbb4e7cde', '7d88721118e3fcad46bcc943105c2e9f478d5fa4', '05c4d2ebc8a5bdfb32714fdb1950616891074b18', '126df9f24e29feee6e49e135da102fbbd9154a48', '405dfaad697076f3ca61b50d48db45722bc3b503', '23aa5210aa633ec78e3ce1823cb9cacb18ff7124', '6fd38c3d2f0a455c1223a47280485b26c0fb9b65', '9155a25fa50c2df0fc4e155f8f1e3fd8679ae4bd', '29c18531bdba93cbef2c431de4047c182b642694', 'e01d12ea29720c96003f59ec74cd56b70571bc42', 'abea867b5328f52c0c6beb33f60cb4876ea14595', 'a333ac76e4b2041ac684913b23a8cd719fd46445', 'd6ebe01eb11e9211760a220466238c368f712474', 'e5f06f97eda1af0488eaf495db763f4044a52769', '95eecfd28ea0f3441d838a50c5bfbedd3550070a', 'a5c60b655425ec47bd1119b6ed33edc96071f10a', '04f1a046c66f87073660a839930f1cb6200886aa', 'b43b67fafeaf88eda5673504063188a02d4e1b45', 'ca2885d3eba2983b82345e9362adeecac63f3ae5', '8d677c93182b8163e8bf8d6004906c79c1c06b70', 
'957d4b3ad38c69ee0b231b385421fac362ca5d65', '3560b2618e9c64cfd587b87966d9f19ff659a148', '8aa041e9e43afdc538647a3f68305538cb321003', 'bb6694528752e33fc74a6ad2d28ef9c1a7b8d750', '1c21e5c66cceb27d56f5823934a0fafa0157b231', '8d3777856af3010c935730ff5b3f482f259f0e74', '653818a8f78fa6b45d7d3fd89af84daa96eca38b', 'eeaf75f79f237b450976b6305724c72400c84095', '7b72aa337f1e5019ba47d92c26917062605fb6d3', '5ac98eb478b8c7b1b20bf7dde0be6ecea38f82e3', '78495383450e02c5fe817e408726134b3084905d', 'ecc84e3ee9348fb182b282a848159a24423efdbe', 'dd4041c26c4d50a7966697819a2bb1ce0e4d1783', 'db38399a4851e72187322ae7f7f9faa3f2eb69d1', '4068609212f024a8cf3c5b7bc755d415f269ce74', 'ae2a4c6f2ad099cd8f5eb66f1935cc1777244bf2', 'ed073c437190660927387d93f41b6db3b2684311', '5bf7cc6483fd054d0ad7f3a37eda94db4cbb6e58', 'c0c00fd1934b61b39ecbbcde007eb919d7a59bd4', '0638b5dd509cd4f6d68d10557967e7b66a741852', 'b88931cf54a7fc51e1b09fa3fb99ecd6cdf41d6b', 'cb9f43961f4b7033d3730e258b14bd5f59f242aa', 'a99f04d7250b5b5c29e3cc28a6c81dd0eebeecd8', '64835ff8ae3412811c182771e30fd33841cc92a1', '15b991b966a9b446705f4484e3380702121e470e', '2b497978211c471e4a5586a2d99dfd087b533b4a', '9313677ca439b3c63a591786d2ec4b3a192aa32f', '7e5110d13ce5a393977c1b4aca7c2a7c06680392', 'b04645579b5be335ad0e107e248b3c1885b9168d', 'b0ead0e018797a68bde2a5cef44926dc5dd8a27c', '1a93897ed610235bbd42debeba79d7cd3d37a28d', '01924bb7ce5da3457c4a20006e2dc2b92af72434', '07bbf1e69528718397e5ebf9a2101b8d9b320743', '5f074195e88c9a6da54602ccb5d7a755668f055c', 'df207c92b2196461da35d5fa69f0968b339709e3', '9bd81c4e318cb217b29f7c381def34f1d7454ecf', '850793dbb35ce40ba591eaae09f59b06ea27f4b7', '8eb24b298856bbe5b13590f185152a3af198bd2e', '353bf82f2ffa870473a814155b4214f93af41bdd', 'b4ded7e3a94e831dd807f0078ca2bad8598c8578', 'd7e7a08814f2d6456690334c04c832a117d35b2b', '6b8f96342eaafafbb4c87e7e6913aea2f9a663f4', '729c81595b1a792d08c407fe5d57826020837a53', '97e1cf63054283e8df52425cacd22eca1eb53499', '392ce361c5b9d0d948baf2a4010676dcf592ee7b', 
'6c60baf027de044cd8236deff23cdce78b525361', 'a10c03e8084116c112a954da9ed8ae59c426e356', '842ee396e6997b5036fb4cf0bdea527bd37aefd0', '41de39e0cfa898bae4a977f45408ce19dab329de', '21f427b1b38c45e499480f11856b9fe30615eec2', 'f80d7a7f97258d7642491434785db78c195c6f84', '807c519bb57af7329db35dce846849c900b7097a', '87b39a2592f2b96f8366f7467beaa406434dc134', '198e0e490533df571b9e6606c5b8a6e54afe2065', 'e3699b84cce7e3e9bc26f6450dd261d29721f00c', '9ad6f491f498a1c503c5f80a8acdd03156af1429', 'f64a08b8937bb243219905083f6396f11e33654e', '7f0264c0e4abb2d6701ea5006bc33a8e5b1b569f', '3ec328171fc900fca1d324ce25f238718cf7893c', '2c13925dce07b0b829ef87a95beb1942fe1f0e1e', 'ad64a762ca7ff3fd65cb065c528740a37b5064c4', 'ec9d2d7868125331d6f34d0cdef53cf27b450820', '1b39e04c57ff07f5614f81c0c4797ee50803122f', 'a7065010db5543c02d53f602ba6931e9f0e69d8c', 'e0171a544d32782a4c3882cb3ff34dd057a7ff49', '57c587eedfd0f25f86fad3991ab2978398c890c8', 'b441ddf343911c1a0f8d6b3a437d79ce00c68f0f', '89c8aa20414672a4591b2d4df82d334137fff44b', '64e1a2b174496b919cbb3dcf50902a9af5f444ca', '53c86b26f02f56e92a38c560e4aa5b046878c763', '4c32870cc6450cb596cb74b01eae3bf7144d03e4', 'f878ae929922759f6a094d6c856a23e939858853', 'a73308d440625a67c80cd2d5c9090008c9be9f80', 'e890bfb449753bc794fab3cc98ecf8b66a2601f6', 'bd1ab7fe6df2a5dbd292f5a89e777972c1e500bc', 'ab2129caff74c53defd559ffcd1f12d793e8f174', '464023e4041a378b4826a1ce406575dc79b473b7', 'e8bf30c2c133c6e3ae6681909efa25d8edb8971e', '5ee43bc72c88f50fcfb6fdd710a06225846ce00b', 'fdb874a5c8e58f8abccf525a727a3413e1f1b759', '643c8c17ece46f4aec5db67199cc40bba3fa2a78', '0d286827a2bc9e7de533277d2c11255bac66496b', 'ea864047e3fd214878beced6a648408f8eaacefa', 'c0f8667b872b62155e66e57e36e4bb4044506747', '61ad1c50d8bb1a67d33e0020673f3347dc85935a', '6a32f50bb7f92445f5e5ace812ba146e9e2d7e63', '2a2bdad21be9bdc6af948462abe2a222f099097c', 'c93db39154a1eeb6085abe6119d1f9e4de4e5c8f', '69430980fe2880b9b06f72e0327ec15d669b1a54', '8e6d070195d4a5a048aa8e8d06cc2798afdc11ca', 
'0f1ca0e044c0998f0ea294ce5c429a574916d601', '2d9a5fc8c6affb69e1cb38a0181cd8b1b38e83b7', '9e5a8818e1386f42c59a8aeac81d32a12f011d93', '5c250d3e035ef770850d36599e29ebc8c69d7dc8', '4fdf040a82f1f15a066c9a8ea0943ec2a6358395', 'ea1a9eeab82cae242f8ae6a8b0f8a5fe8ba27845', 'fcb3a54d3a6b9b339eb6f8583f84cc10efae8986', 'c88bebb40a245d1f5555bdfb80f1b902be62f936', 'f4aebce1b34d9de6a2a403641977c7870d6f3918', 'fa70d436d25ab51992efac193b29fd398a90f5f0', '8a14cd4d89d3b9431bde792d9d826d7c2c407383', 'b4066a397e303161363ce89862132fdeae0199a2', '8bdeedacab0b3ecc5968c76f219e501ca6659176', 'af4b5684fbca9ed7b50c6df3f72999b036ea8e79', '6e7b61999ed8275efd2d4b34e4f53e783e8a9164', '05f7f75c3cab14e941bd27b585c86d3998f412df', '4cb570c770dcdf88b48d235ef280cc8e2c9beda3', '1561f98b6683778e36b0df2f7a7f079069bb2a3b', 'de16ca9859ab10c36e8aea8eab09f5143f68b39d', '8e57c4b3669133e09961df068b7ddd357a2059eb', '34fe25f49a3c7e0723ba4eea466884f7ef33104d', 'd7ea23352a88c086c148cee79890a63470d16b8d', '7a05002e8edb4c28e40c57e987c792f97bef71aa', '35e90d988889a16545583b267a1f931882127e99', '3678067dc8d6ec3e787a18c8e99762cbb4c232fa', '787085f7b73000b94b9dbd59be04fc0b6596f720', 'ac1464a6d98cf0ac9a951eadad9dd065301e99d8', '0df69845757c31a9329dc74cde93e0c2138a6896', 'cc2ecd92bab62b16732b9d20636f6034fa439a91', '49838912325b6d046b6cc14d27b062c5f7bc1449', 'c7037b1179e4d38336b53b5727845f56dedc9a99', '15c174e00313143f52f667a4c0ae3b4a31ba07f7', '8ef4ea7a639b7f7e1ae9ac16177900dc6ca76000', '091bed11c0694e311fa5676a34b03079d62e5472', '81d86206f59ff83c5a46c584c6b461a95fe3dd32', '9c1d843c92c3efb30f0a151657b0a0ad92f2ea62', 'b5ccda12348d74005d9e928a42a7671a337b8ff7', '0b588ebe591e163d8b1e1df7982769e08ae1eb0b', '21d41af61952500538e907673613b1d3b765acdf', '0f79e6324b3ee895cb672a68c7020b8a7fde1b17', '4ed3802e8151f582807363e61422f3beaf17274d', '043f85b09645e788f5b069c7542c348dcdfe173a', '006ae86a02d8cdc2b471621ec1761583577ac484', '5d0885053508c58638af879e0de3376d3b3ac87c', 'd1176e115e3e49775f15874142e41885b1a74ce5', 
'7f2a764a9f5da7b2e5dbc9ecdf7cb85cecd9bc3f', '3b93daa4adee016d5fd0439e3d52380097c5a099', '8b6b4d9766c062d0e2ec8fde014bf90e98538049', '9fe20cfafebeb2d0176700fe1c9b64ccc583853d', '15f716405fa7fc11d3b59f19c98f2ec28734f8a7', 'dec613220a62e998b20037a447c1782f842def1a', 'e88087cc3fde1c50a9255a24a7822103d5eae9fa', '7b9f983e9c835a325b64bd15449d284f38d9a5ee', '753d9142646e875cf9ab1d1de18884f936e6daa6', '2d6d6226fa47bca22bf875335d45ff4347d9e67e', '77c2a4233cb6499d3739ffc530635f43960b41af', '899a5b6629a067ffab4d9aa570b0c13294c42983', '888c1a0c5977d413ee58e0eb84042f6d5080e938', '6a6c706607ff24a4868d5f16497fe0ae3d8e6859', '6fd2ad1aae22d635b382ea4acbeef67fae3af5d2', '975aa6ce3ad36ea2efa34fbca953c87f81657374', '574e7de856c34f980ac80336c913ae968b3c4687', '6f003acbc28d7b323862888d92cf12db16d4c873', 'c05874dae758a7456ed10679f538a00616e94302', 'cec1faaf96cd970a15dfbbee015df73cc6770a0f', '86d2f41be9a37be8e2b36ea57a2d41e82c61837c', '97a3cf067eb6c7849ecd0b47e215097a485170d1', '1b39e43ca0438d348a333c13c73c59943fbb9c28', 'e05fd4a3d18c58d6c84cfc82810a6244328b7689', '55924efe87fd5ca2642c14b5c9e6ffdaccfe9143', '6b3c55ba65d3ceb8f9db1472655ec0ce5a121aa1', '43720fe1b3660cc2071c2f72a05516f7b287c966', 'c33e1b6d5a6fadebf50c9df1ff901faf366413ab', 'a4134433d5d1fe9f83b44c04c59636d1a85120bd', '4268fdd19ed1d233dfe45ded139dedfa06a193c1', '9e164ab067d610a244224d74fa0b21aaf217d54e', '610c1c4ddfb745f44885841ed293c0d64d225f58', '6e0f1085945fe359b46b8a6b583f8bd3ab48d953', 'f300d56a3dc444c8f2ab7a4d0df71f10dfe0467d', 'e4a07483c81e90b2278ef29bd7bdc3ccde6d1ce0', 'a7121630c53b6cd44bcc975c293e9bca065498db', '53e4b45e7e652f55276b47f34e66f29f4e33df0c', 'b011484ac12278ce2fc41990b190d623b328306b', 'dc7df1803ab4bb87dc92679210d7b69ce0e237e9', 'd33a47e79344733264e95a554fa8d5ced7c1061b', '4bcd7aec3bf2493b57363f59e890c0a49a6860a6', '8b3956496fb31ca373991b136e476f883b8a0937', '9837eac799c279e35ff02f80e1651a89b4962e07', '06eab37d7f50713ca38a89744ad35c4e3b596cc4', 'fa4f5d701711149913af347be12de46fd0a192ab', 
'd6b587f28be7e334b29881970ebc53d22d4801d9', '4bcb6a68c40d11d97fd33bafe8928c7e8bc784e2', '21e58c2114c2e33d7792881f95dd73ed4532e916'], ['57fdc130c1b1c3dd1fd11845fe86c60e2d3b7193', '51317b6082322a96b4570818b7a5ec8b2e330f2f', '2a047d8c4c2a4825e0f0305294e7da14f8de6fd3', 'e105fa310790f91644d2d9f978582652d2d4de55', '0e5fea0a13594cfc6ab9c8cdf3095ed16b728e70', '2883f7edbe5d4a80bb694a4ee36abce29cab5706', 'f191a434b8ea61ddb6b20cfe99e65dfa710ab5e4', 'a24109c954c160dfd52ea6ff107b9fe6f75da0fe', '506172b0e0dd4269bdcfe96dda9ea9d8602bbfb6', '9cf5415eaec3c9cb70ea2dac92eab7652f829fc0']), 9: BeirSciDoc('305c45fb798afdad9e6d34505b4195fa37c2ee4f', re.compile("^Iron, the most ubiquitous of the transition metals and the fourth most plentiful element in the Eart.{813}rk has begun to take advantage of iron's potential, and work in this field appears to be blossoming\\.$", flags=48), 'Synthesis, properties, and applications of iron nanoparticles.', ['5701357'], 2005, ['82b17ab50e8d80c81f28c22e43631fa7ec6cbef2', '649ad261855d2854f25093bf3efa541b2a249af7', '76eeff302dcb0fbef8986a6f317a70bdc1b263be', 'c3fcff9920c799eaf96378b289f79c26bf61a049', '4422e9e1655c09b558b3898f4b30caacd9bd3429', 'c93df6a2f47c04e3b45fcffcf5dbaa4032e65399', 'bc01ee13cd600fd91cd61a2367fee18083d38d2b', '52479e45194554361fb9d98c9ee33e23d252fefd', 'f6c961488bd11b9a4fc210b047f170a55a9eafa7', '1230a8fa38065b5310385d4d654bc425ccbbb6aa', 'e350e0d36f2f378acc8e81e773bd44e9ed22e966', '21cf8ba5f7965aff50269fc0115ad86e90aeccf3', '70fe357999f3860ea6ea10cd59d23148d7c62af3', '81316082c6c3dadc10ba97c1967979c02779ae4c', '0eddbed524231fd153e45a8c395601d754863468', '81c144d26df3d344b79ed36d49fca3aa20c9552c', '3f47268db51e4d283df6f3ca130e9913fda7c580', '849878e5d1140023cfdda8644052b0b2ef6d2aa3', '0b5c42f099d973605a9a6467baf9031fc9ca9a7e', '700bb2eabbe05de449ef18c74eae338aadfde954', 'ed10bf3bd7b9b006a916fa34d910f20fdf83d631', '9b83f2c29a1523fb765aa6317a3221892c3f173c', 'aad2912ff93b99a2f6abbbfe4627a82326a33565', 
'd2e5db5f68170f768dc27407d02dbf56be7240a6', '56ba06f04c8e80920ee46b3a1c37de6918e0da47', 'fd072638033a25f2d635e12e17d4856b9611a1cf', '3f5a0e1110855a442e0fe31bd15c3006d90ba683', '0d2a00690d137883805ea160305ebfbdb9a0e9ef', '0b4a03a77b9c066421b94f18d06ed5051586fe5a', '074e2837dba47e756f270061fb723f06868d550d', '219872c6070a2f5400e95930a5e659305ddee09e', 'ece0c28b3b60672a2b2d04a529fc43a2bffa80e8', 'dccedf91d83f215186acbaf0fee8cce96630e69c', '016a58d257f1592fc4bf4f409d46e34448af9328', '4bafd7734b3f54b8d55b9e0755bef768cd96d1c3', '324d172296533539fbed787a3c255f883f0df455', '76e522626f8522e89e4e4334545ffc45d28b1c1b', '62226d54b69b7594441707e2af6f15d7c284b82b', '3692ac11ecf46a584e47d3784cdc8589628ffba0', '7982e3b444f78bc83e69e3d805e817fce0836d9b', '3fa55a993ccc2ad96d30416bffd346ba87fb47c0', 'ae9ecf8f5a62cd6145305b987ec796aeb0277d32', 'b9d80a195094d78080414f55988532d53a15d2b6', 'aa3d041859dd502b575638c7f118b722f066dccb', '653277b69172858b6972854291e0e716d8f487e1', '97e1b60b4391cfa956ac682bd9cc422694f542f8', '1e29d7d2fa274aa34e04ff223ca2e14a1e38154a', '3c57f7b2b31363cc370bf6d0ac16d47b8f910d58', 'ba2135a7f332e6cbc15680e57215286baba69964', '7311bee2d68d0bbee8a33070e681c7eb225ebfc5', '73245bba0fe6a8a344a3243b7b471217b7451e9e', '2b14b5d080181e1bba344047ef799ed6ed944298', '27fb0b03765798505a08760c748fffd3a0f477c0', '753670263c3ebd07de930a4e08b2a958efbf5601', 'ac5a49658c3f623e4a5c834edeb3a05c6443e72f', '2eab1eb680cac5787ae2b151f542cf37c6021ae7', '41d5b1ea2e9c040265e75eaba77d47dda24b8b71', '0ea10201b14c37f09cb707eca0ce611511b14fbf', 'beeb798dc5a4b1317d311934e85db7aada84632c', 'c4b70f1dfd91635dfb26adc2043ca50570df2ce5', 'ce8d1f138728bbdbcbba60ce511c1b01d9099ebc', 'c853a66473866e6a84f42a7b08158efa204926de', '1075e3a139b142fe8b020f0b37a96967caab72f8', 'ac6750c5f26e3cc6bddcf94db6f12db0c806e4a3', '11a9b9c13b00ac50efa85b7408a69fbf8eb0914d', '1755b4180e6f160645ca0f2ecd829288ac3558b5', '376f5f4b92a36b799e75efaf91f525aacf41c6c0', 'a2b6fb94bfbb0a9ed912df378d515fe12fc50c21', 
'79d0dded1bfbd307b1449e9f0515f384f86ce9c3', '82c0878c01a449e35d0d55cdcaf3a5a117dc3dbf', '57c6b7148d049c56a128b1c785a71a060aa5fd83', '314fce43a13565578b55a24dae9a96ed5a666c42', '0b07b032ba0eaa3082909e710c39a7c0baa6a712', '29b8c3b4fd393acc338f1af30b35dc0ab4c9abd4', 'f62ce373a2984a4f393723bccd931f254e476391', 'c8333d1c7faf00497eed14e6c0b8e76e3a3ee0d0', 'f7789ef50c67e3d9d57dc9d3b15a7f6f720f76fe', '1be32177c7a1b9c65dfacad88f0a135164b44246', '9359d828b98809f71d91a913284ca0b3a864b168', 'b42f4ff18a858ff8f02e6b144d295df7fb24a862', 'bcccfa9ea926dc235a1ebfd6447b722744628c80', 'f38611811b63a177d10273b9b3d2edd3456cf8a7', '2d574bd536a3ffe32bd90339beb7c194b91c7af9', '01302fc606ae1b1d5c7fccf0ea938e9fbcc74fff', '5d610cc0cced232b7ff3c8610b7e9f035a06d758', 'd6f235ff5f6a85c25d5d239cbe9a8256d7543ffe', '30b6c4d696985c2d0ae8cba908fcd09312de4bb9', 'aea3bf8bcdf385288c2a9f8abc594ae894976efc', '8afc9f9315f3549c3ecc5f8db52a55533254af23', 'bf0bfadcd6a7d24cc07deab36c18ef54daaadce0', 'c0a22c3f7cca7befde2ce5d596bac180dd64b531', '77ed0d5b23793683b26e40f894bb11f6978222cd', 'a4d3445492f8b44163704ea90d72072df228fc8a', '0d2707b130fef30620c3b62d48e59716fa22a43c', 'e4685e3bf2583d249fe0163db58f620993601774', '5df1f62d5d94e4902064bb13caade24014824ca8', 'ea6525e27686631438c56ecf6446e8718a9fa39a', '158037eeb0be786d51577d0db76141a166b0006c', '099eaa1b094a3e4228098d7cc85a5ddd133bca71', 'f3cfa4ab011b43468de46a109de6eef21d1f8d7a', 'c56349ace2995868973d5f05f08a073dd1d7d9ec', 'c4f6b2eba635152d382a4b14668c11f9c367c343', 'c34c62fedbda674aa0f16994a0978092f7dfcfc1', '9992dc990ea5d51c9977dae7dd97410aa59a2a3c', '8f23e205e11b4ad316791f200913e35e89516377', 'adaa03397ea52277404dc19b791b25419b31c373', 'a20414c9016670be7d6e11bdfc9ca7f856f749fb', '909d29ff282cbd443502c5c0d96a613c67f33629', 'e29310ff23984b0c20d1df175dfcf48b92fd640a', 'c4ce94a35fdc50bbfacb8224f852d74259605671', '6fa1ab72cdedb42156299615c740717a44158cbd', '867a0973fb128e373f02b35407fde004e8fc4996', 'dac854b3a2e5953da02a181d053528c9eba3d975', 
'ff5ed956fabfae3df623645a3e320eb8e2dd5e41', '351b44c60e6a2e02b1915151904c215a25a2cbcf', '0bd95a3a558049ef440458d5790c36f7fff9174b', 'b50647c4c2d0a34a2399e9b682f7d7387bf0e547', 'ac04f4f12cafacab1853bed4eafcbe7e4afb69a9', '0cf7a2b868929aa88fc4561b83f05f7d58fdd021', '64dd6882be209c9efdc496be3f7a399c02bfa9c4', 'de53d4d37a37668b26a54afa7120a5182cfd786a', 'e12c59792ffdeb75a28f3ba81db308583c0c17f9', 'c976caedc14e0400d021ea3d61199b88d827a9ff', '017e2fe5742d457636a86d9821729aa05cd3bdbc', 'c785c0f92a180f09601d9962f3aec18814e649cb', '74d843664fac2f50d591fefd451febb219c672aa', '4d4e54fcecf8c2841295a7331382a1047a5fc662', 'c15241dff7c872c86df0990c94f9883b5af1808a', '7d1f3d5a1d7ccfcaea5a24152bc7c1d0fe10d62d', 'accb8c725011e522f87f15b7f2a1951f50726d98', '9604163e2a0e3471736f6417b7867448a869356b', 'd55d770616c3199a53b352ac618633cf10ee4110', 'd3c61afb3a20b887c5528ba87af70a47c557d505', 'ffb56b30db82123488fe837b080b26d69de1647c', 'a8b884d6164039714a5d418dde0a5a8fd6693120', 'e32cb4b6c12c7a26c9e9e3d0321d255b95623856', '6f9af3a5881b9e6c2be0f3e4f56d08ec46f06166', '782c86ae6584aac656a8ce62876188b16afc7eb5', 'ce94afc8f8487ffc21981377c69f8f818caa0ef9', '443da6abcaa3cf6dd145f8dd557e19084bbc9815', 'd04bba486e8c9de2b8806e0b99c48ab8966449c4', '28d536de93f2bd84243ae235b4cd258fb48d88ea', 'a35a221b962e4a665412365fbde6367d9c369d96', 'be2fc72b84bc786939c991f2dcd021fb1dd7dff9', '68f36ad7e630629c520c58ce4fe9ac4d2491aaf1', 'ad5ad62f929c1c4ab202e4ef7c6bb6bfd1ac95c3', '24bc7c579fa0f912b4a3995641ad0bea0fd2dceb', '854b6cdb220968a68cf6ad11d6c3fbe96783d8c8', 'b009814a37c19914a564c071540bdf78c76f56bf', 'a5f4f2ab56ba1ac85a5770188caaf2a4ee16dcef', '7118310a746334aa772f94223c8cd2a240a9f4fa', '41670cba9f5c684c49998b1831abd7c46766bc11', '0a173cc4df2d354ae6913d737569b28cc6b9d992', 'e4bef1c2f5c2ddad1a8d869154fac6d0dd989404', '15aa004a99ea2842d2ab70d840c2ed29c4072c2d', '5bac092a3a48743ed515db42e1745004a0be525c', '092787f5e169a4658646d884ddd93092bc4b301a', '969596feb7162be4202d9b356bcd201ae1b9d70d', 
'c25732e3973def6cadef8763684be8ff0235089f', 'e2021aa1ca43bcd99b6ac04e4da3f8d8d4ef39c5', '8209bed97970d4e8c908a49733a2cfe79b5f2315', 'ea1948d8a5eec4f9a0ccadae9beea0de08b46c71', '41999eefce1245fb1cc765a9246bad5f1ae247c8', '8d52d2e3b5b17413da36db38b7e29867888e7af9', '5bd9c844ee92675cb37c2a0c2156aa12dda9f9ce', '8f4b37f0682042416e4fa0f8d5a3acfce5c23d22', '958d8d959eb1eeeebea185d64aac34b45e46c2ac', '03516888707a1e1f87d8fd12345c311a90b961b9', '39c5d5a21947311de08c8178bf903361f257a14f', '35ee5c602cb52437ce7a4c75fe96e53df66288bb', '1b36511b90d6bb197f9308c4fe26ed1667c33023', 'b5bb5d8e2a5c8f5f7f04bd0965ceb7dce221bf6b', 'df8350e74a4d1eaf4a17236f42b9b77ec2951b5f', 'b55a5ccdf3e173a9f9fe470b8d61980c4843d67e', 'b37c32d0802a93cd4fa976452125344d516fc004', 'f670c311f4f0c41bd4202f29fb4ea4cc0ca8bd9d', 'a24d84fe139209fbda2311c6a9b357e28c10bcf1', 'e00ca9adfa26620699b0475509de93460f37268d', '865caae7eaf3a0e7ee0a12e95ace2a02bfdd8919', 'efc2cfe6a9003a230713003ec5df8eae17528826', '22bbcb8672acd1fdbe2f694058a6244efd435d1f', 'd3de8dcb601d10e9b71a3b726e0ff5111cb08aea', 'd7aeb2d16745ffcaeca1fb8e0a8f87b6738bb44f', 'b62fca281fe3fd29a1e11a636f007c5a55bd9938', 'c81c75bcdd7ea2b40a3712969eb5beeffcb390f7', '53eb63a95ca2b7e6c9b82ddf24ebcafd6cdaef48', '7a704ba047cfa97a2c8057996576db9a531f8afc', '72385f812acf7ebd57cffaf06536a3486383e354', 'b955cfe4a4407d3ad2e6d53c152a36a2b43d56dd', '1621ff5f63dac23bebf332fac295ec6d41e0ac1e', '7b4beb7fa993a24dc88ddeb5faa9c9994aa18230', '9feae90b61adc5bbde268e035897aedf9e913f5e', '74e6d3f71ddcdbdeb55a46c922e8833f103fbd16', '92137a58b6a250a4f39af004cdd05bc496ffe9f0', '39cd8d5b25186bbf99e61022e98b610d20773317', 'ede32d4160582f65d737a4712aee7a2015f5390a', 'fd179de2435327e7ccb95704702e457aac11123b', '3a8d6149eb4c92f1717ebaaeb9e51c4163a77e11', '992557cc89374c5b3c2010ba12a82717912608b4', '3b6262bcc745fffd6aef146157691fd6ac33e6f9', '8129b5b0f3190e29f65c09ec3ef8727c9a2da3b0', 'bcf018c16e4e2c11d6908facd658eed1313ff3b2', '401e73ef446390882f78d7375104e1ae4a344196', 
'4dec24735e5484fafabca32d9a3209e18a247942', '497163e2a7d3cf4a3908590b6ae68d0e45c0d6ea', '7851c70e64465581cadf41e18619a0a0726aa00b', '4ef426e926d2563d59915dad1ec87ab16b947427', 'a33419689c715a8a260a01b2352a05fa082ecd39', '8a371dd14880a9269d12f70037a45b154416babb', '48f10958236db99c535afae9bc2399442ad899a9', 'bc3e934f0b8c5b398dd5f7b9e9bba331c4d0dd06', 'fba52f021cc9318ef8a3eab67157858bdd6c55f6', '525d7d59cd78ae002c0fbc7407c8f835ec104ce4', 'a6414214a8a14bff445670890468b5d5e0f89fcb', 'fcf6e8b6d040aa6f1ff3c2724be88ec27d18a61c', '5ff79419c1222da84b5e37049d2b8eaee360c474', '9bb1864ced59b42d664a968dabe3645e91b057c5', '77f8a71ef25c0e230ae7c34b13b132d73fe37f84', 'd46300092fe5fc457136182b83483b8fd2a02e78', 'cc5d71a21289e532111b57be9d306bb070cf959c', 'e6a5a36705a34393dbda3bc8a9b77d94b75651c0', 'f366350fe99eec61b9e5e5fac070caac670da5e0', 'dc825f53d9d148226a293905a0ab796682050035', '68931e654a5dc1d970b4e2f22cb42856654faa2e', '68f4c1207bb2157fbb9f475100772797cfd098da', '2359ab3ad6039b5493907917e2224e2cb9c57d27', 'ea338d19d5a05d1a58730228f34a6feaadde464c', 'b7f1855af7785a6a5072f444ea9378a6e6ef0415', '79b4960b495e7d41b94f916359c0fab562ea78d8', '12c1b2214448324fd3d01d316c86665b00fba379', '0219b02acf88159b1357139c559f8eefed952091', 'e07392e9f2318d6eeb7f7c41b691534247e11a2c', '3eefbf1328896be2c792c7cbf96a990d77d8feb0', '49f8b520f74d461665d954f70acb3935c73aa298', 'f580f4b6c3366a0a1a945b1327c3b0d4c540507c', '2d0ebbc0e8f54659845fa2153dfe226aad82ff95', 'f00eba6c08f005ba7aa6f18e44cf0015bf69c8e0', '0f184261246ccaf557de764ef80457dfef546403', '73b9c4c0e575c51879c3631b6857661213f5cd47', '5472998c0f2e46a647197bd1c48041618ab522ca', 'efcf2bfcefaaba169ee645709ecf5bcc206a747b', 'bdf9581688b89c21ac1b1b861f3aca5db5b95a2a', 'ca4fda4f7bf014d4dbcf4dc1bd80212d1ae61d71', 'e206bd8357ba07cd3a48cb306dc7c51b0d329e7b', '4baa1d0a041ec74ed0a445d9387c57bace6fda3c', '39ada29c525ed3717e3d7869ad704b2ba7b544c4', 'd17409c5ef135740ed24337a5da29d7e9154f87f', 'e380b3d3b506aec795513c66371006d43e426f6a', 
'acfc24f92024c4e2100c6514ea80b81a8635c410', 'ecb7771f0eac083f85e0cab38e314fda2818d3fc', 'f1199d9ae3e967e804a742db3a593e9ce7654a1b', '7d69bb4459ec841dffe85b9a54fd5265de0cc007', 'c1053712f2ca6b1b937a414b2715fe96417c9083', 'ff53c4d87109c09b43b8a76e6e87ec1da64badb2', '372bb9084c6045f281a81a6dd782a06b209f8d35', '6976f42a2540a257f330f6a965dc0695a8cb7092', 'e9147fef923c5eb43f0dc12fddb637ab15d51ca7', 'abc663f70ef0a1158993bd5701c8477fe1e46982', '79f31764916adf5027c972acacbf68084b5b7b37', '3d3be4c0e405517af14e27b8e1fdbde543658391', '1ab25555a9174c4707e388626d0d6540f335341a', '5ad26cced507b1b39d5784d35c71e3cd8a080f58', '78c618f755b5822ed7d9565f63a717b41ebdbd47', '230b964bbfcb5ab9476ca159312d56e917fc6dbd', '33aac94569d23bd6a988e94100e47839808ff04e', 'b301638ae79e0d3af5eca862092a1a2c39562edb', 'e78653a4f162aa37ea4acdfeaa23792f2ce13bb5', '575f59ebb94ac4c1a29624affa1930752628a4ab', '7ea6f7ecb8987fca6ce0b561956029da127e0b30', 'f55212398b23a7a441a69c3448741f040c0929be', '4e3c76b7cbef1f11863ad535e6a43557ef9e0ed1', 'b76e1330c3d373d2492cdfffa4665209b5fb50fd', 'b0b2b7600879ba59b1a6b3dc369c0e0aa5464ac4', '2e0fa5c6e38b900e1b3622bf2ce3cc6eaa4d983e', '5ffa3c58579776df72e2b77b9f9bfd51c235de6b', 'f83be882c6b9f84b5adf76d4251bfba4b703a399', '2aac341861cd7269f97ff5418de23836dce6de8b', '742d730969401e56a7de1a35321f1ad8ed50ecbd', '764774f071883e2ff68babfd063d43a4daa2f921', '55ea401f7660c52e877bacd2229c97b2c6955ac1', '0deac7c72edb8cc37db930680238bb2601630e26', '0dc294932dafa11cc89149ccb29e810521654787', '21b180e4e2ff346e45d3f1aa2c48c81db84ef46a', '554386661c697e1e6c341637782f3ae00f126757', '9a08d01c5e48debce2d52e698beb6c50e062d665', '9f8f731cc84ddf8f44249dbabed4b8dd4a3b8d08', 'c3c8547c666701666ce75e92163c241ca5f85e2a', '2f7724bc63966142ff91ec88d31028b419866432', '91043b821acdb9d9c40462462902f46e51fd0a63', 'b089c66326e30ab02fed73fa264fc2e3a5df9603', '7f62a0e29ecd6015dd43770c0518793a9cdf96b6', '7c0287d7e14bd29f349c34021962cf9426f65e92', '060c0a33f2a20a0119dcce7757653ae7dce9a42b', 
'835752249a453a521656f81ab304255036a08949', 'd738014c3dafd3ae62bbf174f03d54d2e1a5ff80', 'eb90379a029c1770a84046c87771d9dc0d307d09', '1066e0f9b917b48d27e80b7753cfa28993f533a7', '06af1fda3e210c48fc53448adae16968a3b56dc9', '936b73c9112ebd80603657df0a2348ec77bcd777', 'beb69b64de0a5990b4e5fd2b6c8242d11a607939', '9cc8dd3f817acfa7bcdb7396ee634c83186c0965', '2c2e82fa02d82854f076a48f4df4e6cbd19d71cb', '67f2be0495f4e254e06326182a85dc94519c82a5', 'b33fed1870ad04e3a132d27cf8b851bdac500f99', '7427ea64f1ec24863a5908e2b150951b63e3a999', '16ae92614e0d5bc012592eea169351e1552227f5', 'c1dc82fab3bda6bbcb62780a277c40735d892e25', 'a2bdd8eed73fd484551675c1428a8a80bec59a9c', '52b8208627010d872222f557891e82cea7e46ac9', 'ae103997d2ffee109ddd6ca5edc9488fbe18fa7a', '1608a4f9cd7348eaab2c38083a97aeac9bdd0179', '9fc4fc4680c0889189219d97c99ef21d4ac0c3e7', 'b8c0dfdaa8a10b2154b1009d755ecc4434a59b55', 'a03cd9f82b0dec7211cf69706a584e76e88be92c', '60dc76da9a4ec31b4a82412805e8bb9da97d2b62', '427f96c89f2a843d4178d70c567b1eac0a34c09d', '52f7101ba422d10ce51f91ae17e8a9c3c3e9d803', '319af963f7916906a8b289bc29ed73712e62c924', '926c26d1f4f44a3e378a0afb0dd4902f54f56cfd', 'ddce99ce4ed40bb0d628794676b9aa7afa02e3eb', 'be9f029b77c20f71167a73332ee57012ea0a7c1c', 'dfbd9e2f0b2c09d9dc554bbff38c02b80a936ad1', '8c9aea4b33049b53c90f40bd5ffe914edf6809db', '395b8c11dc7629faafb4b36b7996fae8888c3734', 'b86ccc5d835789db47e67e3fdfc8431ea448b706', '16a59335dc6283b5d6af107c30f7c95d801c6a48', '9b292736c1a20393c5066b251dc7eda48cad837e', '80581200117906d25b4613d37ad2dee5bf40412f', 'e6d99d566c8acc46b88a49a5441e641ac9bba22b', '5485c00be97e3079be0adec7ce43930b8e88f7db', 'dc052e81525a6919e21a58fadc7f32b843d7b513', '566175d708e15390b3a8e2c4f395f271854973a5', '32b623788b524de9c2e63cf6c7e9b2e7fd18fcfd', '855c40f815b90d177109232b88d8b3a7551b5237', '7c4d2db34738222fb564df8b0664157deab68cc4', 'c67a4eaa3f27267a25037ef8b06822f6631e783a', 'e45a59ae928c5f2ae0f56b6ba781455958eb01ce', 'c6f221afbe9610f2e9794f7346aa1e586ece2ff8', 
'e145f48e48416a03d428a8df054fda200ca93eea', '09edabbe36b0e8c093d6da0047896587360e9112', '9746b0d7f7405fac73e347a98eccc4c387b216a5', '6c6249c85922c18e422031f1acebdc9339aac0ac', 'f251e5af0d9a2114e6becdaa3f54325cb743ca98', 'edbfe3b8c228ca664723d464208f161b99819514', '32d814c8f0b83c922953b3dcde1842a3bfb12f88', '33707ac4306abd31f160bc2b2fe51f846a7dd8e4', '24abddabaf2bc24b74b39120481f00f3bdf8dee1', '8850cf6c6666b79a37683d6dd23e6a1a42fb90de', 'ad889aa95c9e9fc81bb39a830efd816d40ecbcb1', '3ff1f4f289a5a3b30843977b9e2e0b96337c80e2', 'b49c22c953c3347f3564c09143d2b4af8de650a8', '4b98980d38325f4acc7dc97a4e0f28b59104a2ca', 'bc06585dd065b5dbdd8c921842fc9b5fc8b0f824', 'd335fd80c55caa51bb45df048867145741442efa', '26c88ad749aaa5c38bb8df402c688945919a509a', 'a539ba2d6f9681673583ff14434ae6a10abf6007', '0f0742dbb0c5000610eab03a19ec3e143924ae85', '91c009e7c9ad80bfe17afe16549592434afab665', 'e9c287b098a12dc688b5bd15ccf3450b52332afa', 'bb72a1231d67087e7f9c6127aff01bb2cc88d6e3', '3399c46b98dde9ad4ed5f8ac6d845e879e8652c7', 'f813cdfcc4335225363dc2e451c3fffeaf802d48', 'da1478be124100b7b1e7568ab620c9256268a277', '6108b950956c81a70ff3825080f861d05089edb2', '6828e0f244452d65b87d2ee9d951e355dbf211de', 'bc3d402ec8f695b6d2693e2fed826a3df53311e9', 'fc9f9e817d2e516003f748f0a8b327c5490a0231', '864ec1624f5113c76e43c722a950993ac89b179c', '91f41249f40d7bcc23168e9b619e926cac0c0c04', '230f5f69a64e853d9f82a7671d6a3e23cef21833', '53d6201b1b93fe6e3b2960379e04451021e100c0', '18dd8da3ff75481aa3886c45db32a82c73312bf3', '375a5f47f41681a56972fd459e6ee4c45c157075', '84cdf4f541e5df3979c0fea8ffd9cabdd210d9d8', '00c988c15598723a56554afebeb7205ce9e125ce', '50272ffe52aef0a4809ec45fbc7db08b2ac5e061', 'b111f0a20689f728775ceff8eb9428080a38fc3e', 'a25987f004462fd10bfe725b80ce951293d1f023', '4f92e2f8aa903474b7ed42d04ad46d8258bf35e8', '07453e6118007b17085f4c7b2488980a2f286d4e', '7485695064599f3c8bce132a67b62b3bdfe00494', '7e91565735983342e2d574a9041abbd489a4cb30', 'e80c8919ad8c91b1d07f63840f838c030a1f0c9e', 
'6a365c98a6e471575bdfeaf91bfa6a4e59dd77d1', 'b2bfd9406c86673be294d1f707f0880e9635cb6c', 'd2fe98b10345504770d388882dbb5bf1066048c5', 'b239b72a6889c3886bfa4e8b4e635f68062338ae', '88c5ac54a4833632bd41ba007a18f8dfe383add5', 'e23eedf4af59c9a6b25e5131185945f5bf7f3d40', '442855386a0754f524d8ab75b0696c830b4dbd45', '4e7b1213b71a0c32a7de951140214811d6606601', '8fb23352aaebc55e7be1245b0cf17470cea10f01', '967aa275ef52cfd0c1f9120ca39dbc94d5e23181', '1e3a6dc165d7351f20112a4a6a3ab77a9ae61f2c', '37aa433dae08bda1a45c5e83ca8497c4c2f8d95f', '3b6c72d3eac24cbc966cc5fcbb4443affe67fd15', 'f890bc9f98fe18f811d8bb2708f3411bba5dff12', 'd978bf57ab07e2d222f6c13fad6251d10ced695a', 'f35865413c0769a1d018bea58e5552abd2f2f141', '647bebe39fbd4ff408600f4d7fd599f4bbd8b4b7', '97f300a878382f910dbe5c950b29ba0c7b6d1638', 'e43f56c393009fd6bddb1628409c37162310591a', 'cbe50ce355fef7f56bd5bbc23633e5e0d9a4e4aa', '2185bc89e605412401f7bcf3744c0eebf6d160f5', '7dea0e9e5685ca477f512ef3c37147555e03e2a2', 'e0871c5b82ac7eb4345c8f7f3d393454eee32d12', 'd27dcea3db37e2135ad62f644dc6d8882581df59', 'af1f8904fe6f6041869790150c64e3804d5ce385', 'aa869232abec3716f6bea6b05003dc8fcc96fa0b', 'c558c38a0ba025342d50a7bcf4fca78ed121322f', 'baca82c40b68348a9788f68c1d917883f4455bfe', '1a0f4fee9f2ce40bfa98963659400ffbd72f533b', 'e95d55ddbc0d651bd543ea3bca650e4fea6ec3f0', 'ae61a81a2986ca66ce262d096c7557c926775ece', 'c87444cef977cce169853815ba3e44e6039a6176', '929d760ff022a61d83267bb31e5a9debd0fad992', '3eacd8a05832fd3741865f0d33e93509003057d3', 'df8843a770a3817aa6128144968f9780e4d8b93c', '6babd89607b1d72b80ee6f7e5a2b600eb7dde83c', '48d80bde5fe88f2c1f02a4cd953e176007171b39', 'd66382542ae64f9f72845a22e584c7ba3814263e', 'd249718d5a0abb9a24bc58085dc052a5d0065118', 'aef9504ab27b78fbfa8c9e03dc676546c0d9ace7', 'ce630a0ca55c28ee5c78701445f99de22e37d908', 'e5b2f34a3cc5b0fe8967fe4e911060fc33727608', '92096cc0bff6454642716f5f8859e37bd5a26d89', '3febafa73e50ba16acd9ebdcb8f97520392acf25', '531d1311849001df8a199a1561a8ccea74d69b31', 
'dec866c42e27e6ae9a1d04de405e06abc9c43231', '24974211a78aba94f94e0b1461ff8aecde6a2858', '02060d0511d0b69a2a88ae24d70bb3bff5f55196', '7e0ca91c5a0f2358b5e4123e6ec97edc8b001f6c', 'b47975da306b3a13b51a2ab9d2590e9cb380a358', '199b8894a9d6c45a60784afe53e791663796ff82', '3cda6f14b9c753ff7c7a1fe1a7ef238d06de7d8d', 'b6f87b13112358d65ac4a4868620bb15e916e74b', 'ee10fff76b317675bef74fc788f89f1c59eab4b9', '2063fa14fa570773a6d88e77e777d87811ca0e55', '21c4188daf823693faade26d11b2bf823d68a1a4', '9fc29f559fbe1062abb47a1accf6357c2a23db0d', 'e5edb1fa542980f20a99d5e740e03e866a0ff86d', '3d3972fcaec4f025766c9a9fad74675caa5ca991', '023eba60e52495d80efcafcef9abf35adf38baf9', '30117df64ded972ef9ad78eb1cb3086e43424d08', 'edeea9ff910f0a42f3efb722e203bf1a190ded01', '595054842ee43c6ff199afe1e6e95cee3b18fbf6', '599b3cfab390ab4d2757cde4807b2468accf4004', '58d91aa5b6626920cbb924bea2b8933e95b7ab57', '060163b1ca089f648cc95ccf9f31f6bd5645327d', '8b58936a85ec96e6f12e7de5558141955cb3baca', '461c23462aba4e4c34a49e72564da739a8f64d9c', '0e00e2789b9907bc3c5d192bb0f6536a816b2a72', '92824bfb08987bd73c60b4d69abcb30dbe9f328c', 'e14212c50b25171a3bae06acd67088f6189f1f0a', '09ab8816f2dffe2d9fe651e04a4d8868e701afbb', '788d64bc1282b8ff8d0e38c19b60e7679b6c569e', '9a7d6f41fe294aaa7ab3050d2cd695d5a25ed3a1', '32f134d380a8d46fbc178fd7f66cebf6374ef570', 'fd8ef7098cbc17b1b39431598250a10490a172aa', 'dc9364152536239d00204c963d96e97587d691ed', '16bc2fd12f2c4bf216e2b0de71d75477151926bd', '3ef379894ea4780ffb5be0f180008d85fc955c7f', '1487d2e3f8f66df27b1d62f999629b1dc8e7850b', '525f139a9b30ba35649c122a0e0c3024acdf7a2f', '592074660bd09cc687eaf1b5405e695b9a7c7df8', '2446763300c204dcecf16e603a38f3a9572a428f', 'a1025f329fc408b30e46a7ba2a4cabdd7b4b986e', 'b8277c72a333bdc50fc570e263e1a00e77efafe5', 'f287158565caa170d99d47f31c5fcf03c93a293c', '2af88fe47c4f87430126690410366cdca27b7c05', 'c8504bba850d87d6c1ba9ec3d32a42f5bcf38078', '0039975b8f375ca7f4905bacd70cabf2bdd38f84', '511eca6e5b4e68c55bf090ec1802501b76fdecdc', 
'f00dbb58d325623c5a6f934fdc49c74995174d20', '6d1bd66c17e47b03f4a2449a934bbfd3a6e5f535', 'ca98a256c3430490eb6d2cf7994b60353fc15200', '431a514a8b439da8612575ccfbfc4f559880b38f'], []), 25656: BeirSciDoc('dec997b20ebe2b867f68cc5c123d9cb9eafad6bb', re.compile('^Training deep neural networks generally requires massive amounts of data and is very computation int.{1320}classification problems, thus significantly boosting our ability to solve such problems efficiently\\.$', flags=48), 'Deriving optimal weights in deep neural networks', ['9716460', '2116548', '1695338'], 2018, [], ['367f2c63a6f6a10b3b64b8729d601e69337ee3cc', '178325c2b267bee56931f22e4f17c6454de7475a', '0d67362a5630ec3b7562327acc278c1c996454b5', '2efc0a99f13ef8875349ff5d47c278392c39e064', '15e0daa3d2e1438159e96f6c6fd6c4dd3756052c', '5d90f06bb70a0a3dced62413346235c02b1aa086', '7346d681807bf0852695caa42dbecae5265b360a', 'c61d139a2382760f560164e25e4be264de5dd59f', '1827de6fa9c9c1b3d647a9d707042e89cf94abf0', '563e821bb5ea825efb56b77484f5287f08cf3753']), }) self._test_docs('beir/scifact', count=5183, items={ 0: BeirTitleDoc('4983', re.compile('^Alterations of the architecture of cerebral white matter in the developing human brain can affect co.{1609}or MRI provides insight into microstructural development in cerebral white matter in living infants\\.$', flags=48), 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'), 9: BeirTitleDoc('70490', re.compile('^Likelihood ratios are one of the best measures of diagnostic accuracy, although they are seldom used.{286}ples illustrate how the clinician can use this method to refine diagnostic decisions at the bedside\\.$', flags=48), 'Simplifying likelihood ratios'), 5182: BeirTitleDoc('198309074', re.compile('^Introduction: Among the inflammatory mediators involved in the pathogenesis of obesity, the cell adh.{1481}nical inflammation that results from obesity by reducing the cell adhesion 
molecules and chemokines\\.$', flags=48), 'Adhesion molecules and chemokines: relation to anthropometric, body composition, biochemical and dietary variables'), }) self._test_docs('beir/trec-covid', count=171332, items={ 0: BeirCordDoc('ug7v899j', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48), 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', '11472636'), 9: BeirCordDoc('jg13scgo', re.compile('^This report describes the design and implementation of the Real\\-time Outbreak and Disease Surveillan.{1077} be a resource for implementing, evaluating, and applying new methods of public health surveillance\\.$', flags=48), 'Technical Description of RODS: A Real-time Public Health Surveillance System', 'https://academic.oup.com/jamia/article-pdf/10/5/399/2352016/10-5-399.pdf', '12807803'), 171331: BeirCordDoc('pnl9th2c', '', 'Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', 'https://www.sciencedirect.com/science/article/pii/S1078588420303804?v=s5; https://www.ncbi.nlm.nih.gov/pubmed/32446539/; https://doi.org/10.1016/j.ejvs.2020.04.040; https://api.elsevier.com/content/article/pii/S1078588420303804', '32446539'), }) self._test_docs('beir/webis-touche2020', count=382545, items={ 0: BeirToucheDoc('c67482ba-2019-04-18T13:32:05Z-00000-000', re.compile('^My opponent forfeited every round\\. None of my arguments were answered\\. I don’t like the idea of winn.{293} sold to minors in ANY state\\. 
A retailer who says it is illegal to sell you them is, frankly, wrong\\.$', flags=48), 'Contraceptive Forms for High School Students', 'CON', 'https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/'), 9: BeirToucheDoc('fbe6ad2-2019-04-18T11:12:36Z-00001-000', re.compile('^Why is it that so\\-called christians, Because there is no such a thing as a christian, Have serious t.{315}, All you did was babble on and on and on\\. So in this sense, It was YOU that forfeited\\. Sheesh! Bye\\.$', flags=48), 'The closet dementia of the superior ego god complex, The bible and why you should not believe in god', 'PRO', 'https://www.debate.org/debates/The-closet-dementia-of-the-superior-ego-god-complex-The-bible-and-why-you-should-not-believe-in-god/1/'), 382544: BeirToucheDoc('671509c8-2019-04-17T11:47:34Z-00007-000', 'Charter schools are exploited most by affable students', 'Charter schools', 'CON', 'http://www.debatepedia.org/en/index.php/Debate:_Charter_schools'), }) self._test_docs('beir/webis-touche2020/v2', count=382545, items={ 0: BeirToucheDoc('c67482ba-2019-04-18T13:32:05Z-00000-000', re.compile('^My opponent forfeited every round\\. None of my arguments were answered\\. I don’t like the idea of winn.{293} sold to minors in ANY state\\. A retailer who says it is illegal to sell you them is, frankly, wrong\\.$', flags=48), 'Contraceptive Forms for High School Students', 'CON', 'https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/'), 9: BeirToucheDoc('fbe6ad2-2019-04-18T11:12:36Z-00001-000', re.compile('^Why is it that so\\-called christians, Because there is no such a thing as a christian, Have serious t.{315}, All you did was babble on and on and on\\. So in this sense, It was YOU that forfeited\\. Sheesh! 
Bye\\.$', flags=48), 'The closet dementia of the superior ego god complex, The bible and why you should not believe in god','PRO', 'https://www.debate.org/debates/The-closet-dementia-of-the-superior-ego-god-complex-The-bible-and-why-you-should-not-believe-in-god/1/'), 382544: BeirToucheDoc('671509c8-2019-04-17T11:47:34Z-00007-000', 'Charter schools are exploited most by affable students', 'Charter schools', 'CON', 'http://www.debatepedia.org/en/index.php/Debate:_Charter_schools'), }) self._test_docs('beir/cqadupstack/android', count=22998, items={ 0: BeirCqaDoc('51829', re.compile('^I want to send files to android tablet with a application from PC\\. \\- I can send files directly to ta.{188}m \\? \\- How can show my device as a external drive\\? my application that sent files written via Delphi\\.$', flags=48), 'How can show android tablet as a external storage to PC?', ['usb-connection-mode']), 9: BeirCqaDoc('19394', re.compile('^I bought "Cut the Rope" on my Nexus One cellphone from the Android Market\\. When I open this game on .{51}it to be "Purchased"\\. How can I add my Google account to Kindle Fire\'s Amazon appstore account list\\?$', flags=48), 'How can I use an app purchased from the Market on a Kindle Fire?', ['google-play-store', 'amazon-kindle-fire', 'accounts']), 22997: BeirCqaDoc('38348', re.compile('^With the growing number of Android devices in all sorts of different form factors \\(dev boards like R.{163}roid\\. For example, having the standard Linux build tools available would let me easily run a server\\.$', flags=48), 'Is there any easy way to get GNU build tools on Android? If not... why not?',['linux', 'development']), }) self._test_docs('beir/cqadupstack/english', count=40221, items={ 0: BeirCqaDoc('11547', re.compile('^An eponym is one way to eternal \\(if posthumous\\) fame\\. 
But is there a word meaning an eponym someone .{65}oycott_ , Mr Justice _Lynch_ , and Patrick _Hooligan_ would not appreciate their undying notoriety\\.\\)$', flags=48), 'Is there a word meaning "an unwanted eponym"?', ['single-word-requests', 'eponyms']), 9: BeirCqaDoc('182056', re.compile("^In the following statement, which one is grammatically correct\\? > XYZ caterers \\*\\*is\\*\\* on to somethin.{76} be 'are' as caterers is plural\\. But it has been suggested that I might be wrong\\. What do you think\\?$", flags=48), '"XYZ caterers is.." or "XYZ caterers are.."?', ['grammar', 'grammatical-number']), 40220: BeirCqaDoc('38346', re.compile('^A colleague and I were having a discussion as to the proper plural form of _abacus_\\. I believe the p.{183}rd that is part of the Arabic language\\. Any opinions or history to this matter would be appreciated\\.$', flags=48), 'Plural of "abacus"', ['meaning', 'etymology', 'grammar', 'latin', 'roots']), }) self._test_docs('beir/cqadupstack/gaming', count=45301, items={ 0: BeirCqaDoc('11542', 'What\'s your Supreme Commander 2 build order. I don\'t just want "6 mass extractors, 2 power and a factory". List of building and units out to the second or third factory, please.', 'Supreme Commander 2 - Build Orders', ['supreme-commander-2']), 9: BeirCqaDoc('19393', re.compile('^What are the benefits of an assault ship over an interceptor\\? I played some significant time ago, an.{176}So: a\\) What are the main uses of each b\\) Which would most benefit the style of play mentioned above\\?$', flags=48), 'Assault ships v. Interceptors', ['eve-online']), 45300: BeirCqaDoc('38346', re.compile("^_But you can't have more than one companion\\._ \\*\\*Wrong\\.\\*\\* So I was taking that stupid dog, Barbas, to.{156}ne with the dragon they start attacking each other\\. 
How can I get them to stop and be friends again\\?$", flags=48), 'How do I make my companions friends?', ['skyrim']), }) self._test_docs('beir/cqadupstack/gis', count=37637, items={ 0: BeirCqaDoc('73399', re.compile("^There is a satellite image it's size is 10 GB and I need to display this image using GeoServer and O.{211} response time using 32 GB satellite image\\. Please advice me how to achieve this\\? Thanks in advance\\.$", flags=48), 'Satellite image display with the help of GeoServer and OpenLayers', ['openlayers', 'geoserver']), 9: BeirCqaDoc('5983', re.compile('^Has anyone succeeded in programmatically updating metadata in ArcGIS 10\\? Considering using Python/ar.{254} except where they are in conflict in which case the added elements overwrite the existing elements\\.$', flags=48), 'Programmatically edit/update metadata in ArcGIS 10', ['arcobjects', 'arcgis-10.0', 'python', 'c#', 'metadata']), 37636: BeirCqaDoc('103092', re.compile('^Link: http://projects\\.nytimes\\.com/census/2010/explorer How can I also render that specific kind of m.{121} says its from Google at the bottom right, but then why does it look different from maps\\.google\\.com\\?$', flags=48), 'What map library does this census visualization use?', ['gui']), }) self._test_docs('beir/cqadupstack/mathematica', count=16705, items={ 0: BeirCqaDoc('35237', re.compile("^I'm trying to use `Get` to load some pretty substantial packages from a custom menu in the _Mathemat.{320}` / `MenuItem`\\) that will remove that time constraint so that my command can be executed completely\\.$", flags=48), 'Time constraints on KernelExecute commands or MenuItems?', ['front-end', 'menu']), 9: BeirCqaDoc('28990', re.compile("^I have multiple data sets, each of which is a 2D matrix\\. I want to construct a new 2D Matrix in whic.{139}ix2\\[i\\]\\[j\\] \\+ \\.\\.\\. \\+ MatrixN\\[i\\]\\[j\\]\\) I can't quite figure out how to do it in _Mathematica_\\. 
Thanks$", flags=48), 'Averaging multiple 2D data sets', ['list-manipulation', 'matrix']), 16704: BeirCqaDoc('34149', re.compile('^I want to add two matrices, the first one containing a 2D vector at each position the other one a li.{675},MB\\},2\\] This works but is rather slow\\. Is there a faster and maybe more elegant way to do this\\?$', flags=48), 'MapThread Alternatives', ['list-manipulation', 'performance-tuning', 'map']), }) self._test_docs('beir/cqadupstack/physics', count=38316, items={ 0: BeirCqaDoc('110557', re.compile("^Let's discuss about \\$SU\\(3\\)\\$\\. I understand that the most important representations \\(relevant to physi.{732} indices\\)\\. What is the general procedure to represent the generators in an arbitrary representation\\?$", flags=48), 'Representation of SU(3) generators', ['particle-physics', 'group-representations']), 9: BeirCqaDoc('11546', re.compile('^I have a question about the relation: \\$\\\\exp\\(\\-i \\\\vec\\{\\\\sigma\\} \\\\cdot \\\\hat\\{n\\}\\\\phi/2\\) = \\\\cos\\(\\\\phi/2\\) \\- i .{152}alized for \\$\\\\hat\\{n\\}\\$ being an operator\\? If so how exactly would the expression be different\\? Thanks\\.$', flags=48), 'generalizing spin rotations', ['quantum-mechanics', 'angular-momentum', 'spin']), 38315: BeirCqaDoc('38347', re.compile("^Let's say a box is moved by attaching a rope to it and pulling with an applied force at a certain an.{528}ined ramp, the above would not work\\. What do I need to do differently to solve this type of problem\\?$", flags=48), 'Overcoming Friction', ['homework', 'newtonian-mechanics', 'friction']), }) self._test_docs('beir/cqadupstack/programmers', count=32176, items={ 0: BeirCqaDoc('228054', re.compile('^I am in the midst of writing a web application for work\\. Everything is from scratch\\. I have been a P.{739}s of speed\\. 
So, my question is as the title asks, is a client\\-side centric app substantially slower\\?$', flags=48), 'Are (mostly) client-side JavaScript web apps slower or less efficient?', ['javascript', 'node.js', 'ajax', 'browser', 'client-side']), 9: BeirCqaDoc('127472', re.compile("^I've been developing web apps for a while now and it is standard practice in our team to use agile d.{317}words, when you develop ML and NLP algorithms as a job, do you use agile development in the process\\?$", flags=48), 'Is Agile Development used in Machine Learning and Natural Language Processing?', ['agile', 'development-process', 'machine-learning', 'nlp']), 32175: BeirCqaDoc('213799', re.compile("^I'm developing a small system with two components: one polls data from an internet resource and tran.{762}he other writes\\? I started writing the code but was wondering if this is a misapplication of SQLite\\.$", flags=48), 'SQLite with two python processes accessing it: one reading, one writing', ['web-development', 'python', 'sql', 'concurrency', 'sqlite']), }) self._test_docs('beir/cqadupstack/stats', count=42269, items={ 0: BeirCqaDoc('110556', re.compile("^I'm a beginner in statistics and R, sorry if this question may seem trivial\\. I've collected data mea.{5246}analysis do you suggest\\? \\* If yes, how can I interpret the result I got \\(please, in simple terms\\)\\?$", flags=48), 'Is this a case for an ordinal logistic regression? Problems interpreting output', ['r', 'regression', 'logistic', 'interpretation']), 9: BeirCqaDoc('89379', re.compile('^!\\[enter image description here\\]\\(http://i\\.stack\\.imgur\\.com/qmNwR\\.png\\) The image above represents a hyp.{574} know of a good way to do that\\? 
If there is a better place to ask this question, please let me know\\.$', flags=48), 'Need subspace partition algorithm, not necessarily a full classifier', ['machine-learning', 'data-mining']), 42268: BeirCqaDoc('38346', re.compile('^Regression: Wage=b0\\+b1collegegrad, where collegegrad is a dummy variable\\. Suppose you want to estima.{221}nd thus get the true ratio, so the estimator is consistent\\. Am I correct, or am I missing something\\?$', flags=48), 'Consistency of estimator', ['self-study', 'consistency']), }) self._test_docs('beir/cqadupstack/tex', count=68184, items={ 0: BeirCqaDoc('182565', re.compile('^I am using a pgfplots stacked bar to display the aggregated energy demand of a houshold and the asso.{1179} \\\\legend\\{low price, high price\\} \\\\end\\{axis\\} \\\\end\\{tikzpicture\\} \\\\end\\{document\\}$', flags=48), 'Adding horizontal lines to pgfplots bar', ['pgfplots', 'bar-chart']), 9: BeirCqaDoc('61123', re.compile('^> \\*\\*Possible Duplicate:\\*\\* > Left and right subscript > Superscripts before a letter in math I .{128} the subscript O but to be on the left side\\? Is this possible which commands/packages I need to use\\?$', flags=48), 'How to change the side on which the superscript appears?', ['superscripts']), 68183: BeirCqaDoc('103090', re.compile('^I appreciate it if you let me know the most elegant way to draw a crossed hierarchy such as the foll.{3}ngs: X /\\\\ Y Z /\\\\/\\\\ p q t q has two parents Y and Z\\.$', flags=48), 'Crossed hierarchy', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']), }) self._test_docs('beir/cqadupstack/unix', count=47382, items={ 0: BeirCqaDoc('110557', re.compile('^Is there a way to avoid ssh printing warning messages like this\\? 
"@@@@@@@@@@@@@@@@@@@@.{196}the remote host identity has changed but I know it is fine and just want to get rid of this warning\\.$', flags=48), 'Force ssh to not to print warnings', ['ssh']), 9: BeirCqaDoc('110550', 'What is the difference between the red5 versions RC1 and RC2 ? and what does RC mean?', 'What is the difference between red5 RC1 and RC2?', ['broadcast']), 47381: BeirCqaDoc('38346', re.compile("^I've got my vacation coming in and thought I might use that for something useful\\. Essentially, I've .{2438}stuff used in enterprise security, I'm very ignorant about how things are actually used in practice\\.$", flags=48), 'Getting from proficient to expert', ['shell', 'virtualization', 'storage', 'cluster']), }) self._test_docs('beir/cqadupstack/webmasters', count=17405, items={ 0: BeirCqaDoc('35236', re.compile("^I'm making a website for a small hotel in php\\. The hotel owners want a reservation system that uses .{290}d of buying with paypal\\. Is this possible\\? Does anyone know of an open php system that handles this\\?$", flags=48), 'Hotel Reservation Request Booking Paypal PHP', ['php', 'looking-for-a-script', 'paypal']), 9: BeirCqaDoc('503', re.compile("^My website used to have sitelinks and now it doesn't\\. It's very possible that it's due to changing t.{219}\\.imgur\\.com/sBaDc\\.jpg\\) What are some things that I can do to improve my chances of getting sitelinks\\?$", flags=48), 'What are the most important things I need to do to encourage Google Sitelinks?', ['seo', 'google', 'sitelinks']), 17404: BeirCqaDoc('38346', re.compile("^I'm looking for a keyword racking tracker tool for google\\. I have found a lot of them over the inter.{182}ord as my site has hundreds of pages\\. Any recommendation\\? 
Or do I have to set each URLs per keyword\\?$", flags=48), 'Keyword ranking tracker that works on a per-domain basis', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']), }) self._test_docs('beir/cqadupstack/wordpress', count=48605, items={ 0: BeirCqaDoc('108998', re.compile("^In a shortcode context, is there any difference here\\? array\\( 'slideshow' =.{32} array\\( 'slideshow' => NULL, \\), Is there a best practice for that\\?$", flags=48), 'What is the difference between Null vs Empty (Zero Length) string?', ['php', 'plugin-development', 'shortcode']), 9: BeirCqaDoc('19393', re.compile("^I'm using WP\\-Cufon for font replacements\\. It's adding extra cufon canvas out side of p tags in my pa.{127} it happening\\? How can I solve it\\? I'm having same kind of problem with all\\-in\\-one cufon plugin too\\.$", flags=48), 'WP-Cufon adding extra space in my paragraphs in Firefox and Chrome', ['plugins', 'javascript', 'plugin-all-in-one-cufon']), 48604: BeirCqaDoc('38344', 'Is there a specific reason why we can find max-width:97.5% instead of 100% in common themes such as Twenty Eleven?', 'Why max-width:97.5% on content images?', ['theme-development', 'css', 'maximized-width']), }) def test_queries(self): self._test_queries('beir/arguana', count=1406, items={ 0: GenericQuery('test-environment-aeghhgwpe-pro02a', "Being vegetarian helps the environment Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. 
Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each Canadian” and “one apple tree produces enough fruit to make 320 pies.” Think about it — a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1] The 2006 U.N. Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions — by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2] As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says “We know that the single biggest driver of deforestation in the Amazon is cattle.” This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. [3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1. The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. 
Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China’s Yellow river do not reach the sea. [6] With a rising population becoming vegetarian is the only responsible way to eat. [1] Stephen Leckie, ‘How Meat-centred Eating Patterns Affect Food Security and the Environment’, International development research center [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 . [3] David Adam, Supermarket suppliers ‘helping to destroy Amazon rainforest’, The Guardian, 21st June 2009. [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997. [5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003 [6] Rupert Wingfield-Hayes, Yellow river ‘drying up’, BBC News, 29th July 2004"), 9: GenericQuery('test-environment-assgbatj-pro01a', 'Animals shouldn’t be harmed The difference between us and other animals is a matter of degree rather than type [2]. Their bodies resemble ours, as do their ways of conveying meaning. They recoil from pain, appear to express fear of a tormentor, and appear to take pleasure in activities; a point clear to anyone who has observed a pet dog on hearing the word “walk”. We believe other people experience feelings like us because they are like us in appearance and behaviour. 
An animal sharing our anatomical, physiological, and behavioural characteristics is surely likely to have feelings like us. If people have a right to not be harmed, we must ask ourselves what makes animals different? If animals feel what we feel, and suffer like us, to condemn one to testing because of them being of a different species is similar to racism or sexism.[3]'), 1405: GenericQuery('test-society-epsihbdns-con01a', 'Freedom of movement is an intrinsic human right Every human being is born with certain rights. These are protected by various charters and are considered inseparable from the human being. The reason for this is a belief that these rights create the fundamental and necessary conditions to lead a human life. Freedom of movement is one of these and has been recognised as such in Article 13 of the Universal Declaration of Human Rights. [1] If a family finds themselves faced with starvation, the only chance they have of survival might be to move to another place where they might live another day. It is inhuman to condemn individuals to death and suffering for the benefit of some nebulous collective theory. While we might pass some of our freedoms to the state, we have a moral right to the freedoms that help us stay alive – in this context freedom of movement is one of those. 
[1] General Assembly, “The Universal Declaration of Human Rights”, 10 December 1948,'), }) self._test_queries('beir/climate-fever', count=1535, items={ 0: GenericQuery('0', 'Global warming is driving polar bears toward extinction'), 9: GenericQuery('21', 'Sea level rise has been slow and a constant, pre-dating industrialization'), 1534: GenericQuery('3134', 'Over the last decade, heatwaves are five times more likely than if there had been no global warming.'), }) self._test_queries('beir/dbpedia-entity', count=467, items={ 0: GenericQuery('INEX_LD-20120112', 'vietnam war facts'), 9: GenericQuery('INEX_LD-2012336', '1906 territory Papua island Australian'), 466: GenericQuery('TREC_Entity-20', 'Scotch whisky distilleries on the island of Islay.'), }) self._test_queries('beir/dbpedia-entity/dev', count=67, items={ 0: GenericQuery('INEX_LD-20120112', 'vietnam war facts'), 9: GenericQuery('INEX_LD-2012336', '1906 territory Papua island Australian'), 66: GenericQuery('TREC_Entity-17', 'Chefs with a show on the Food Network.'), }) self._test_queries('beir/dbpedia-entity/test', count=400, items={ 0: GenericQuery('INEX_LD-20120111', 'vietnam war movie'), 9: GenericQuery('INEX_LD-20120312', 'tango culture countries'), 399: GenericQuery('TREC_Entity-20', 'Scotch whisky distilleries on the island of Islay.'), }) self._test_queries('beir/fever', count=123142, items={ 0: GenericQuery('75397', 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'), 9: GenericQuery('76253', 'There is a movie called The Hunger Games.'), 123141: GenericQuery('81957', 'Trouble with the Curve is a television show.'), }) self._test_queries('beir/fever/dev', count=6666, items={ 0: GenericQuery('137334', 'Fox 2000 Pictures released the film Soul Food.'), 9: GenericQuery('18708', 'Charles Manson has been proven innocent of all crimes.'), 6665: GenericQuery('46064', 'The NAACP Image Award for Outstanding Supporting Actor in a Drama Series was first given in 1996.'), }) 
self._test_queries('beir/fever/test', count=6666, items={ 0: GenericQuery('163803', 'Ukrainian Soviet Socialist Republic was a founding participant of the UN.'), 9: GenericQuery('134850', 'Ice-T refused to ever make hip-hop music.'), 6665: GenericQuery('81957', 'Trouble with the Curve is a television show.'), }) self._test_queries('beir/fever/train', count=109810, items={ 0: GenericQuery('75397', 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'), 9: GenericQuery('76253', 'There is a movie called The Hunger Games.'), 109809: GenericQuery('152180', 'Susan Sarandon is an award winner.'), }) self._test_queries('beir/fiqa', count=6648, items={ 0: GenericQuery('0', 'What is considered a business expense on a business trip?'), 9: GenericQuery('14', "What are 'business fundamentals'?"), 6647: GenericQuery('2399', 'Where do web sites get foreign exchange currency rate / quote information?'), }) self._test_queries('beir/fiqa/dev', count=500, items={ 0: GenericQuery('7208', 'Could an ex-employee of a company find themself stranded with shares they cannot sell (and a tax bill)?'), 9: GenericQuery('7526', 'First time investor wanting to invest in index funds especially Vanguard'), 499: GenericQuery('4872', 'Taking a car loan vs cash and effect on credit score'), }) self._test_queries('beir/fiqa/test', count=648, items={ 0: GenericQuery('4641', 'Where should I park my rainy-day / emergency fund?'), 9: GenericQuery('6715', 'What does it mean if “IPOs - normally are sold with an `underwriting discount` (a built in commission)”'), 647: GenericQuery('2399', 'Where do web sites get foreign exchange currency rate / quote information?'), }) self._test_queries('beir/fiqa/train', count=5500, items={ 0: GenericQuery('0', 'What is considered a business expense on a business trip?'), 9: GenericQuery('14', "What are 'business fundamentals'?"), 5499: GenericQuery('11104', 'Selling a stock for gain to offset other stock loss'), }) self._test_queries('beir/hotpotqa', 
count=97852, items={ 0: GenericQuery('5ab6d31155429954757d3384', 'What country of origin does House of Cosbys and Bill Cosby have in common?'), 9: GenericQuery('5adcd29a5542992c1e3a241d', "The 2015 Kids' Choice Sports Awards was hosted by an American footbal quarterback who was born on November 29th of what year?"), 97851: GenericQuery('5ac132a755429964131be17c', 'Blackfin is a family of processors developed by the company that is headquartered in what city?'), }) self._test_queries('beir/hotpotqa/dev', count=5447, items={ 0: GenericQuery('5ae81fbf55429952e35eaa37', 'Daniel Márcio Fernandes plays for a club founded in which year ?'), 9: GenericQuery('5a7fbc715542995d8a8ddf08', "The Monkey's Uncle and Benji the Hunted, are what form of entertainment?"), 5446: GenericQuery('5a8bae0c5542996e8ac889b5', 'The director of "An American Tragedy" emigrated permanently to the United States at what age?'), }) self._test_queries('beir/hotpotqa/test', count=7405, items={ 0: GenericQuery('5a8b57f25542995d1e6f1371', 'Were Scott Derrickson and Ed Wood of the same nationality?'), 9: GenericQuery('5a8db19d5542994ba4e3dd00', 'Are Local H and For Against both from the United States?'), 7404: GenericQuery('5ac132a755429964131be17c', 'Blackfin is a family of processors developed by the company that is headquartered in what city?'), }) self._test_queries('beir/hotpotqa/train', count=85000, items={ 0: GenericQuery('5ab6d31155429954757d3384', 'What country of origin does House of Cosbys and Bill Cosby have in common?'), 9: GenericQuery('5adcd29a5542992c1e3a241d', "The 2015 Kids' Choice Sports Awards was hosted by an American footbal quarterback who was born on November 29th of what year?"), 84999: GenericQuery('5a7543b155429916b01642cd', 'What is the title of the book that documents the involvement of the president of the BioProducts Division at Archer Daniels Midland in a conspiracy case?'), }) self._test_queries('beir/msmarco', count=509962, items={ 0: GenericQuery('1185869', ')what was 
the immediate impact of the success of the manhattan project?'), 9: GenericQuery('186154', 'feeding rice cereal how many times per day'), 509961: GenericQuery('195199', 'glioma meaning'), }) self._test_queries('beir/msmarco/dev', count=6980, items={ 0: GenericQuery('300674', 'how many years did william bradford serve as governor of plymouth colony?'), 9: GenericQuery('54544', 'blood diseases that are sexually transmitted'), 6979: GenericQuery('195199', 'glioma meaning'), }) self._test_queries('beir/msmarco/test', count=43, items={ 0: GenericQuery('19335', 'anthropological definition of environment'), 9: GenericQuery('156493', 'do goldfish grow'), 42: GenericQuery('1133167', 'how is the weather in jamaica'), }) self._test_queries('beir/msmarco/train', count=502939, items={ 0: GenericQuery('1185869', ')what was the immediate impact of the success of the manhattan project?'), 9: GenericQuery('186154', 'feeding rice cereal how many times per day'), 502938: GenericQuery('405466', 'is carbonic acid soluble'), }) self._test_queries('beir/nfcorpus', count=3237, items={ 0: BeirUrlQuery('PLAIN-3', 'Breast Cancer Cells Feed on Cholesterol', 'http://nutritionfacts.org/2015/07/14/breast-cancer-cells-feed-on-cholesterol/'), 9: BeirUrlQuery('PLAIN-15', 'Why Do Heart Doctors Favor Surgery and Drugs Over Diet?', 'http://nutritionfacts.org/2015/06/02/why-do-heart-doctors-favor-surgery-and-drugs-over-diet/'), 3236: BeirUrlQuery('PLAIN-3472', 'How Doctors Responded to Being Named a Leading Killer', 'http://nutritionfacts.org/video/how-doctors-responded-to-being-named-a-leading-killer/'), }) self._test_queries('beir/nfcorpus/dev', count=324, items={ 0: BeirUrlQuery('PLAIN-1', 'Why Deep Fried Foods May Cause Cancer', 'http://nutritionfacts.org/2015/07/21/why-deep-fried-foods-may-cause-cancer/'), 9: BeirUrlQuery('PLAIN-101', 'How to Treat Multiple Sclerosis With Diet', 'http://nutritionfacts.org/2014/07/22/how-to-treat-multiple-sclerosis-with-diet/'), 323: BeirUrlQuery('PLAIN-3471', 
'Uprooting the Leading Causes of Death', 'http://nutritionfacts.org/video/uprooting-the-leading-causes-of-death/'), }) self._test_queries('beir/nfcorpus/test', count=323, items={ 0: BeirUrlQuery('PLAIN-2', 'Do Cholesterol Statin Drugs Cause Breast Cancer?', 'http://nutritionfacts.org/2015/07/16/do-cholesterol-statin-drugs-cause-breast-cancer/'), 9: BeirUrlQuery('PLAIN-102', 'Stopping Heart Disease in Childhood', 'http://nutritionfacts.org/2014/07/15/stopping-heart-disease-in-childhood/'), 322: BeirUrlQuery('PLAIN-3472', 'How Doctors Responded to Being Named a Leading Killer', 'http://nutritionfacts.org/video/how-doctors-responded-to-being-named-a-leading-killer/'), }) self._test_queries('beir/nfcorpus/train', count=2590, items={ 0: BeirUrlQuery('PLAIN-3', 'Breast Cancer Cells Feed on Cholesterol', 'http://nutritionfacts.org/2015/07/14/breast-cancer-cells-feed-on-cholesterol/'), 9: BeirUrlQuery('PLAIN-15', 'Why Do Heart Doctors Favor Surgery and Drugs Over Diet?', 'http://nutritionfacts.org/2015/06/02/why-do-heart-doctors-favor-surgery-and-drugs-over-diet/'), 2589: BeirUrlQuery('PLAIN-3474', 'Fish Consumption and Suicide', 'http://nutritionfacts.org/video/fish-consumption-and-suicide/'), }) self._test_queries('beir/nq', count=3452, items={ 0: GenericQuery('test0', 'what is non controlling interest on balance sheet'), 9: GenericQuery('test9', 'who makes the decisions about what to produce in a market economy'), 3451: GenericQuery('test3451', 'when will notre dame played michigan state again'), }) self._test_queries('beir/quora', count=15000, items={ 0: GenericQuery('318', 'How does Quora look to a moderator?'), 9: GenericQuery('784', 'Why should one hate Shahrukh Khan?'), 14999: GenericQuery('537876', 'How do Russian politics and geostrategy affect Australia and New Zealand?'), }) self._test_queries('beir/quora/dev', count=5000, items={ 0: GenericQuery('318', 'How does Quora look to a moderator?'), 9: GenericQuery('784', 'Why should one hate Shahrukh Khan?'), 4999: 
GenericQuery('537790', 'What are the most interesting books on the side of atheism?'), }) self._test_queries('beir/quora/test', count=10000, items={ 0: GenericQuery('46', 'Which question should I ask on Quora?'), 9: GenericQuery('616', 'Which are the best books to understand calculus?'), 9999: GenericQuery('537876', 'How do Russian politics and geostrategy affect Australia and New Zealand?'), }) self._test_queries('beir/scidocs', count=1000, items={ 0: BeirSciQuery('78495383450e02c5fe817e408726134b3084905d', 'A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect', ['50306438', '15303316', '1976596'], 2014, ['38e78343cfd5c013decf49e8cf008ddf6458200f'], ['632589828c8b9fca2c3a59e97451fde8fa7d188d', '4cf296b9d4ef79b838dc565e6e84ab9b089613de', '86e87db2dab958f1bd5877dc7d5b8105d6e31e46', '4b031fa8bf63e17e2100cf31ba6e11d8f80ff2a8', 'a718c6ca7a1db49bb2328d43f775783e8ec6f985', 'cf51cfb5b221500b882efee60b794bc11635267e', '6329874126a4e753f98c40eaa74b666d0f14eaba', 'a27b6025d147febb54761345eafdd73954467aca']), 9: BeirSciQuery('ae0fb9c6ebb8ce12610c477d2388447a13dc4694', 'Distributed Privacy-Preserving Collaborative Intrusion Detection Systems for VANETs', ['49104949', '1709793'], 2018, ['a1e81122931a5e96ced6569d0ee22b174db1ebb7', '96bbb9c86cdd9d19643686f623898367f9efb0bc', '228c40580e888fc9df003a16b8b7abb5d854a6eb', 'ab95903604d7fb8c03148b1a4f56af3c6de6fde1', '4875ac38970c742d6bfa760ca26ab7a629fde8da'], ['24d800e6681a129b7787cbb05d0e224acad70e8d', '216d7c407109f5557ae525b292856c4ab56996ca', '6e63c4a8320be712e3067eef3f042bb3df38a8e1', '49934d08d42ed9e279a82cbad2086377443c8a75', 'b45d9d5957416f363635025630d53bf593d3dd5c', '11861442e7b59669d630aed8c3b5d5290a70687e', '0dacd4593ba6bce441bae37fc3ff7f3b70408ee1', '8ef2a5e3dffb0a155a14575c8333b175b61e0675', '32334506f746e83367cecb91a0ab841e287cd958', '61efdc56bc6c034e9d13a0c99d0b651a78bfc596']), 999: BeirSciQuery('89e58773fa59ef5b57f229832c2a1b3e3efff37e', 'Analyzing EEG signals to detect unexpected 
obstacles during walking', ['2492849', '6622542', '2927560', '40259975', '3334492', '46629632'], 2015, ['37512f0a2d5ea940f4debe84593ec2c054126c1e', '5181fe04756a2481d44bad5ec7f26461e41eaca0', '858e561895faadc6d6300948f06fd018a56c6775', '46f3cf9ff98c02b382079ec2d514c47379c3ffaa', '26fc69fb8cc5969b515e3b7d2bdc6ff83f68ac58', 'f9e11a43ccb47b58bc08937750f65d6306e6961a', 'fcd16ea07b9f35a851444f9933ca72535015d46c', '5fc1491937224b215a543196fe2514794b329c03', 'ac9e0bb99f12d697137b2373e1d5ba6f8babf355'], ['f1277592f221ea26fa1d2321a38b64c58b33d75b', '42ad00c8ed436f6b8f0a4a73f55018210181e4a3', '22ff979fafd58acea3b838036fdc55ed60b1a265', 'a20369f96ca4d73fbe25cc9e099b0f9ad57eb4a9', '94485df9a4a975ac8ae32e7f539c8a4f77d88f12', 'a5e6a3fb9bbfc4e494427b4f3a1782b9aefcab92', '4933491737750764aa304288f004f05a06f68704', 'f24a222de1e81b4dd5d9e3a6c5feb4499b095d4d', '57abdefc6e05d475cf5f34d190b8225a74de79f0', '0989bbd8c15f9aac24e8832327df560dc8ec5324']), }) self._test_queries('beir/scifact', count=1109, items={ 0: GenericQuery('0', '0-dimensional biomaterials lack inductive properties.'), 9: GenericQuery('15', '50% of patients exposed to radiation have activated markers of mesenchymal stem cells.'), 1108: GenericQuery('1395', 'p16INK4A accumulation is linked to an abnormal wound response caused by the microinvasive step of advanced Oral Potentially Malignant Lesions (OPMLs).'), }) self._test_queries('beir/scifact/test', count=300, items={ 0: GenericQuery('1', '0-dimensional biomaterials show inductive properties.'), 9: GenericQuery('51', 'ALDH1 expression is associated with better breast cancer outcomes.'), 299: GenericQuery('1395', 'p16INK4A accumulation is linked to an abnormal wound response caused by the microinvasive step of advanced Oral Potentially Malignant Lesions (OPMLs).'), }) self._test_queries('beir/scifact/train', count=809, items={ 0: GenericQuery('0', '0-dimensional biomaterials lack inductive properties.'), 9: GenericQuery('15', '50% of patients exposed to radiation have 
activated markers of mesenchymal stem cells.'), 808: GenericQuery('1407', 'β1/Ketel is able to bind microtubules.'), }) self._test_queries('beir/trec-covid', count=50, items={ 0: BeirCovidQuery('1', 'what is the origin of COVID-19', 'coronavirus origin', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: BeirCovidQuery('10', 'has social distancing had an impact on slowing the spread of COVID-19?', 'coronavirus social distancing impact', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 49: BeirCovidQuery('50', 'what is known about an mRNA vaccine for the SARS-CoV-2 virus?', 'mRNA vaccine coronavirus', 'Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.'), }) self._test_queries('beir/webis-touche2020', count=49, items={ 0: BeirToucheQuery('1', 'Should teachers get tenure?', "A user has heard that some countries do give teachers tenure and others don't. Interested in the reasoning for or against tenure, the user searches for positive and negative arguments. The situation of school teachers vs. university professors is of interest.", "Highly relevant arguments make a clear statement about tenure for teachers in schools or universities. Relevant arguments consider tenure more generally, not specifically for teachers, or, instead of talking about tenure, consider the situation of teachers' financial independence."), 9: BeirToucheQuery('10', 'Should any vaccines be required for children?', 'Anti-vaccination movements are on the rise, and so are pathogens like measles again. The freedom to not vaccinate paired with rampant disinformation may be a threat to society at large, and for children in particular. 
A users thus wonders, whether there are vaccines that should be mandatory.', 'Highly relevant arguments name one or more vaccines and reason about the (un)necessity to administer them to children. Relevant arguments talk about vaccination for children in general.'), 48: BeirToucheQuery('50', 'Should everyone get a universal basic income?', 'Redistribution of wealth is a fundamental concept of many economies and social systems. A key component might be a universal basic income, however, a user wonders whether this truly would help.', 'Highly relevant arguments take a clear stance toward the universal basic income, giving clear premises. Relevant arguments offer only emotional arguments, or talk about minimum wages, mentioning universal basic income only in passing.'), }) self._test_queries('beir/webis-touche2020/v2', count=49, items={ 0: BeirToucheQuery('1', 'Should teachers get tenure?', "A user has heard that some countries do give teachers tenure and others don't. Interested in the reasoning for or against tenure, the user searches for positive and negative arguments. The situation of school teachers vs. university professors is of interest.", "Highly relevant arguments make a clear statement about tenure for teachers in schools or universities. Relevant arguments consider tenure more generally, not specifically for teachers, or, instead of talking about tenure, consider the situation of teachers' financial independence."), 9: BeirToucheQuery('10', 'Should any vaccines be required for children?', 'Anti-vaccination movements are on the rise, and so are pathogens like measles again. The freedom to not vaccinate paired with rampant disinformation may be a threat to society at large, and for children in particular. A users thus wonders, whether there are vaccines that should be mandatory.', 'Highly relevant arguments name one or more vaccines and reason about the (un)necessity to administer them to children. 
Relevant arguments talk about vaccination for children in general.'), 48: BeirToucheQuery('50', 'Should everyone get a universal basic income?', 'Redistribution of wealth is a fundamental concept of many economies and social systems. A key component might be a universal basic income, however, a user wonders whether this truly would help.', 'Highly relevant arguments take a clear stance toward the universal basic income, giving clear premises. Relevant arguments offer only emotional arguments, or talk about minimum wages, mentioning universal basic income only in passing.'), }) self._test_queries('beir/cqadupstack/android', count=699, items={ 0: BeirCqaQuery('11546', 'Android chroot ubuntu - is it possible to get ubuntu to recognise usb devices', ['linux', 'development']), 9: BeirCqaQuery('20256', 'Does Android hide some amount of RAM from the User?', ['linux', 'development']), 698: BeirCqaQuery('61210', 'Can you remotely download AndroidLost to your phone if your phone battery is dead?', ['linux', 'development']), }) self._test_queries('beir/cqadupstack/english', count=1570, items={ 0: BeirCqaQuery('19399', 'Is "a wide range of features" singular or plural?', ['meaning', 'etymology', 'grammar', 'latin', 'roots']), 9: BeirCqaQuery('21616', 'How are "yes" and "no" formatted in sentences?', ['meaning', 'etymology', 'grammar', 'latin', 'roots']), 1569: BeirCqaQuery('76823', 'When to use articles and when not to?', ['meaning', 'etymology', 'grammar', 'latin', 'roots']), }) self._test_queries('beir/cqadupstack/gaming', count=1595, items={ 0: BeirCqaQuery('82449', 'Can the trophy system protect me against bullets?', ['skyrim']), 9: BeirCqaQuery('176686', 'Please instruct me on how to light myself on fire', ['skyrim']), 1594: BeirCqaQuery('146551', 'How can I fix a corrupted solo world?', ['skyrim']), }) self._test_queries('beir/cqadupstack/gis', count=885, items={ 0: BeirCqaQuery('52462', 'Calculating mean upslope aspect from each cell in DEM using Python?', ['gui']), 9: 
BeirCqaQuery('12833', 'How to smooth a DEM?', ['gui']), 884: BeirCqaQuery('104332', 'MODIS MOD13Q1 extract ndvi value', ['gui']), }) self._test_queries('beir/cqadupstack/mathematica', count=804, items={ 0: BeirCqaQuery('35544', 'How to use Automorphisms[] on a graph?', ['list-manipulation', 'performance-tuning', 'map']), 9: BeirCqaQuery('37414', 'limit calculation step by step', ['list-manipulation', 'performance-tuning', 'map']), 803: BeirCqaQuery('25260', 'NDSolve with vector function', ['list-manipulation', 'performance-tuning', 'map']), }) self._test_queries('beir/cqadupstack/physics', count=1039, items={ 0: BeirCqaQuery('110554', 'Magnetic field resistance material: are there any?', ['homework', 'newtonian-mechanics', 'friction']), 9: BeirCqaQuery('12012', 'Is spacetime simply connected?', ['homework', 'newtonian-mechanics', 'friction']), 1038: BeirCqaQuery('16082', 'How do I find the frictional force using a free body diagram?', ['homework', 'newtonian-mechanics', 'friction']), }) self._test_queries('beir/cqadupstack/programmers', count=876, items={ 0: BeirCqaQuery('88392', 'Why is closure important for JavaScript?', ['web-development', 'python', 'sql', 'concurrency', 'sqlite']), 9: BeirCqaQuery('210327', "What is the one or the few major changes from Java 6 to Java 7, couldn't JBoss do that already with Java 5?", ['web-development', 'python', 'sql', 'concurrency', 'sqlite']), 875: BeirCqaQuery('133937', 'Methods to rewrite a program', ['web-development', 'python', 'sql', 'concurrency', 'sqlite']), }) self._test_queries('beir/cqadupstack/stats', count=652, items={ 0: BeirCqaQuery('11546', 'Tool to confirm Gaussian fit', ['self-study', 'consistency']), 9: BeirCqaQuery('59955', 'Variance of superset from variance of subsets', ['self-study', 'consistency']), 651: BeirCqaQuery('35719', 'Improvement of regression model', ['self-study', 'consistency']), }) self._test_queries('beir/cqadupstack/tex', count=2906, items={ 0: BeirCqaQuery('197555', 'How can I learn to 
make my own packages?', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']), 9: BeirCqaQuery('57481', 'Aliasing issues using beamer with pdfLaTeX', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']), 2905: BeirCqaQuery('84944', 'How I can delete frametitle after pagebreak in mdframed box?', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']), }) self._test_queries('beir/cqadupstack/unix', count=1072, items={ 0: BeirCqaQuery('103549', 'Yanked USB Key During Move', ['shell', 'virtualization', 'storage', 'cluster']), 9: BeirCqaQuery('111331', 'Evolution of the shell', ['shell', 'virtualization', 'storage', 'cluster']), 1071: BeirCqaQuery('20536', 'reformatting output with aligned columns', ['shell', 'virtualization', 'storage', 'cluster']), }) self._test_queries('beir/cqadupstack/webmasters', count=506, items={ 0: BeirCqaQuery('28994', 'Someone else is using our Google Analytics Tracking code number. What do we do?', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']), 9: BeirCqaQuery('30705', 'Redirecting from blogger to custom domain', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']), 505: BeirCqaQuery('65733', 'Does removing ID from url improve SEO?', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']), }) self._test_queries('beir/cqadupstack/wordpress', count=541, items={ 0: BeirCqaQuery('120122', "How to enqueue script or style in a theme's template file?", ['theme-development', 'css', 'maximized-width']), 9: BeirCqaQuery('23263', 'Syntax highlighting for post/page editor', ['theme-development', 'css', 'maximized-width']), 540: BeirCqaQuery('90939', 'All-in-One Event Calendar: Custom Query - Getting each event Instance', ['theme-development', 'css', 'maximized-width']), }) def test_qrels(self): self._test_qrels('beir/arguana', count=1406, items={ 0: TrecQrel('test-environment-aeghhgwpe-pro02a', 'test-environment-aeghhgwpe-pro02b', 1, '0'), 9: TrecQrel('test-environment-assgbatj-pro01a', 'test-environment-assgbatj-pro01b', 1, '0'), 1405: 
TrecQrel('test-society-epsihbdns-con01a', 'test-society-epsihbdns-con01b', 1, '0'), }) self._test_qrels('beir/climate-fever', count=4681, items={ 0: TrecQrel('0', 'Habitat_destruction', 1, '0'), 9: TrecQrel('9', 'Carbon_dioxide', 1, '0'), 4680: TrecQrel('3134', 'Global_warming', 1, '0'), }) self._test_qrels('beir/dbpedia-entity/dev', count=5673, items={ 0: TrecQrel('INEX_LD-2009096', '<dbpedia:1889_in_France>', 0, '0'), 9: TrecQrel('INEX_LD-2009096', '<dbpedia:CityCenter>', 0, '0'), 5672: TrecQrel('TREC_Entity-17', '<dbpedia:Worst_Cooks_in_America>', 0, '0'), }) self._test_qrels('beir/dbpedia-entity/test', count=43515, items={ 0: TrecQrel('INEX_LD-2009022', '<dbpedia:Afghan_cuisine>', 0, '0'), 9: TrecQrel('INEX_LD-2009022', '<dbpedia:British_cuisine>', 0, '0'), 43514: TrecQrel('TREC_Entity-9', '<dbpedia:Émile_Gilbert>', 0, '0'), }) self._test_qrels('beir/fever/dev', count=8079, items={ 0: TrecQrel('137334', 'Soul_Food_(film)', 1, '0'), 9: TrecQrel('105095', 'Carrie_Mathison', 1, '0'), 8078: TrecQrel('46064', 'NAACP_Image_Award_for_Outstanding_Supporting_Actor_in_a_Drama_Series', 1, '0'), }) self._test_qrels('beir/fever/test', count=7937, items={ 0: TrecQrel('163803', 'Ukrainian_Soviet_Socialist_Republic', 1, '0'), 9: TrecQrel('54298', 'Electric_chair', 1, '0'), 7936: TrecQrel('81957', 'Trouble_with_the_Curve', 1, '0'), }) self._test_qrels('beir/fever/train', count=140085, items={ 0: TrecQrel('75397', 'Fox_Broadcasting_Company', 1, '0'), 9: TrecQrel('226034', 'Tetris', 1, '0'), 140084: TrecQrel('152180', 'Susan_Sarandon', 1, '0'), }) self._test_qrels('beir/fiqa/dev', count=1238, items={ 0: TrecQrel('1', '14255', 1, '0'), 9: TrecQrel('29', '189642', 1, '0'), 1237: TrecQrel('11023', '579370', 1, '0'), }) self._test_qrels('beir/fiqa/test', count=1706, items={ 0: TrecQrel('8', '566392', 1, '0'), 9: TrecQrel('42', '331981', 1, '0'), 1705: TrecQrel('11088', '437100', 1, '0'), }) self._test_qrels('beir/fiqa/train', count=14166, items={ 0: TrecQrel('0', '18850', 1, '0'), 9: 
TrecQrel('11', '596427', 1, '0'), 14165: TrecQrel('11104', '518310', 1, '0'), }) self._test_qrels('beir/hotpotqa/dev', count=10894, items={ 0: TrecQrel('5ae81fbf55429952e35eaa37', '6607768', 1, '0'), 9: TrecQrel('5ae142a4554299422ee9964a', '1216600', 1, '0'), 10893: TrecQrel('5a8bae0c5542996e8ac889b5', '690481', 1, '0'), }) self._test_qrels('beir/hotpotqa/test', count=14810, items={ 0: TrecQrel('5a8b57f25542995d1e6f1371', '2816539', 1, '0'), 9: TrecQrel('5a8e3ea95542995a26add48d', '5382358', 1, '0'), 14809: TrecQrel('5ac132a755429964131be17c', '644341', 1, '0'), }) self._test_qrels('beir/hotpotqa/train', count=170000, items={ 0: TrecQrel('5ab6d31155429954757d3384', '2921047', 1, '0'), 9: TrecQrel('5adec8ad55429975fa854f8f', '202525', 1, '0'), 169999: TrecQrel('5a7543b155429916b01642cd', '20527', 1, '0'), }) self._test_qrels('beir/msmarco/dev', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('beir/msmarco/test', count=9260, items={ 0: TrecQrel('19335', '1017759', 0, '0'), 9: TrecQrel('19335', '1274615', 0, '0'), 9259: TrecQrel('1133167', '977421', 0, '0'), }) self._test_qrels('beir/msmarco/train', count=532751, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532750: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('beir/nfcorpus/dev', count=11385, items={ 0: TrecQrel('PLAIN-1', 'MED-2421', 2, '0'), 9: TrecQrel('PLAIN-1', 'MED-4070', 1, '0'), 11384: TrecQrel('PLAIN-3471', 'MED-5342', 2, '0'), }) self._test_qrels('beir/nfcorpus/test', count=12334, items={ 0: TrecQrel('PLAIN-2', 'MED-2427', 2, '0'), 9: TrecQrel('PLAIN-2', 'MED-2434', 1, '0'), 12333: TrecQrel('PLAIN-3472', 'MED-3627', 1, '0'), }) self._test_qrels('beir/nfcorpus/train', count=110575, items={ 0: TrecQrel('PLAIN-3', 'MED-2436', 1, '0'), 9: TrecQrel('PLAIN-3', 'MED-2431', 1, '0'), 110574: TrecQrel('PLAIN-3474', 'MED-4634', 1, '0'), }) 
self._test_qrels('beir/nq', count=4201, items={ 0: TrecQrel('test0', 'doc0', 1, '0'), 9: TrecQrel('test6', 'doc63', 1, '0'), 4200: TrecQrel('test3451', 'doc117680', 1, '0'), }) self._test_qrels('beir/quora/dev', count=7626, items={ 0: TrecQrel('318', '317', 1, '0'), 9: TrecQrel('399', '364917', 1, '0'), 7625: TrecQrel('537790', '537789', 1, '0'), }) self._test_qrels('beir/quora/test', count=15675, items={ 0: TrecQrel('46', '134031', 1, '0'), 9: TrecQrel('187', '188', 1, '0'), 15674: TrecQrel('537876', '537875', 1, '0'), }) self._test_qrels('beir/scidocs', count=29928, items={ 0: TrecQrel('78495383450e02c5fe817e408726134b3084905d', '632589828c8b9fca2c3a59e97451fde8fa7d188d', 1, '0'), 9: TrecQrel('78495383450e02c5fe817e408726134b3084905d', '305c45fb798afdad9e6d34505b4195fa37c2ee4f', 0, '0'), 29927: TrecQrel('89e58773fa59ef5b57f229832c2a1b3e3efff37e', 'dec997b20ebe2b867f68cc5c123d9cb9eafad6bb', 0, '0'), }) self._test_qrels('beir/scifact/test', count=339, items={ 0: TrecQrel('1', '31715818', 1, '0'), 9: TrecQrel('50', '12580014', 1, '0'), 338: TrecQrel('1395', '17717391', 1, '0'), }) self._test_qrels('beir/scifact/train', count=919, items={ 0: TrecQrel('0', '31715818', 1, '0'), 9: TrecQrel('15', '22080671', 1, '0'), 918: TrecQrel('1407', '29863668', 1, '0'), }) self._test_qrels('beir/trec-covid', count=66336, items={ 0: TrecQrel('1', '005b2j4b', 2, '0'), 9: TrecQrel('1', '05vx82oo', 0, '0'), 66335: TrecQrel('50', 'zz8wvos9', 1, '0'), }) self._test_qrels('beir/webis-touche2020', count=2962, items={ 0: TrecQrel('1', '197beaca-2019-04-18T11:28:59Z-00001-000', 4, '0'), 9: TrecQrel('1', '24e47090-2019-04-18T19:22:46Z-00003-000', 3, '0'), 2961: TrecQrel('50', '799d051-2019-04-18T11:47:02Z-00000-000', -2, '0'), }) self._test_qrels('beir/webis-touche2020/v2', count=2214, items={ 0: TrecQrel('1', '197beaca-2019-04-18T11:28:59Z-00001-000', 0, '0'), 9: TrecQrel('1', '4fb4627-2019-04-18T18:47:37Z-00003-000', 1, '0'), 2213: TrecQrel('50', '4d1037f0-2019-04-18T11:08:29Z-00002-000', 
2, '0'), }) self._test_qrels('beir/cqadupstack/android', count=1696, items={ 0: TrecQrel('11546', '18572', 1, '0'), 9: TrecQrel('82440', '78789', 1, '0'), 1695: TrecQrel('61210', '61212', 1, '0'), }) self._test_qrels('beir/cqadupstack/english', count=3765, items={ 0: TrecQrel('19399', '102236', 1, '0'), 9: TrecQrel('19399', '4501', 1, '0'), 3764: TrecQrel('76823', '31410', 1, '0'), }) self._test_qrels('beir/cqadupstack/gaming', count=2263, items={ 0: TrecQrel('82449', '53562', 1, '0'), 9: TrecQrel('46138', '42968', 1, '0'), 2262: TrecQrel('146551', '28158', 1, '0'), }) self._test_qrels('beir/cqadupstack/gis', count=1114, items={ 0: TrecQrel('52462', '49462', 1, '0'), 9: TrecQrel('46866', '46762', 1, '0'), 1113: TrecQrel('104332', '104331', 1, '0'), }) self._test_qrels('beir/cqadupstack/mathematica', count=1358, items={ 0: TrecQrel('35544', '14789', 1, '0'), 9: TrecQrel('48026', '47994', 1, '0'), 1357: TrecQrel('25260', '26583', 1, '0'), }) self._test_qrels('beir/cqadupstack/physics', count=1933, items={ 0: TrecQrel('110554', '21138', 1, '0'), 9: TrecQrel('89378', '36242', 1, '0'), 1932: TrecQrel('16082', '16081', 1, '0'), }) self._test_qrels('beir/cqadupstack/programmers', count=1675, items={ 0: TrecQrel('88392', '203507', 1, '0'), 9: TrecQrel('145437', '229691', 1, '0'), 1674: TrecQrel('133937', '27335', 1, '0'), }) self._test_qrels('beir/cqadupstack/stats', count=913, items={ 0: TrecQrel('11546', '66109', 1, '0'), 9: TrecQrel('57083', '91074', 1, '0'), 912: TrecQrel('35719', '35716', 1, '0'), }) self._test_qrels('beir/cqadupstack/tex', count=5154, items={ 0: TrecQrel('197555', '12668', 1, '0'), 9: TrecQrel('89372', '80', 1, '0'), 5153: TrecQrel('84944', '84946', 1, '0'), }) self._test_qrels('beir/cqadupstack/unix', count=1693, items={ 0: TrecQrel('103549', '2677', 1, '0'), 9: TrecQrel('103549', '48253', 1, '0'), 1692: TrecQrel('20536', '17664', 1, '0'), }) self._test_qrels('beir/cqadupstack/webmasters', count=1395, items={ 0: TrecQrel('28994', '53865', 1, '0'), 
9: TrecQrel('11544', '52031', 1, '0'), 1394: TrecQrel('65733', '65118', 1, '0'), }) self._test_qrels('beir/cqadupstack/wordpress', count=744, items={ 0: TrecQrel('120122', '21561', 1, '0'), 9: TrecQrel('114225', '78428', 1, '0'), 743: TrecQrel('90939', '105803', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/c4.py ================================================ import re import unittest from ir_datasets.datasets.c4 import C4Doc, MisinfoQuery from .base import DatasetIntegrationTest class TestCar(DatasetIntegrationTest): def test_docs(self): self._test_docs('c4/en-noclean-tr/trec-misinfo-2021', items={ 0: C4Doc('en.noclean.c4-train.00000-of-07168.0', re.compile('^November 24, 2016 – World News, Breaking News\nWednesday, April 24, 2019\nLatest:\nFitbit introduced “s.{3832}World News, Breaking News\\. All rights reserved\\.\nTheme: ColorMag by ThemeGrill\\. Powered by WordPress\\.$', flags=48), 'http://sevendaynews.com/2016/11/24/', '2019-04-24T16:35:11Z'), 9: C4Doc('en.noclean.c4-train.00000-of-07168.9', re.compile('^Best Books Market\nBest Books Market\nCategories\nBook\nToy\nfree ftp mac client :: :: эффективные средст.{735}e Eleven Rival Regional Cultures of North America\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\nNext\n© 2019 Best Books Market$', flags=48), 'http://books.amzgr.com/', '2019-04-26T05:44:42Z'), 99: C4Doc('en.noclean.c4-train.00000-of-07168.99', re.compile('^Volunteers Needed for Assembly for Children \\| Church of God of Prophecy\n\\+1\\.423\\.559\\.5100 info@cogop\\.o.{2453}ture\nSocial Media\nResources\nAssembly\nTreasurer’s Report\nFacebook\nInstagram\nVimeo\nYoutube\nTwitter\nRSS$', flags=48), 'https://cogop.org/blog/volunteers-needed-for-assembly-for-children/', '2019-04-26T08:07:35Z'), 999: C4Doc('en.noclean.c4-train.00000-of-07168.999', re.compile('^타임폴리오자산운용\nINVESTMENT HIGHLIGHTS\nMulti Manager\nMulti Asset\nMulti Strategy\nTMS\nQAS\nMMS\nCOMPANY\nIntrodu.{300}x\\] 해상도에 
최적화 되어 있습니다\\.\nTel\\. \\(02\\) 533\\-8940\nFax\\. \\(02\\) 534\\-3305\nE\\-mail\\. tf@timefolio\\.co\\.kr\n개인정보처리방침 > TOP$', flags=48), 'http://timefolio.co.kr/gallery/gallery_view.php?num=39&page=1&search_keyword=&search_field=', '2019-04-25T06:54:54Z'), 9999: C4Doc('en.noclean.c4-train.00000-of-07168.9999', re.compile('^Unex Avid Juicy 3/5/7/Ca Metal Ceramic Disc Brake Pad\nJavaScript seems to be disabled in your browse.{10024}rice\\}\\}\nApply\nCancel\n\\{\\{carrier\\.method_title\\}\\}\n\\+ \\{\\{\\$parent\\.currency\\}\\}\\{\\{carrier\\.price\\}\\}\nApply\nCancel\n\\-\\-$', flags=48), 'https://www.bicyclehero.com/us/unex-avid-juicy-3-5-7-ca-metal-ceramic-disc-brake-pad.html', '2019-04-23T16:41:09Z'), 99999: C4Doc('en.noclean.c4-train.00000-of-07168.99999', re.compile("^The truth about SHA1, SHA\\-256, dual\\-signing and Code Signing Certificates : K Software\nWelcome to th.{5597}ouldn't be helpful\\. Help us improve this article with your feedback\\.\nRelated Articles\nHome Solutions$", flags=48), 'https://support.ksoftware.net/support/solutions/articles/215805-the-truth-about-sha1-sha-256-dual-signing-and-code-signing-certificates-', '2019-04-20T09:00:23Z'), 999999: C4Doc('en.noclean.c4-train.00006-of-07168.109537', re.compile('^Results \\- Race Walking Association\nHome \\| Fixtures \\| Results \\| Rankings \\| Athletes \\| Clubs \\| Newslet.{400}und points: 0\n2012: 2 races 2,000 metres completed\\.\n\\(c\\) RACE WALKING ASSOCIATION 1907 \\- 2019 Sitemap$', flags=48), 'http://racewalkingassociation.com/AthleteDetails.asp?mode=edit&id=11300&athlete=Emily_Wyman', '2019-04-25T20:24:19Z'), }) def test_queries(self): self._test_queries('c4/en-noclean-tr/trec-misinfo-2021', count=50, items={ 0: MisinfoQuery('101', re.compile('ankl.*nitis', flags=48), re.compile('Will.*kle bra.*heal ac.*', flags=48), re.compile('Achil.*kle braces, or both.', flags=48), re.compile('We do no.*professional advice.', flags=48), 'unhelpful', 
'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3134723/'), 9: MisinfoQuery('110', re.compile('birt.*tment', flags=48), re.compile('Wil.*trol pil.*arian c.*', flags=48), re.compile('Functi.*lth issues, or both.', flags=48), re.compile('We do no.*professional advice.', flags=48), 'unhelpful', 'https://pubmed.ncbi.nlm.nih.gov/24782304/'), 49: MisinfoQuery('150', re.compile('antiox.*ity', flags=48), re.compile('Wil.*oxida.*ments.*blems.* ', flags=48), re.compile("Coupl.*ether. ", flags=48), re.compile('We do no.*professional advice.', flags=48), 'unhelpful', 'https://pubmed.ncbi.nlm.nih.gov/32851663/'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/car.py ================================================ import re import unittest from ir_datasets.datasets.car import CarQuery from ir_datasets.formats import TrecQrel, GenericDoc from .base import DatasetIntegrationTest class TestCar(DatasetIntegrationTest): def test_docs(self): self._test_docs('car/v1.5', count=29678367, items={ 0: GenericDoc('0000000e7e72cafb61a9f356b7dceb25c5e028db', re.compile("^Ukraine was one of the most dangerous places for journalists in the world during the euromaidan demo.{311}ened in Donetsk in April 2014\\. In July 2014 a firebomb was thrown at the TV channel ''112 Ukraine''\\.$", flags=48)), 9: GenericDoc('000006d5c22f4efbb6b963ea819e976a4b28600b', re.compile('^To mark the 40th anniversary of "Bohemian Rhapsody", the song was released on a limited edition 12" .{174}on CD, DVD \\& Blu\\-ray\\. This includes the first ever live recorded performance of "Bohemian Rhapsody"\\.$', flags=48)), 29678366: GenericDoc('ffffffb9eec6224bef5da06e829eef59a37748c6', re.compile('^Fisher recommended Louis as First Sea Lord: "He is the most capable administrator in the Admiralty\'s.{472}that would prepare the navy\'s plans in case of war\\. 
He was promoted to full admiral on 13 July 1912\\.$', flags=48)), }) self._test_docs('car/v2.0', count=29794697, items={ 0: GenericDoc('00000047dc43083f49b68399c6deeed5c0e81c1f', re.compile('^On 28 October 1943, Fuller sailed from Efate, New Hebrides, for the initial landings on Bougainville.{456}damage, and twice more during the following month and a half carried reinforcements to Bougainville\\.$', flags=48)), 9: GenericDoc('0000070402dbaf074bc1e3ba487036322ef8ce86', re.compile('^In 1662, the then Governor of Jamaica, Lord Windsor, received royal instructions to protect the "Ca.{527} its landward side to five feet on its seaward side, with the walls being about five feet in height\\.$', flags=48)), 29794696: GenericDoc('ffffffb9eec6224bef5da06e829eef59a37748c6', re.compile('^Fisher recommended Louis as First Sea Lord: "He is the most capable administrator in the Admiralty\'s.{472}that would prepare the navy\'s plans in case of war\\. He was promoted to full admiral on 13 July 1912\\.$', flags=48)), }) def test_queries(self): self._test_queries('car/v1.5/trec-y1', count=2287, items={ 0: CarQuery('Fudge/History', 'Fudge History', 'Fudge', ('History',)), 9: CarQuery('Glass%20ceiling/Glass%20Ceiling%20Index', 'Glass ceiling Glass Ceiling Index', 'Glass ceiling', ('Glass Ceiling Index',)), 2286: CarQuery('Global%20catastrophic%20risk/Organizations', 'Global catastrophic risk Organizations', 'Global catastrophic risk', ('Organizations',)), }) self._test_queries('car/v1.5/test200', count=1987, items={ 0: CarQuery('Hog-dog%20rodeo/Typical%20match', 'Hog-dog rodeo Typical match', 'Hog-dog rodeo', ('Typical match',)), 9: CarQuery('Infield%20fly%20rule/The%20rule/Foul%20balls', 'Infield fly rule The rule Foul balls', 'Infield fly rule', ('The rule', 'Foul balls')), 1986: CarQuery('Structural%20information%20theory/Visual%20regularity', 'Structural information theory Visual regularity', 'Structural information theory', ('Visual regularity',)), }) 
self._test_queries('car/v1.5/train/fold0', count=467946, items={ 0: CarQuery('Kindertotenlieder/Text%20and%20music', 'Kindertotenlieder Text and music', 'Kindertotenlieder', ('Text and music',)), 9: CarQuery('Northrop%20YB-35/Variants', 'Northrop YB-35 Variants', 'Northrop YB-35', ('Variants',)), 467945: CarQuery('1987%E2%80%9388%20Greek%20Cup/Final', '1987–88 Greek Cup Final', '1987–88 Greek Cup', ('Final',)), }) self._test_queries('car/v1.5/train/fold1', count=466596, items={ 0: CarQuery('Roderick%20Spode/Overview', 'Roderick Spode Overview', 'Roderick Spode', ('Overview',)), 9: CarQuery('Alan%20Hale%20Jr./Personal%20life', 'Alan Hale Jr. Personal life', 'Alan Hale Jr.', ('Personal life',)), 466595: CarQuery('Brian%20Eno/Personal%20life%20and%20beliefs', 'Brian Eno Personal life and beliefs', 'Brian Eno', ('Personal life and beliefs',)), }) self._test_queries('car/v1.5/train/fold2', count=469323, items={ 0: CarQuery('Lost%20in%20Space%20(film)/Plot', 'Lost in Space (film) Plot', 'Lost in Space (film)', ('Plot',)), 9: CarQuery('Dick%20&%20Dom%20in%20da%20Bungalow/Bungalow%20Games/Forfeit%20Auction', 'Dick & Dom in da Bungalow Bungalow Games Forfeit Auction', 'Dick & Dom in da Bungalow', ('Bungalow Games', 'Forfeit Auction')), 469322: CarQuery('Erick%20van%20Egeraat/Awards%20and%20recognition', 'Erick van Egeraat Awards and recognition', 'Erick van Egeraat', ('Awards and recognition',)), }) self._test_queries('car/v1.5/train/fold3', count=463314, items={ 0: CarQuery('Bradford,%20Ontario/History', 'Bradford, Ontario History', 'Bradford, Ontario', ('History',)), 9: CarQuery('CBBC/Scheduling', 'CBBC Scheduling', 'CBBC', ('Scheduling',)), 463313: CarQuery('Br%C3%BCel%20&%20Kj%C3%A6r/Organisational%20developments', 'Brüel & Kjær Organisational developments', 'Brüel & Kjær', ('Organisational developments',)), }) self._test_queries('car/v1.5/train/fold4', count=468789, items={ 0: CarQuery('Status%20symbol/By%20region%20and%20time', 'Status symbol By region and time', 
'Status symbol', ('By region and time',)), 9: CarQuery('History%20of%20Greece/Ancient%20Greece%20(1100%E2%80%93146%20BC)/Iron%20Age%20(1100%E2%80%93800%20BC)', 'History of Greece Ancient Greece (1100–146 BC) Iron Age (1100–800 BC)', 'History of Greece', ('Ancient Greece (1100–146 BC)', 'Iron Age (1100–800 BC)')), 468788: CarQuery('Manchester%20International%20Organ%20Competition/1986%20-%20Fifth%20competition', 'Manchester International Organ Competition 1986 - Fifth competition', 'Manchester International Organ Competition', ('1986 - Fifth competition',)), }) def test_qrels(self): self._test_qrels('car/v1.5/trec-y1/auto', count=5820, items={ 0: TrecQrel('Aftertaste/Aftertaste%20processing%20in%20the%20cerebral%20cortex', '38c1bd25ddca2705164677a3f598c46df85afba7', 1, '0'), 9: TrecQrel('Aftertaste/Temporal%20taste%20perception', '8a41a87100d139bb9c108c8cab2ac3baaabea3ce', 1, '0'), 5819: TrecQrel('Yellowstone%20National%20Park/Recreation', 'e80b5185da1493edde41bea19a389a3f62167369', 1, '0'), }) self._test_qrels('car/v1.5/trec-y1/manual', count=29571, items={ 0: TrecQrel('Hadley%20cell/Hadley%20cell%20expansion', '389c8a699f4db2f0278700d1c32e63ac369906cd', -1, '0'), 9: TrecQrel('Water%20cycle/Effects%20on%20biogeochemical%20cycling', '844a0a0d5860ff1da8a9fcfb16cc4ce04ffb963f', 1, '0'), 29570: TrecQrel('Rancidification/Reducing%20rancidification', '20a4e9af2853803a08854a1cc8973534e2235658', -1, '0'), }) self._test_qrels('car/v1.5/test200', count=4706, items={ 0: TrecQrel('ASME/ASME%20codes%20and%20standards', '16d8f62407d2cdd283a71735e5c83f7d7947b93a', 1, '0'), 9: TrecQrel('Activity%20theory/An%20explanation', 'c0ee784b8f0eb3b80aaf85f42d5148655192cc1d', 1, '0'), 4705: TrecQrel('Zang-fu/Yin/yang%20and%20the%20Five%20Elements', 'fe6f4dd186037e09bf00f0f08bf172babac7930b', 1, '0'), }) self._test_qrels('car/v1.5/train/fold0', count=1054369, items={ 0: TrecQrel("$pread/''$pread''%20Book", '2f545ffad1581dea4a2e4720aa9feb7389e1956a', 1, '0'), 9: 
TrecQrel('%22Wild%20Bill%22%20Hickok/Death/Burial', '528b68a3355672c9b8bd5003428b72f54074b3fb', 1, '0'), 1054368: TrecQrel('Zygmunt%20Szcz%C4%99sny%20Feli%C5%84ski/Views%20on%20Poland', 'fd77154f625ca721e554cbd0e4f33b51d4d92af6', 1, '0'), }) self._test_qrels('car/v1.5/train/fold1', count=1052398, items={ 0: TrecQrel('$100,000%20infield/Eddie%20Collins', 'c7aa3c7821a112a149d85f650cbca4ec23c63617', 1, '0'), 9: TrecQrel("%60Abdu'l-Bah%C3%A1/Acre/Marriage%20and%20family%20life", '4da4ea634ccae1173e553129b368e95962969ec8', 1, '0'), 1052397: TrecQrel('Zygosity/Types/Nullizygous', '36186e2655db62fd9c31701302f86636b03d2511', 1, '0'), }) self._test_qrels('car/v1.5/train/fold2', count=1061162, items={ 0: TrecQrel("$h*!%20My%20Dad%20Says/''Surviving%20Jack''", 'dc4866e5b230ffb48b6f808f41ccf8063fbdc9fa', 1, '0'), 9: TrecQrel('%22Left-Wing%22%20Communism:%20An%20Infantile%20Disorder/%22Left-wing%22%20communism%20in%20Germany', '22ec581e3e1c5397e64bc6f0066dc8aea12fc71f', 1, '0'), 1061161: TrecQrel('ZynAddSubFX/Windows%20version', 'b9d1be10b54e5efcbf3e6f1e5f2fbaf7c8af303c', 1, '0'), }) self._test_qrels('car/v1.5/train/fold3', count=1046784, items={ 0: TrecQrel('$2%20billion%20arms%20deal/Confessional%20statements', '0e512b5962fa5ea838a578cbf414ae09b863a33f', 1, '0'), 9: TrecQrel('$2%20billion%20arms%20deal/Investigative%20committee', '812cb64a35f482bd60f82c1d67204c73612cb6a7', 1, '0'), 1046783: TrecQrel('Zyuden%20Sentai%20Kyoryuger/Video%20game', '844b90cf6f7c62e5bf51625a4d216baec2825bf9', 1, '0'), }) self._test_qrels('car/v1.5/train/fold4', count=1061911, items={ 0: TrecQrel('$1,000%20genome/Additional%20Resources', '67ea5eae967657a8f0282066e3086573e41726d5', 1, '0'), 9: TrecQrel('$1,000%20genome/Commercial%20efforts', 'a7ac9041cd833d6b09cc5270b495e9f94704027f', 1, '0'), 1061910: TrecQrel('Zyron/Products', 'f355f98b4e3d5b08f60abe61022e9393202b9718', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: 
test/integration/clinicaltrials.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.clinicaltrials import ClinicalTrialsDoc from ir_datasets.datasets.medline import TrecPmQuery, TrecPm2017Query from ir_datasets.formats import GenericQuery, TrecQrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestClinicalTrials(DatasetIntegrationTest): def test_docs(self): self._test_docs('clinicaltrials/2017', count=241006, items={ 0: ClinicalTrialsDoc('NCT00530868', 'Comparing Letrozole Given Alone to Letrozole Given With Avastin in Post-Menopausal Women Breast Cancer', '', re.compile('^\n \n This purpose of this trial is to show that the combination of Avastin and hormone therap.{3} should be more effective than hormone therapy alone for the treatment of breast cancer\\.\n \n $', flags=48), re.compile('^\n \n Preclinical and clinical data have demonstrated that up\\-regulation of tumor cell VEGF is.{329}ould be more effective than hormonal therapy alone for the\n treatment of breast cancer\\.\n \n $', flags=48), re.compile('^\n \n Inclusion Criteria:\n\n All patients must meet the following criteria to be eli.{4158}and carcinoma in\\-situ of uterine cervix\\.\n\n \\- Patients with metastatic disease\\.\n \n $', flags=48)), 9: ClinicalTrialsDoc('NCT00530101', 'The Magnetic Resonance Imaging Evaluation of Doxorubicin Cardiotoxicity', '', re.compile('^\n \n The purpose of this research study is to evaluate MR imaging in subjects receiving\n .{166}ely 10 subjects over 12 months at the\n University of Miami / Miller School of Medicine\\.\n \n $', flags=48), re.compile('^\n \n Doxorubicin \\(Adriamycin\\) is one of the most widely used chemotherapy agents, despite its.{9660}ocardium\\.\n\n Medical records will provide data regarding cardiac morbidity or mortality\\.\n \n $', flags=48), re.compile('^\n \n Inclusion Criteria:\n\n \\- Subject must have breast cancer and undergoing 
rad.{112} \\- Healthy subjects\n\n \\- Males\n\n \\- Subjects under the age of 18\n \n $', flags=48)), 241005: ClinicalTrialsDoc('NCT00074646', 'Phase I Trial of CC-8490 for the Treatment of Subjects With Recurrent/Refractory High-Grade Gliomas', '', '\n \n Phase I trial of CC-8490 for the treatment of subjects with recurrent/refractory high-grade\n gliomas\n \n ', '', re.compile('^\n \n Inclusion Criteria:\n\n \\- Patients with glioblastoma multiforme \\(GBM\\), glios.{4283}lism\\.\n\n \\- Use of other experimental study drug within 28 days of registration\\.\n \n $', flags=48)), }) self._test_docs('clinicaltrials/2019', count=306238, items={ 0: ClinicalTrialsDoc('NCT00704457', 'Impact Of Sacral Neuromodulation On Urine Markers For Interstitial Cystitis (IC)', '', '\n \n Urine will be collected and sent to the University of Maryland. Urines will be analyzed for\n urine markers.\n \n ', re.compile('^\n \n Urine will be collected and flash frozen in liquid nitrogen then placed in a \\-70 C freez.{376} in urine\n marker levels will be analyzed and correlated with change in symptom scores\\.\n \n $', flags=48), '\n \n Inclusion Criteria:\n\n - Patients will be drawn from Dr. 
Peters patient base that covers Southeast Michigan.\n\n Exclusion Criteria:\n\n - Male\n \n '), 9: ClinicalTrialsDoc('NCT00705887', 'A Motivational Enhancement Approach to Skin Cancer Prevention', '', re.compile('^\n \n The specific aims of this research are:\n\n Aim 1 \\- To describe the UV protection beh.{579} protection stages of change, UV protection self\\-efficacy, and UV protection attitudes\\.\n \n $', flags=48), re.compile('^\n \n Although skin cancer is the most common form of cancer in the United States, it is highl.{806}nvestigated the application of these techniques to\n skin cancer prevention discussions\\.\n \n $', flags=48), re.compile('^\n \n Inclusion Criteria:\n\n \\- Dermatology patient presenting for scheduled appoi.{157}lish\n\n \\- Having previously received medical treatment from the interventionist\n \n $', flags=48)), 306237: ClinicalTrialsDoc('NCT03548415', 'Safety, Tolerability, and Efficacy of IONIS-GHR-LRx in up to 42 Adult Patients With Acromegaly Being Treated With Long-acting Somatostatin Receptor Ligands', '', '\n \n The purpose is to assess the Safety, Tolerability, and Efficacy of IONIS-GHR-LRx in up to 42\n Patients with Acromegaly\n \n ', re.compile('^\n \n This short\\-term study will assess changes in serum insulin\\-like growth factor 1 \\(IGF\\-1\\) .{68}sed with Acromegaly being treated\n with Long\\-acting Somatostatin Receptor Ligands \\(SRL\\)\n \n $', flags=48), re.compile('^\n \n Inclusion Criteria:\n\n 1\\. 
Males or females with documented diagnosis of Acro.{2002} stable dose and regimen for >= 3 months prior to screening and throughout the trial\n \n $', flags=48)), }) self._test_docs('clinicaltrials/2021', count=375580, items={ 0: ClinicalTrialsDoc('NCT00000102', 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets', '', re.compile('^\n \n This study will test the ability of extended release nifedipine \\(Procardia XL\\), a blood\\\r.{66}ucocorticoid medication children\\\r\n take to treat congenital adrenal hyperplasia \\(CAH\\)\\.\\\r\n \n $', flags=48), re.compile('^\n \n This protocol is designed to assess both acute and chronic effects of the calcium channe.{716}e would, in turn, reduce the deleterious effects of glucocorticoid\\\r\n treatment in CAH\\.\\\r\n \n $', flags=48), re.compile('^\n \n Inclusion Criteria:\\\r\n\\\r\n \\- diagnosed with Congenital Adrenal Hyperplasia \\(C.{126}ase, or elevated liver function tests\\\r\n\\\r\n \\- history of cardiovascular disease\\\r\n \n $', flags=48)), 9: ClinicalTrialsDoc('NCT00000113', 'Correction of Myopia Evaluation Trial (COMET)', '', re.compile('^\n \n To evaluate whether progressive addition lenses \\(PALs\\) slow the rate of progression of\\\r\n.{291}opia in a group of children receiving\\\r\n conventional treatment \\(single vision lenses\\)\\.\\\r\n \n $', flags=48), re.compile('^\n \n Myopia \\(nearsightedness\\) is an important public health problem, which entails substantia.{3027}e secondary outcome of the study is axial length measured by A\\-scan\\\r\n ultrasonography\\.\\\r\n \n $', flags=48), re.compile('^\n \n Children between the ages of 6 and 12 years with myopia in both eyes \\(defined as sph.{484}ssive\\\r\n addition lenses, or any conditions precluding adherence to the protocol\\.\\\r\n \n $', flags=48)), 375579: ClinicalTrialsDoc('NCT04862312', 'Video Chat During Meals to Improve Nutritional Intake in Older Adults', '', re.compile('^\n \n The VideoDining study is a Stage IB 
behavioral intervention development project\\. The\\\r\n .{184}to\\\r\n evaluate changes in nutritional intake and loneliness in response to VideoDining\\.\\\r\n \n $', flags=48), re.compile('^\n \n The U\\.S\\. population is growing older and more adults are aging at home alone, by choice,.{1998}efinement of the\\\r\n VideoDining intervention are additional key outcomes of this study\\.\\\r\n \n $', flags=48), re.compile('^\n \n Inclusion Criteria:\\\r\n\\\r\n 1\\. Receive Meals\\-on\\-Wheels meals from Foodnet in To.{370} and participate in the study\\.\\\r\n\\\r\n 5\\. Already own and use an Amazon Echo Show\\.\\\r\n \n $', flags=48)), }) def test_queries(self): self._test_queries('clinicaltrials/2017/trec-pm-2017', count=30, items={ 0: TrecPm2017Query('1', 'Liposarcoma', 'CDK4 Amplification', '38-year-old male', 'GERD'), 9: TrecPm2017Query('10', 'Lung adenocarcinoma', 'KRAS (G12C)', '61-year-old female', 'Hypertension, Hypercholesterolemia'), 29: TrecPm2017Query('30', 'Pancreatic adenocarcinoma', 'RB1, TP53, KRAS', '57-year-old female', 'None'), }) self._test_queries('clinicaltrials/2017/trec-pm-2018', count=50, items={ 0: TrecPmQuery('1', 'melanoma', 'BRAF (V600E)', '64-year-old male'), 9: TrecPmQuery('10', 'melanoma', 'KIT (L576P)', '65-year-old female'), 49: TrecPmQuery('50', 'acute myeloid leukemia', 'FLT3', '13-year-old male'), }) self._test_queries('clinicaltrials/2019/trec-pm-2019', count=40, items={ 0: TrecPmQuery('1', 'melanoma', 'BRAF (E586K)', '64-year-old female'), 9: TrecPmQuery('10', 'mucosal melanoma', 'KIT (L576P), KIT amplification', '62-year-old female'), 39: TrecPmQuery('40', 'malignant hyperthermia', 'RYR1', '54-year-old male'), }) self._test_queries('clinicaltrials/2021/trec-ct-2021', count=75, items={ 0: GenericQuery('1', '\nPatient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic 
pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. \n'), 9: GenericQuery('10', "\nPt is a 22yo F otherwise healthy with a 5 yr history of the systemic mastocytosis, with flares normally 3/year, presenting with flushing and tachycardia concerning for another flare. This is patient's 3rd flare in 2 months, while still on steroid taper which is new for her. She responded well to 125 mg IV steroids q 8 hrs and IV diphenydramine in addition to her continuing home regimen. CBC was at her baseline, w/normal differential. Serum tryptase revealed a high value at 84. The patient failed aspirin challenge due to adverse reaction. She was stabilized on IV steroids and IV benadryl and transferred back to the medical floor. She continued on her home histamine receptor blockers and was transitioned from IV to PO steroids and benadryl and observed overnight and was discharged on her home meds, prednisone taper, GI prophylaxis with PPI, Calcium and vitamin D, and SS bactrim for PCP.\n"), 74: GenericQuery('75', "\nThe patient is a 55-year-old man who was recently diagnosed with Parkinson's disease. He is complaining of slowness of movement and tremors. His disease is ranked as mild, Hoehn-Yahr Stage I. His past medical history is significant for hypertension and hypercholesterolemia. He lives with his wife. They have three children. He used to be active with gardening before his diagnosis. He complains of shaking and slow movement. 
He had difficulty entering through a door, as he was frozen and needed guidance to step in. His handwriting is getting smaller. He is offered Levodopa and Trihexyphenidyl. He is an alert and cooperative man who does not have any signs of dementia. He does not smoke or use any illicit drugs.\n"), }) self._test_queries('clinicaltrials/2021/trec-ct-2022', count=50, items={ 0: GenericQuery('1', '\nA 19-year-old male came to clinic with some sexual concern. He recently engaged in a relationship and is worried about the satisfaction of his girlfriend. He has a "baby face" according to his girlfriend\'s statement and he is not as muscular as his classmates. On physical examination, there is some pubic hair and poorly developed secondary sexual characteristics. He is unable to detect coffee smell during the examination, but the visual acuity is normal. Ultrasound reveals the testes volume of 1-2 ml. The hormonal evaluation showed serum testosterone level of 65 ng/dL with low levels of GnRH.\n'), 9: GenericQuery('10', '\nA 19-year-old girl comes to the clinic due to a left wrist mass. She noticed swelling on the top of her wrist about 4 months ago and came to the clinic due to cosmetic concerns. Examination shows a nontender, rounded mass on the dorsal wrist that transilluminates with a penlight. Vital signs are normal. The patient needs to type on her computer almost all day. She is left-handed. She does not smoke or use illicit drugs. She is in sexual relationship with two male partners and uses condoms. \n'), 49: GenericQuery('50', '\nA 70-year-old man comes to the office accompanied by his wife. The patient has experienced progressive memory loss over the last years. He needs help with some of his routine activities, such as paying bills. The patient\'s wife says, "He used to be such an independent person, but now he needs help with many things, even finding direction to home!" Medical history includes hypertension, hyperlipidemia, and type 2 diabetes mellitus. 
Family history includes Alzheimer disease in his father. MRI reveals diffuse cortical and hippocampal atrophy. The diagnosis of AD is made using the National Institute on Aging and the Alzheimer\'s Association (NIA-AA) criteria.\n'), }) def test_qrels(self): self._test_qrels('clinicaltrials/2017/trec-pm-2017', count=13019, items={ 0: TrecQrel('1', 'NCT00001188', 0, '0'), 9: TrecQrel('1', 'NCT00002898', 0, '0'), 13018: TrecQrel('30', 'NCT03080974', 0, '0'), }) self._test_qrels('clinicaltrials/2017/trec-pm-2018', count=14188, items={ 0: TrecQrel('1', 'NCT00001452', 0, '0'), 9: TrecQrel('1', 'NCT00341991', 0, '0'), 14187: TrecQrel('50', 'NCT03096782', 0, '0'), }) self._test_qrels('clinicaltrials/2019/trec-pm-2019', count=12996, items={ 0: TrecQrel('1', 'NCT00001685', 0, '0'), 9: TrecQrel('1', 'NCT00119249', 1, '0'), 12995: TrecQrel('40', 'NCT03955640', 0, '0'), }) self._test_qrels('clinicaltrials/2021/trec-ct-2021', count=35832, items={ 0: TrecQrel('1', 'NCT00002569', 1, '0'), 9: TrecQrel('1', 'NCT00003466', 0, '0'), 35831: TrecQrel('75', 'NCT04858074', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/clirmatrix.py ================================================ import re import unittest import ir_datasets from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() # Note: there's > 100k combinations here, so we are only testing a few cases class TestCLIRMatrix(DatasetIntegrationTest): def test_docs(self): self._test_docs('clirmatrix/af', count=87705, items={ 0: GenericDoc('123393', 'Weeskindertjies (plant) weeskind'), 9: GenericDoc('14515', re.compile('^Die Groot Beer \\(Latyn: Ursa Major\\) is ’n sterrebeeld wat heeljaar in die Noordelike Halfrond sigbaar.{873}8\xa0mag\\. 
47\xa0Ursae Majoris het twee bevestigde planete, wat 2,54 en 0,76 keer die massa van Jupiter is\\.$', flags=48)), 87704: GenericDoc('18801', re.compile('^Die Suid\\-Afrikaanse Leër is die landmagkomponent van die Suid\\-Afrikaanse Nasionale Weermag en van sy.{964}Amptelike webwerf Hierdie artikel is ’n saadjie\\. Voel vry om Wikipedia te help deur dit uit te brei\\.$', flags=48)), }) self._test_docs('clirmatrix/en', count=5984197, items={ 0: GenericDoc('4274592', re.compile('^Transtar was the model name given to the line of trucks produced by the Studebaker Corporation of So.{910}asons, the Transtar name was dropped for the 1959 4E series Studebaker trucks and changed to Deluxe\\.$', flags=48)), 9: GenericDoc('23065547', re.compile('^Standard sea\\-level conditions \\(SSL\\), also known as sea\\-level standard \\(SLS\\), defines a set of atmosp.{827}orda, Introduction to Aerospace Engineering with a Flight Test Perspective, John Wiley \\& Sons, 2017\\.$', flags=48)), 5984196: GenericDoc('2160901', re.compile('^Resentment \\(also called ranklement or bitterness\\) is a complex, multilayered emotion that has been d.{1021}of by others; and having achievements go unrecognized, while others succeed without working as hard\\.$', flags=48)), }) self._test_docs('clirmatrix/simple', count=153408, items={ 0: GenericDoc('12559', re.compile('^A superlative, in grammar, is an adjective describing a noun that is the best example of a given qua.{684}the adverb "most" before the adjective\\. For instance, you do not say "funnest," or "interestingest"\\.$', flags=48)), 9: GenericDoc('120355', re.compile('^Occult refers to an area of knowledge or thought that is hidden\\. The word occult has many uses in th.{1069}pretation of Hinduism within Theosophy or the various occult interpretations of the Jewish Kabbalah\\.$', flags=48)), 153407: GenericDoc('54463', re.compile('^The history of the Christian religion and the Christian church began with Jesus and his apostles\\. Ch.{934}t\\. 
Peter, was that they did not, and the matter was further addressed with the Council of Jerusalem\\.$', flags=48)), }) self._test_docs('clirmatrix/zh', count=1089043, items={ 0: GenericDoc('449241', '虿盆,商朝时酷刑之一。将作弊官人跣剥干净,送下坑中,餵毒蛇、毒蝎等物。相传商朝最后一任君主纣王曾在大将黄飞虎之妻与纣王之妃子苏妲己发生口角之后将其推下虿盆,令其惨死。此刑罚在历史上使用较少。'), 9: GenericDoc('664068', re.compile('^篡位是一個貶义詞,即不合法或有爭議地取得王位\\(皇位\\)。包括殺上任皇帝/太子/廢立/逼迫上現任皇帝或君主交出皇位 在非君主制语境下,亦可泛指非法谋夺更高权力的行为(例如違反憲法而推行独裁,或在權限以外越.{29}为在元武宗\\(1307年\\)至元寧宗\\(1332年\\)的25年間,竟然換了八個皇帝,当中有三位皇帝\\(元天順帝、元明宗、元寧宗\\)在位時間甚至不足一年。 在同一王朝中通过杀害或逼退合法继承人或在位者的篡位者 政变$', flags=48)), 1089042: GenericDoc('6844113', re.compile('^谷風隧道為台灣的一條公路隧道,屬「台9線蘇花公路山區路段改善計劃」\\(蘇花改\\)南澳\\~和平段的其中一座隧道,北起鼓音橋,南接漢本高架橋,它穿越中央山脈鼓音溪至花蓮縣漢本的山區。谷風隧道南下及北上線均為45.{425}作、避難聯絡通道襯砌、通風隔板施作、新建通風機房,此外還須在避難聯絡通道內安裝照明系統及通訊設備,主隧道亦須安裝隧道照明燈具結線,安裝水霧支管,安裝噴流風機,此外隧道的所有土建工程及機電工程同步施工。$', flags=48)), }) def test_queries(self): self._test_queries('clirmatrix/af/bi139-base/en/train', count=9999, items={ 0: GenericQuery('690', 'Aruba'), 9: GenericQuery('5615', 'Cretaceous'), 9998: GenericQuery('62732112', 'Efrain Gusquiza'), }) self._test_queries('clirmatrix/af/bi139-base/en/dev', count=1000, items={ 0: GenericQuery('2038', 'August Horch'), 9: GenericQuery('77606', 'Charles VIII of France'), 999: GenericQuery('62708410', '2020 in Morocco'), }) self._test_queries('clirmatrix/af/bi139-base/en/test1', count=1000, items={ 0: GenericQuery('3649', 'Geography of the British Virgin Islands'), 9: GenericQuery('107443', 'Coalinga, California'), 999: GenericQuery('62716625', 'Kevin Hall (disambiguation)'), }) self._test_queries('clirmatrix/af/bi139-base/en/test2', count=1000, items={ 0: GenericQuery('6011', 'Chomsky hierarchy'), 9: GenericQuery('97597', 'Flag of San Marino'), 999: GenericQuery('62707449', 'Machiel Kiel'), }) self._test_queries('clirmatrix/en/bi139-base/af/train', count=10000, items={ 0: GenericQuery('3', 'Lys van Afrikaanse skrywers'), 9: GenericQuery('95', 'Geskiedenis'), 9999: GenericQuery('285953', 'Jean-Claude Casadesus'), }) 
self._test_queries('clirmatrix/en/bi139-full/af/train', count=58745, items={ 0: GenericQuery('3', 'Lys van Afrikaanse skrywers'), 9: GenericQuery('26', 'Benue-Kongo-tale'), 58744: GenericQuery('286010', 'Lugmag van die Volksbevrydingsleër'), }) self._test_queries('clirmatrix/en/multi8/fr/train', count=10000, items={ 0: GenericQuery('45187', 'Mort'), 9: GenericQuery('7740', 'Lituanie'), 9999: GenericQuery('28573', 'Chiffres arabes'), }) self._test_queries('clirmatrix/fr/multi8/en/train', count=10000, items={ 0: GenericQuery('8221', 'Death'), 9: GenericQuery('17675', 'Lithuania'), 9999: GenericQuery('1786', 'Arabic numerals'), }) self._test_queries('clirmatrix/de/multi8/en/train', count=10000, items={ 0: GenericQuery('8221', 'Death'), 9: GenericQuery('17675', 'Lithuania'), 9999: GenericQuery('1786', 'Arabic numerals'), }) def test_qrels(self): self._test_qrels('clirmatrix/af/bi139-base/en/train', count=999900, items={ 0: TrecQrel('690', '14013', 6, '0'), 9: TrecQrel('690', '15050', 0, '0'), 999899: TrecQrel('62732112', '259879', 0, '0'), }) self._test_qrels('clirmatrix/af/bi139-base/en/dev', count=100000, items={ 0: TrecQrel('2038', '13762', 3, '0'), 9: TrecQrel('2038', '272786', 0, '0'), 99999: TrecQrel('62708410', '258719', 0, '0'), }) self._test_qrels('clirmatrix/af/bi139-base/en/test1', count=100000, items={ 0: TrecQrel('3649', '50129', 5, '0'), 9: TrecQrel('3649', '93300', 0, '0'), 99999: TrecQrel('62716625', '140128', 0, '0'), }) self._test_qrels('clirmatrix/af/bi139-base/en/test2', count=100000, items={ 0: TrecQrel('6011', '11475', 6, '0'), 9: TrecQrel('6011', '69338', 0, '0'), 99999: TrecQrel('62707449', '112726', 0, '0'), }) self._test_qrels('clirmatrix/en/bi139-base/af/train', count=1000000, items={ 0: TrecQrel('3', '1617690', 5, '0'), 9: TrecQrel('3', '3943287', 3, '0'), 999999: TrecQrel('285953', '43443609', 0, '0'), }) self._test_qrels('clirmatrix/en/bi139-full/af/train', count=3011938, items={ 0: TrecQrel('3', '1617690', 5, '0'), 9: TrecQrel('3', 
'3943287', 3, '0'), 3011937: TrecQrel('286010', '400853', 1, '0'), }) self._test_qrels('clirmatrix/en/multi8/fr/train', count=1000000, items={ 0: TrecQrel('45187', '49703357', 5, '0'), 9: TrecQrel('45187', '12161221', 3, '0'), 999999: TrecQrel('28573', '40255894', 0, '0'), }) self._test_qrels('clirmatrix/fr/multi8/en/train', count=1000000, items={ 0: TrecQrel('8221', '45187', 6, '0'), 9: TrecQrel('8221', '1331378', 4, '0'), 999999: TrecQrel('1786', '9567503', 0, '0'), }) self._test_qrels('clirmatrix/de/multi8/en/train', count=1000000, items={ 0: TrecQrel('8221', '5204', 6, '0'), 9: TrecQrel('8221', '1092811', 4, '0'), 999999: TrecQrel('1786', '10264293', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/clueweb09.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.clueweb09 import TrecWebTrackQuery, TrecPrel from ir_datasets.formats import TrecQrel, TrecSubtopic, GenericDoc, GenericQuery, WarcDoc from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestClueWeb09(DatasetIntegrationTest): def test_clueweb09_docs(self): self._test_docs('clueweb09', items={ 0: WarcDoc('clueweb09-ar0000-00-00000', 'http://0098shop.com/product_"EH24_A\'1C3_Forex_(\'2\'131E\'�G.html', '2009-03-84T15:35:08-0700', re.compile(b'^HTTP/1\\.1 200 OK\nServer: Apache/2\\.2\\.11 \\(Unix\\) mod_ssl/2\\.2\\.11 OpenSSL/0\\.9\\.8b DAV/2 mod_auth_passthroug.{92}\nConnection: close\nContent\\-Type: text/html\nDate: Fri, 27 Feb 2009 16:04:39 GMT\nContent\\-Length: 38889$', flags=16), re.compile(b'^<meta httpequiv=Content\\-Type content="text/html; charset=utf\\-8"><meta httpequiv=Content\\-Type content.{38691}m3d\'\\);\\\r\n</SCRIPT>\\\r\n</span>\n\\\t<span id=\'HAM3D_counter\' class=\'HAM3D_hidden\'></span>\n</body>\n</html>\n\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-ar0000-00-00009', 
'http://00perdomain.com/kids_and_teens/international/arabic/', '2009-03-84T15:35:08-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Mon, 09 Feb 2009 12:41:10 GMT\nPragma: n.{145} sid=iorj3059uaka0isae61uh29494; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 28444$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{28246}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-ar0000-00-01000', 'http://213.136.192.26/311276/ln59d.htm', '2009-03-84T15:35:13-0700', re.compile(b'^HTTP/1\\.1 200 OK\nAccept\\-Ranges: bytes\nContent\\-Type: text/html\nDate: Wed, 28 Jan 2009 18:47:22 GMT\nHos.{85}: close\nLast\\-Modified: Thu, 08 Jan 2009 00:28:36 GMT\nETag: "548bbb2871c91:a76"\nContent\\-Length: 65536$', flags=16), re.compile(b'^<html>\\\r\n <head>\\\r\n <META http\\-equiv="Content\\-Type" content="text/html; charset=windows\\-1256">\\\r\n .{65338}\n <font face="Simplified Arabic" color="\\#4D5064">\\\r\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/ar', items={ 0: WarcDoc('clueweb09-ar0000-00-00000', 'http://0098shop.com/product_"EH24_A\'1C3_Forex_(\'2\'131E\'�G.html', '2009-03-84T15:35:08-0700', re.compile(b'^HTTP/1\\.1 200 OK\nServer: Apache/2\\.2\\.11 \\(Unix\\) mod_ssl/2\\.2\\.11 OpenSSL/0\\.9\\.8b DAV/2 mod_auth_passthroug.{92}\nConnection: close\nContent\\-Type: text/html\nDate: Fri, 27 Feb 2009 16:04:39 GMT\nContent\\-Length: 38889$', flags=16), re.compile(b'^<meta httpequiv=Content\\-Type content="text/html; charset=utf\\-8"><meta httpequiv=Content\\-Type content.{38691}m3d\'\\);\\\r\n</SCRIPT>\\\r\n</span>\n\\\t<span id=\'HAM3D_counter\' class=\'HAM3D_hidden\'></span>\n</body>\n</html>\n\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-ar0000-00-00009', 
'http://00perdomain.com/kids_and_teens/international/arabic/', '2009-03-84T15:35:08-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Mon, 09 Feb 2009 12:41:10 GMT\nPragma: n.{145} sid=iorj3059uaka0isae61uh29494; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 28444$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{28246}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-ar0000-00-01000', 'http://213.136.192.26/311276/ln59d.htm', '2009-03-84T15:35:13-0700', re.compile(b'^HTTP/1\\.1 200 OK\nAccept\\-Ranges: bytes\nContent\\-Type: text/html\nDate: Wed, 28 Jan 2009 18:47:22 GMT\nHos.{85}: close\nLast\\-Modified: Thu, 08 Jan 2009 00:28:36 GMT\nETag: "548bbb2871c91:a76"\nContent\\-Length: 65536$', flags=16), re.compile(b'^<html>\\\r\n <head>\\\r\n <META http\\-equiv="Content\\-Type" content="text/html; charset=windows\\-1256">\\\r\n .{65338}\n <font face="Simplified Arabic" color="\\#4D5064">\\\r\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/zh', items={ 0: WarcDoc('clueweb09-zh0000-00-00000', 'http://000027.istock.jrj.com.cn/', '2009-03-77T19:52:11-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=GBK\nContent\\-Encoding: gzip\nVary: Accept\\-Encoding\nDa.{63}ose\nSet\\-Cookie: JSESSIONID=aWnxiSnSp6rg; path=/\nContent\\-Language: zh\\-CN, zh\\-CN\nContent\\-Length: 65536$', flags=16), re.compile(b'^\\\r\n\\\r\n\\\r\n\\\r\n\\\r\n\\\r\n<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xht.{65338}\\.com\\.cn/component/editor/imgs/B012\\.gif" height="30" width="30" onclick="oblog_InsertImg\\(this\\.src\\)"\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-zh0000-00-00009', 
'http://000078.istock.jrj.com.cn/forum000078/topic1001345.html', '2009-03-77T19:52:11-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=GBK\nContent\\-Encoding: gzip\nVary: Accept\\-Encoding\nDa.{63}ose\nSet\\-Cookie: JSESSIONID=bbPnYVFWx8D6; path=/\nContent\\-Language: zh\\-CN, zh\\-CN\nContent\\-Length: 26075$', flags=16), re.compile(b'^\\\r\n\\\r\n\\\r\n\\\r\n\\\r\n<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml.{25877}lt="" />\\\r\n</body>\\\r\n<script src="http://istock\\.jrj\\.com\\.cn/includes/js/tongji\\.js"></script>\\\r\n</html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-zh0000-00-01000', 'http://119go.com/shop/index.htm', '2009-03-77T19:52:15-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nAccept\\-Ranges: bytes\nDate: Fri, 09 Jan 2009 21:54:09 GMT\nSer.{56} close\nLast\\-Modified: Fri, 19 Sep 2008 01:50:00 GMT\nETag: "054996fa19c91:337a"\nContent\\-Length: 18005$', flags=16), re.compile(b'^<!DOCTYPE HTML PUBLIC "\\-//W3C//DTD HTML 4\\.01 Transitional//EN"\\\r\n"http://www\\.w3\\.org/TR/html4/loose\\.dt.{17807}UA\\-105821\\-1";urchinTracker\\(\\);</script><!\\-\\-\\#include virtual="/include/ads\\.asp"\\-\\-></body>\\\r\n</html>\\\r\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/en', items={ 0: WarcDoc('clueweb09-en0000-00-00000', 'http://00000-nrt-realestate.homepagestartup.com/', '2009-03-65T08:43:19-0800', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nDate: Tue, 13 Jan 2009 18:05:10 GMT\nPragma: no\\-cache\nCache\\-C.{100}Modified: Tue, 13 Jan 2009 18:05:10 GMT\nExpires: Mon, 20 Dec 1998 01:00:00 GMT\nContent\\-Length: 16254$', flags=16), re.compile(b'^<head> <meta http\\-equiv="Content\\-Language" content="en\\-gb"> <meta http\\-equiv="Content\\-Type" content=.{16056} 8pt">YouTube Videos</span></td> </tr> </table> </td> </tr> </table></div> </div> </body> </html> \n\n$', flags=16), 'text/html'), 9: 
WarcDoc('clueweb09-en0000-00-00009', 'http://00perdomain.com/computers/', '2009-03-65T08:43:20-0800', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Sat, 17 Jan 2009 23:40:59 GMT\nPragma: n.{145} sid=i35idajmde65hlem4m2jpmrc37; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 23500$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{23302}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-en0000-00-01000', 'http://2modern.com/designer/FLOS/Flos-Archimoon-Soft-Table-Lamp', '2009-03-65T08:44:07-0800', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nKeep\\-Alive: timeout=15, max=965\nContent\\-Encod.{359}4Pa38Ta38Nb350; path=/\nLast\\-Modified: Tue, 13 Jan 2009 21:10:47 GMT\nExpires: 0\nContent\\-Length: 52741$', flags=16), re.compile(b'^\n<html>\n<head>\n<meta http\\-equiv="Content\\-Type" content="text/html; charset=UTF\\-8">\n<title>FLOS \\- Arc.{52543}\\- \\[ 418126 \\] \\[ \\] \\[ /s\\.nl \\] \\[ Tue Jan 13 13:10:47 PST 2009 \\] \\-\\->\n<!\\-\\- Not logging slowest SQL \\-\\->\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/fr', items={ 0: WarcDoc('clueweb09-fr0000-00-00000', 'http://0-charmedgallery.skyrock.com/', '2009-03-85T01:34:33-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nContent\\-Encoding: gzip\nP3P: CP="NOI DSP COR CURa DEVa PSAa O.{328}xpires: Sun, 08 Feb 2009 15:13:41 GMT\nETag: "92c422f2ea325185cfadb9a9088b2b1f"\nContent\\-Length: 53898$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Strict//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtml1\\-str.{53700}ascript" src="http://pagead2\\.googlesyndication\\.com/pagead/show_ads\\.js"></script>\n\n\n</body>\n</html>\n\n$', flags=16), 
'text/html'), 9: WarcDoc('clueweb09-fr0000-00-00009', 'http://000221.skyrock.com/', '2009-03-85T01:34:33-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nContent\\-Encoding: gzip\nP3P: CP="NOI DSP COR CURa DEVa PSAa O.{329}xpires: Fri, 13 Feb 2009 21:03:18 GMT\nETag: "bc2011cb3798d5b21d8df1709d7270f1"\nContent\\-Length: 31594$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Strict//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtml1\\-str.{31396}ascript" src="http://pagead2\\.googlesyndication\\.com/pagead/show_ads\\.js"></script>\n\n\n</body>\n</html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-fr0000-00-01000', 'http://123maigrir.com.ivchost3.com/pratique/institut/institut2.htm', '2009-03-85T01:34:39-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nAccept\\-Ranges: bytes\nDate: Thu, 29 Jan 2009 16:25:21 GMT\nSer.{95}ose\nLast\\-Modified: Fri, 15 Jun 2007 23:09:10 GMT\nETag: "18441c9\\-adbe\\-46731c16"\nContent\\-Length: 44478$', flags=16), re.compile(b'^<html><!\\-\\- \\#BeginTemplate "/Templates/page_menu_complet\\.dwt" \\-\\-><!\\-\\- DW6 \\-\\->\\\r\n<head>\\\r\n<script langua.{44280}"279" align="center">\\ </td>\\\r\n </tr>\\\r\n</table>\\\r\n\\\r\n</body>\\\r\n\\\r\n<!\\-\\- \\#EndTemplate \\-\\-></html>\\\r\n\\\r\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/de', items={ 0: WarcDoc('clueweb09-de0000-00-00000', 'http://00perdomain.com/kids_and_teens/international/deutsch/', '2009-03-84T23:38:37-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Sun, 08 Feb 2009 06:17:05 GMT\nPragma: n.{144}: sid=g3njt512js43a04j8vekkt0627; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 9942$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{9744}<script type="text/javascript">\n_uacct = 
"UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-de0000-00-00009', 'http://00perdomain.com/world/deutsch/computer/', '2009-03-84T23:38:37-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Tue, 03 Feb 2009 03:48:40 GMT\nCache\\-Con.{145} sid=fit0t220fh9vpsefik8l628866; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 21957$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{21759}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-de0000-00-01000', 'http://1689494.rc-welt.com/', '2009-03-84T23:38:43-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=iso\\-8859\\-1\nContent\\-Encoding: gzip\nDate: Mon, 26 Jan.{332}Modified: Mon, 26 Jan 2009 01:42:05 GMT\nExpires: Mon, 26 Jan 2009 01:42:05 GMT\nContent\\-Length: 38206$', flags=16), re.compile(b'^ <!DOCTYPE HTML PUBLIC "\\-//W3C//DTD HTML 4\\.01 Transitional//EN"> <html> <head> <meta http\\-equiv="Co.{38008}"0" height="0" scrolling="no" name="dynamicFrame" src="RCheartbeat\\.php" ></iframe> </body> </html>\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/it', items={ 0: WarcDoc('clueweb09-it0000-00-00000', 'http://00lucianoligabue00.giovani.it/', '2009-03-84T17:45:02-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nContent\\-Encoding: gzip\nVary: Host,Accept\\-Encoding,User\\-Agent.{248}ec0cfa05ca1; path=/; domain=\\.giovani\\.it\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 12895$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtm.{12697}avascript">\n_uacct = 
"UA\\-746038\\-2";\n_udn="giovani\\.it";\nurchinTracker\\(\\);\n</script>\n</body>\n</html>\n\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-it0000-00-00009', 'http://00perdomain.com/world/furlan/sal%c3%bbt/', '2009-03-84T17:45:02-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Sun, 08 Feb 2009 06:13:46 GMT\nCache\\-Con.{144}: sid=bqreqd4mrjrm3fi30405ldfnq6; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 9794$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{9596}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-it0000-00-01000', 'http://aaronsummers.ifrance.com/new/socci/', '2009-03-84T17:45:07-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=ISO\\-8859\\-1\nAccept\\-Ranges: bytes\nDate: Tue, 10 Feb 2.{70}lose\nLast\\-Modified: Wed, 10 Oct 2007 19:43:32 GMT\nETag: "804131\\-3d59\\-b356a100"\nContent\\-Length: 16483$', flags=16), re.compile(b'^<script language="Javascript" type="text/javascript">\n<!\\-\\-\nvar d=new Date; rnd=d\\.getDay\\(\\)\\+\'\\-\'\\+d\\.getH.{16285}\'<sc\'\\+\'ript src="http://js\\-perso\\.ifrance\\.com/js2\\.php\\?\'\\+rnd\\+\'"><\'\\+\'/sc\'\\+\'ript>\'\\);\n// \\-\\->\n</script>\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/ja', items={ 0: WarcDoc('clueweb09-ja0000-00-00000', 'http://00077.web.fc2.com/', '2009-03-85T02:09:10-0700', re.compile(b'^HTTP/1\\.1 200 OK\nAccept\\-Ranges: bytes\nContent\\-Type: text/html\nDate: Tue, 13 Jan 2009 21:29:20 GMT\nSer.{98}ection: close\nLast\\-Modified: Tue, 18 Nov 2008 14:07:53 GMT\nContent\\-Language: en\nContent\\-Length: 9911$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN"\n "http://www\\.w3\\.org/TR/xhtml1/DTD/xht.{9713}_cs 
\\+ \'\\&dm=\' \\+ fhp_dm \\+ \'"><\' \\+ \'/script>\';\ndocument\\.write\\(fhp_wt\\);\n//\\-\\-></script></body>\n</html>\n\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-ja0000-00-00009', 'http://0095.jp/', '2009-03-85T02:09:10-0700', b'HTTP/1.1 200 OK\nContent-Type: text/html\nDate: Thu, 08 Jan 2009 04:28:30 GMT\nServer: Apache/1.3.39 (Unix)\nX-Powered-By: PHP/5.2.8\nConnection: close\nContent-Length: 20822', re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtm.{20624}r>\n\xa1\xa1\xa1\xa1\xa1\xa1\xa1\xa1\\©2009\\ <a href="http://0095\\.jp/">\xa5\xad\xa5\xa6\xa5\xa4\xb6\xe6\xb3\xda\xc9\xf4</a></div>\n</div>\n</body>\n</html>\n\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-ja0000-00-01000', 'http://1bunting.info/sanc3/0066/020/', '2009-03-85T02:09:15-0700', re.compile(b'^HTTP/1\\.1 200 OK\nAccept\\-Ranges: bytes\nContent\\-Type: text/html\nDate: Wed, 07 Jan 2009 12:39:57 GMT\nSer.{145}ast\\-Modified: Sun, 04 Jan 2009 02:38:48 GMT\nETag: "64d86f0\\-2d98\\-45f9f12e0be00"\nContent\\-Length: 11672$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtm.{11474}en\\.height\\+"\\&"\\);\ndocument\\.write\\("color="\\+screen\\.colorDepth\\+"\'>"\\);\n// \\-\\->\n</SCRIPT>\n</body>\n</html>\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/ko', items={ 0: WarcDoc('clueweb09-ko0000-00-00000', 'http://00perdomain.com/kids_and_teens/international/korean/', '2009-03-84T16:00:51-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Wed, 11 Feb 2009 16:27:15 GMT\nPragma: n.{145} sid=6eoehtb6uqtbtc5i6j1ohsht55; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 10481$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{10283}<script 
type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-ko0000-00-00009', 'http://00perdomain.com/world/korean/%ea%b1%b4%ea%b0%95,%ec%9d%98%ed%95%99/%ec%a7%88%eb%b3%91,%ec%a7%88%ed%99%98/', '2009-03-84T16:00:51-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Fri, 27 Feb 2009 20:07:39 GMT\nCache\\-Con.{145} sid=6s444g9ae4nv9uk4mfbb9e2dv3; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 18781$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{18583}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-ko0000-00-01000', 'http://208.70.77.133/9ea771f89c09d26fdd4f9309023b.html', '2009-03-84T16:00:55-0700', b'HTTP/1.1 200 OK\nContent-Type: text/html; charset=EUC-KR\nDate: Sat, 17 Jan 2009 08:48:54 GMT\nServer: Apache/2.0.52 (CentOS)\nX-Powered-By: PHP/5.1.6\nConnection: close\nContent-Length: 14595', re.compile(b'^<!DOCTYPE HTML PUBLIC "\\-//W3C//DTD HTML 4\\.0 Transitional//EN">\n<html>\n<head>\n<title>\xc1\xea\xbd\xc3\xc6\xae\xb7\xb9\xc0\xcc\xb4\xd7\xba\xb9 \xc1.{14397}:<a href="mailto:webmaster@208\\.70\\.77\\.133">\xc0\xa5\xb8\xb6\xbd\xba\xc5\xcd</a>\n</center>\n<body>\n</html>\n<!\\-\\- 0\\.110781 \\-\\->\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/pt', items={ 0: WarcDoc('clueweb09-pt0000-00-00000', 'http://005bom.ezdir.net/', '2009-03-84T20:11:48-0700', re.compile(b'^HTTP/1\\.1 200 OK\nServer: Apache/1\\.3\\.37 \\(Unix\\) mod_auth_passthrough/1\\.8 mod_log_bytes/1\\.2 mod_bwlimite.{91}Content\\-Type: text/html; charset=iso\\-8859\\-1\nDate: Wed, 18 Feb 2009 23:32:21 GMT\nContent\\-Length: 1970$', flags=16), re.compile(b'^<HTML>\n<HEAD>\n<META NAME="Generator" 
CONTENT="ezDIR \\- www\\.ezdir\\.net">\n<META NAME="Description" CONTE.{1772}/ezdir\\.inweb\\.adm\\.br/ezdir/anuncie\\.php">Clique aqui</A></FONT>\n</TD></TR>\n</TABLE>\n</BODY>\n</HTML>\n\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-pt0000-00-00009', 'http://00perdomain.com/world/portugu%c3%aas/artes/artesanato/', '2009-03-84T20:11:48-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Thu, 12 Feb 2009 05:26:23 GMT\nCache\\-Con.{145} sid=8j75130i6a5iibrmaq85ji1753; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 18361$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{18163}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-pt0000-00-01000', 'http://1001gatos.org/john-wood-saiu-da-microsoft-para-mudar-o-mundo/', '2009-03-84T20:11:55-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=UTF\\-8\nDate: Sun, 08 Feb 2009 00:43:44 GMT\nServer: A.{223}, 08\\-Feb\\-2010 00:43:45 GMT; path=/\nX\\-Pingback: http://1001gatos\\.org/xmlrpc\\.php\nContent\\-Length: 40352$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Strict//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtml1\\-str.{40154}DynaboxConfig\\?div_nome=dynabox\\&site_origem=8662542\\&cor=cc0000"></script>\\\r\n</body>\\\r\n</html>\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/es', items={ 0: WarcDoc('clueweb09-es0000-00-00000', 'http://00001101.blogspot.com/', '2009-03-81T23:59:24-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=UTF\\-8\nDate: Sat, 24 Jan 2009 18:07:26 GMT\nCache\\-Con.{101}ed: Sat, 17 Jan 2009 18:35:52 GMT\nETag: "a341615f\\-6594\\-4524\\-ae74\\-53101f87cdba"\nContent\\-Length: 34361$', flags=16), re.compile(b'^<!DOCTYPE html 
PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Strict//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtml1\\-str.{34163}\\.com/fb/images/pub/feed\\-icon16x16\\.png" alt="" style="border:0"/></a></p>\\\r\n</div>\\\r\n</body>\\\r\n</html>\n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-es0000-00-00009', 'http://00perdomain.com/world/espa%c3%b1ol/', '2009-03-81T23:59:24-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Thu, 29 Jan 2009 19:26:46 GMT\nCache\\-Con.{144}: sid=javrlcp0o652co0p1gvpttlqe6; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 9073$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{8875}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-es0000-00-01000', 'http://abrahamzabludovsky.radiotrece.com.mx/2008/11/27/deportes-del-27-de-noviembre-de-2008/', '2009-03-81T23:59:31-0700', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=UTF\\-8\nDate: Mon, 09 Feb 2009 08:53:49 GMT\nServer: A.{51}tion: close\nX\\-Pingback: http://abrahamzabludovsky\\.radiotrece\\.com\\.mx/xmlrpc\\.php\nContent\\-Length: 28172$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Strict//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtml1\\-str.{27974}UA\\-1620189\\-24"\\);\npageTracker\\._initData\\(\\);\npageTracker\\._trackPageview\\(\\);\n</script>\n</body>\n</html>\n\n\n$', flags=16), 'text/html'), }) self._test_docs('clueweb09/catb', items={ 0: WarcDoc('clueweb09-en0000-00-00000', 'http://00000-nrt-realestate.homepagestartup.com/', '2009-03-65T08:43:19-0800', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html\nDate: Tue, 13 Jan 2009 18:05:10 GMT\nPragma: no\\-cache\nCache\\-C.{100}Modified: Tue, 13 Jan 2009 18:05:10 GMT\nExpires: Mon, 20 Dec 1998 01:00:00 
GMT\nContent\\-Length: 16254$', flags=16), re.compile(b'^<head> <meta http\\-equiv="Content\\-Language" content="en\\-gb"> <meta http\\-equiv="Content\\-Type" content=.{16056} 8pt">YouTube Videos</span></td> </tr> </table> </td> </tr> </table></div> </div> </body> </html> \n\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb09-en0000-00-00009', 'http://00perdomain.com/computers/', '2009-03-65T08:43:20-0800', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nDate: Sat, 17 Jan 2009 23:40:59 GMT\nPragma: n.{145} sid=i35idajmde65hlem4m2jpmrc37; path=/\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\nContent\\-Length: 23500$', flags=16), re.compile(b'^\n\n\n<!\\-\\- Site by Zaz Corporation, 1 \\- 8 8 8 \\- 2 \\- Z A Z C O R http://www\\.zazcorp\\.us \\-\\->\n\n\n\n<html><he.{23302}<script type="text/javascript">\n_uacct = "UA\\-488717\\-2";\nurchinTracker\\(\\);\n</script>\n\n</body></html>\n\n$', flags=16), 'text/html'), 1000: WarcDoc('clueweb09-en0000-00-01000', 'http://2modern.com/designer/FLOS/Flos-Archimoon-Soft-Table-Lamp', '2009-03-65T08:44:07-0800', re.compile(b'^HTTP/1\\.1 200 OK\nContent\\-Type: text/html; charset=utf\\-8\nKeep\\-Alive: timeout=15, max=965\nContent\\-Encod.{359}4Pa38Ta38Nb350; path=/\nLast\\-Modified: Tue, 13 Jan 2009 21:10:47 GMT\nExpires: 0\nContent\\-Length: 52741$', flags=16), re.compile(b'^\n<html>\n<head>\n<meta http\\-equiv="Content\\-Type" content="text/html; charset=UTF\\-8">\n<title>FLOS \\- Arc.{52543}\\- \\[ 418126 \\] \\[ \\] \\[ /s\\.nl \\] \\[ Tue Jan 13 13:10:47 PST 2009 \\] \\-\\->\n<!\\-\\- Not logging slowest SQL \\-\\->\n\n\n$', flags=16), 'text/html'), }) self._assert_namedtuple(ir_datasets.load('clueweb09').docs.lookup('clueweb09-en0007-01-40637'), WarcDoc('clueweb09-en0007-01-40637', 'http://www.job-hunt.org/marketing.shtml', '2009-03-65T12:44:49-0800', re.compile(b'HTTP/1.1 200 OK.*44437', flags=16), body=re.compile(b'<!DOCTYPE HTML .*</script> \n\n</BODY>\n\n</HTML>\n\n\n', flags=16), 
body_content_type='text/html')) def test_clueweb09_docstore(self): docstore = ir_datasets.load('clueweb09').docs_store() docstore.clear_cache() with _logger.duration('cold fetch'): result = docstore.get_many(['clueweb09-en0000-00-00003', 'clueweb09-en0000-00-35154', 'clueweb09-ar0000-48-02342']) self.assertEqual(len(result), 3) with _logger.duration('warm fetch'): result = docstore.get_many(['clueweb09-en0000-00-00003', 'clueweb09-en0000-00-35154', 'clueweb09-ar0000-48-02342']) self.assertEqual(len(result), 3) docstore = ir_datasets.load('clueweb09').docs_store() with _logger.duration('warm fetch (new docstore)'): result = docstore.get_many(['clueweb09-en0000-00-00003', 'clueweb09-en0000-00-35154', 'clueweb09-ar0000-48-02342']) self.assertEqual(len(result), 3) with _logger.duration('cold fetch (nearby)'): result = docstore.get_many(['clueweb09-en0000-00-00023', 'clueweb09-en0000-00-35167', 'clueweb09-ar0000-48-02348']) self.assertEqual(len(result), 3) with _logger.duration('cold fetch (earlier)'): result = docstore.get_many(['clueweb09-en0000-00-00001', 'clueweb09-ar0000-48-00009']) self.assertEqual(len(result), 2) def test_clueweb09_queries(self): self._test_queries('clueweb09/en/trec-web-2009', count=50, items={ 0: TrecWebTrackQuery('1', 'obama family tree', "Find information on President Barack Obama's family\n history, including genealogy, national origins, places and dates of\n birth, etc.\n ", 'faceted', (TrecSubtopic(number='1', text='\n Find the TIME magazine photo essay "Barack Obama\'s Family Tree".\n ', type='nav'), TrecSubtopic(number='2', text="\n Where did Barack Obama's parents and grandparents come from?\n ", type='inf'), TrecSubtopic(number='3', text="\n Find biographical information on Barack Obama's mother.\n ", type='inf'))), 9: TrecWebTrackQuery('10', 'cheap internet', "I'm looking for cheap (i.e. 
low-cost) internet service.\n ", 'faceted', (TrecSubtopic(number='1', text='\n What are some low-cost broadband internet providers?\n ', type='inf'), TrecSubtopic(number='2', text='\n Do any internet providers still sell dial-up?\n ', type='inf'), TrecSubtopic(number='3', text='\n Who can provide inexpensive digital cable television bundled with\n internet service?\n ', type='inf'), TrecSubtopic(number='4', text="\n I'm looking for the Vonage homepage.\n ", type='nav'), TrecSubtopic(number='5', text='\n Find me some providers of free wireless internet access.\n ', type='inf'), TrecSubtopic(number='6', text='\n I want to find cheap DSL providers.\n ', type='inf'), TrecSubtopic(number='7', text='\n Is there a way to get internet access without phone service?\n ', type='inf'), TrecSubtopic(number='8', text="\n Take me to Comcast's homepage.\n ", type='nav'))), 49: TrecWebTrackQuery('50', 'dog heat', 'What is the effect of excessive heat on dogs?\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n What is the effect of excessive heat on dogs?\n ', type='inf'), TrecSubtopic(number='2', text='\n What are symptoms of heat stroke and other heat-related illnesses\n in dogs?\n ', type='inf'), TrecSubtopic(number='3', text='\n Find information on dogs\' reproductive cycle. 
What does it mean\n when a dog is "in heat"?\n ', type='inf'))), }) self._test_queries('clueweb09/en/trec-web-2010', count=50, items={ 0: TrecWebTrackQuery('51', 'horse hooves', '\n Find information about horse hooves, their care, and diseases of hooves.\n ', 'faceted', (TrecSubtopic(number='1', text="\n Find information about horses' hooves and how to care for them.\n ", type='inf'), TrecSubtopic(number='2', text='\n Find pictures of horse hooves.\n ', type='nav'), TrecSubtopic(number='3', text='\n What are some injuries or diseases of hooves in horses, and how\n are they treated?\n ', type='inf'), TrecSubtopic(number='4', text="\n Describe the anatomy of horses' feet and hooves.\n ", type='inf'), TrecSubtopic(number='5', text='\n Find information on shoeing horses and horseshoe problems.\n ', type='inf'))), 9: TrecWebTrackQuery('60', 'bellevue', '\n Find information about Bellevue, Washington.\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n Find information about Bellevue, Washington.\n ', type='inf'), TrecSubtopic(number='2', text='\n Find information about Bellevue, Nebraska.\n ', type='inf'), TrecSubtopic(number='3', text='\n Find information about Bellevue Hospital Center in New York, NY.\n ', type='inf'), TrecSubtopic(number='4', text='\n Find the homepage of Bellevue University.\n ', type='nav'), TrecSubtopic(number='5', text='\n Find the homepage of Bellevue College, Washington.\n ', type='nav'), TrecSubtopic(number='6', text='\n Find the homepage of Bellevue Hospital Center in New York, NY.\n ', type='nav'))), 49: TrecWebTrackQuery('100', 'rincon puerto rico', '\n Find information about Rincon, Puerto Rico.\n ', 'faceted', (TrecSubtopic(number='1', text='\n Find hotels and beach resorts in Rincon, Puerto Rico.\n ', type='inf'), TrecSubtopic(number='2', text='\n Find information on the history of Rincon, Puerto Rico.\n ', type='inf'), TrecSubtopic(number='3', text='\n Find surf forecasts for Rincon, Puerto Rico.\n ', type='inf'), 
TrecSubtopic(number='4', text='\n Find pictures of Rincon, Puerto Rico.\n ', type='nav'), TrecSubtopic(number='5', text='\n Find information about real estate and rental properties in\n Rincon, Puerto Rico.\n ', type='inf'))), }) self._test_queries('clueweb09/en/trec-web-2011', count=50, items={ 0: TrecWebTrackQuery('101', 'ritz carlton lake las vegas', '\n Find information about the Ritz Carlton resort at Lake Las Vegas.\n ', 'faceted', (TrecSubtopic(number='1', text='\n Find information about the Ritz Carlton resort at Lake Las Vegas.\n ', type='inf'), TrecSubtopic(number='2', text='\n Find a site where I can determine room price and availability.\n ', type='nav'), TrecSubtopic(number='3', text='\n Find directions to the Ritz Carlton Lake Las Vegas.\n ', type='nav'), TrecSubtopic(number='4', text='\n Find reviews of the Ritz Carlton Lake Las Vegas.\n ', type='inf'))), 9: TrecWebTrackQuery('110', 'map of brazil', '\n What are the boundaries of the political jurisdictions in Brazil?\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n What are the boundaries of the political jurisdictions in Brazil?\n ', type='inf'), TrecSubtopic(number='2', text='\n I am looking for information about taking a vacation trip to Brazil.\n ', type='inf'), TrecSubtopic(number='3', text='\n I want to buy a road map of Brazil.\n ', type='nav'))), 49: TrecWebTrackQuery('150', 'tn highway patrol', '\n What are the requirements to become a Tennessee Highway Patrol State Trooper?\n ', 'faceted', (TrecSubtopic(number='1', text='\n What are the requirements to become a Tennessee Highway Patrol State Trooper?\n ', type='inf'), TrecSubtopic(number='2', text='\n information about the responsibilities of the Tennessee Highway Patrol\n ', type='inf'), TrecSubtopic(number='3', text='\n home page of the Tennessee Highway Patrol\n ', type='nav'), TrecSubtopic(number='4', text='\n I want to fill in the customer satisfaction survey about my interaction with a Tennessee Highway Patrol State Trooper.\n ', 
type='nav'))), }) self._test_queries('clueweb09/en/trec-web-2012', count=50, items={ 0: TrecWebTrackQuery('151', '403b', '\n What is a 403b plan?\n ', 'faceted', (TrecSubtopic(number='1', text='\n What is a 403b plan?\n ', type='inf'), TrecSubtopic(number='2', text='\n Who is eligible for a 403b plan?\n ', type='inf'), TrecSubtopic(number='3', text='\n What are the rules for a 403b retirement plan?\n ', type='nav'), TrecSubtopic(number='4', text='\n What is the difference between 401k and 403b retirement plans?\n ', type='inf'), TrecSubtopic(number='5', text='\n What are the withdrawal limitations for a 403b retirement plan?\n ', type='nav'))), 9: TrecWebTrackQuery('160', 'grilling', '\n Find kabob recipes.\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n Find kabob recipes.\n ', type='nav'), TrecSubtopic(number='2', text='\n Find tips on grilling vegetables.\n ', type='inf'), TrecSubtopic(number='3', text='\n Find tips on grilling fish.\n ', type='inf'), TrecSubtopic(number='4', text='\n Find instructions for grilling chicken.\n ', type='inf'), TrecSubtopic(number='5', text='\n Find the Grilling Magazine website.\n ', type='nav'), TrecSubtopic(number='6', text='\n Find information on gas barbecue grills and cooking on a gas grill.\n ', type='inf'))), 49: TrecWebTrackQuery('200', 'ontario california airport', '\n Find flight information for the Ontario, CA airport.\n ', 'faceted', (TrecSubtopic(number='1', text='\n Find flight information for the Ontario, CA airport.\n ', type='inf'), TrecSubtopic(number='2', text='\n What hotels are near the Ontario, CA airport?\n ', type='inf'), TrecSubtopic(number='3', text='\n What services/facilities does the Ontario, CA airport offer?\n ', type='inf'), TrecSubtopic(number='4', text='\n What is the address of the Ontario, CA airport?\n ', type='nav'))), }) self._test_queries('clueweb09/catb/trec-web-2009', count=50, items={ 0: TrecWebTrackQuery('1', 'obama family tree', "Find information on President Barack Obama's 
family\n history, including genealogy, national origins, places and dates of\n birth, etc.\n ", 'faceted', (TrecSubtopic(number='1', text='\n Find the TIME magazine photo essay "Barack Obama\'s Family Tree".\n ', type='nav'), TrecSubtopic(number='2', text="\n Where did Barack Obama's parents and grandparents come from?\n ", type='inf'), TrecSubtopic(number='3', text="\n Find biographical information on Barack Obama's mother.\n ", type='inf'))), 9: TrecWebTrackQuery('10', 'cheap internet', "I'm looking for cheap (i.e. low-cost) internet service.\n ", 'faceted', (TrecSubtopic(number='1', text='\n What are some low-cost broadband internet providers?\n ', type='inf'), TrecSubtopic(number='2', text='\n Do any internet providers still sell dial-up?\n ', type='inf'), TrecSubtopic(number='3', text='\n Who can provide inexpensive digital cable television bundled with\n internet service?\n ', type='inf'), TrecSubtopic(number='4', text="\n I'm looking for the Vonage homepage.\n ", type='nav'), TrecSubtopic(number='5', text='\n Find me some providers of free wireless internet access.\n ', type='inf'), TrecSubtopic(number='6', text='\n I want to find cheap DSL providers.\n ', type='inf'), TrecSubtopic(number='7', text='\n Is there a way to get internet access without phone service?\n ', type='inf'), TrecSubtopic(number='8', text="\n Take me to Comcast's homepage.\n ", type='nav'))), 49: TrecWebTrackQuery('50', 'dog heat', 'What is the effect of excessive heat on dogs?\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n What is the effect of excessive heat on dogs?\n ', type='inf'), TrecSubtopic(number='2', text='\n What are symptoms of heat stroke and other heat-related illnesses\n in dogs?\n ', type='inf'), TrecSubtopic(number='3', text='\n Find information on dogs\' reproductive cycle. 
What does it mean\n when a dog is "in heat"?\n ', type='inf'))), }) self._test_queries('clueweb09/catb/trec-web-2010', count=50, items={ 0: TrecWebTrackQuery('51', 'horse hooves', '\n Find information about horse hooves, their care, and diseases of hooves.\n ', 'faceted', (TrecSubtopic(number='1', text="\n Find information about horses' hooves and how to care for them.\n ", type='inf'), TrecSubtopic(number='2', text='\n Find pictures of horse hooves.\n ', type='nav'), TrecSubtopic(number='3', text='\n What are some injuries or diseases of hooves in horses, and how\n are they treated?\n ', type='inf'), TrecSubtopic(number='4', text="\n Describe the anatomy of horses' feet and hooves.\n ", type='inf'), TrecSubtopic(number='5', text='\n Find information on shoeing horses and horseshoe problems.\n ', type='inf'))), 9: TrecWebTrackQuery('60', 'bellevue', '\n Find information about Bellevue, Washington.\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n Find information about Bellevue, Washington.\n ', type='inf'), TrecSubtopic(number='2', text='\n Find information about Bellevue, Nebraska.\n ', type='inf'), TrecSubtopic(number='3', text='\n Find information about Bellevue Hospital Center in New York, NY.\n ', type='inf'), TrecSubtopic(number='4', text='\n Find the homepage of Bellevue University.\n ', type='nav'), TrecSubtopic(number='5', text='\n Find the homepage of Bellevue College, Washington.\n ', type='nav'), TrecSubtopic(number='6', text='\n Find the homepage of Bellevue Hospital Center in New York, NY.\n ', type='nav'))), 49: TrecWebTrackQuery('100', 'rincon puerto rico', '\n Find information about Rincon, Puerto Rico.\n ', 'faceted', (TrecSubtopic(number='1', text='\n Find hotels and beach resorts in Rincon, Puerto Rico.\n ', type='inf'), TrecSubtopic(number='2', text='\n Find information on the history of Rincon, Puerto Rico.\n ', type='inf'), TrecSubtopic(number='3', text='\n Find surf forecasts for Rincon, Puerto Rico.\n ', type='inf'), 
TrecSubtopic(number='4', text='\n Find pictures of Rincon, Puerto Rico.\n ', type='nav'), TrecSubtopic(number='5', text='\n Find information about real estate and rental properties in\n Rincon, Puerto Rico.\n ', type='inf'))), }) self._test_queries('clueweb09/catb/trec-web-2011', count=50, items={ 0: TrecWebTrackQuery('101', 'ritz carlton lake las vegas', '\n Find information about the Ritz Carlton resort at Lake Las Vegas.\n ', 'faceted', (TrecSubtopic(number='1', text='\n Find information about the Ritz Carlton resort at Lake Las Vegas.\n ', type='inf'), TrecSubtopic(number='2', text='\n Find a site where I can determine room price and availability.\n ', type='nav'), TrecSubtopic(number='3', text='\n Find directions to the Ritz Carlton Lake Las Vegas.\n ', type='nav'), TrecSubtopic(number='4', text='\n Find reviews of the Ritz Carlton Lake Las Vegas.\n ', type='inf'))), 9: TrecWebTrackQuery('110', 'map of brazil', '\n What are the boundaries of the political jurisdictions in Brazil?\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n What are the boundaries of the political jurisdictions in Brazil?\n ', type='inf'), TrecSubtopic(number='2', text='\n I am looking for information about taking a vacation trip to Brazil.\n ', type='inf'), TrecSubtopic(number='3', text='\n I want to buy a road map of Brazil.\n ', type='nav'))), 49: TrecWebTrackQuery('150', 'tn highway patrol', '\n What are the requirements to become a Tennessee Highway Patrol State Trooper?\n ', 'faceted', (TrecSubtopic(number='1', text='\n What are the requirements to become a Tennessee Highway Patrol State Trooper?\n ', type='inf'), TrecSubtopic(number='2', text='\n information about the responsibilities of the Tennessee Highway Patrol\n ', type='inf'), TrecSubtopic(number='3', text='\n home page of the Tennessee Highway Patrol\n ', type='nav'), TrecSubtopic(number='4', text='\n I want to fill in the customer satisfaction survey about my interaction with a Tennessee Highway Patrol State Trooper.\n 
', type='nav'))), }) self._test_queries('clueweb09/catb/trec-web-2012', count=50, items={ 0: TrecWebTrackQuery('151', '403b', '\n What is a 403b plan?\n ', 'faceted', (TrecSubtopic(number='1', text='\n What is a 403b plan?\n ', type='inf'), TrecSubtopic(number='2', text='\n Who is eligible for a 403b plan?\n ', type='inf'), TrecSubtopic(number='3', text='\n What are the rules for a 403b retirement plan?\n ', type='nav'), TrecSubtopic(number='4', text='\n What is the difference between 401k and 403b retirement plans?\n ', type='inf'), TrecSubtopic(number='5', text='\n What are the withdrawal limitations for a 403b retirement plan?\n ', type='nav'))), 9: TrecWebTrackQuery('160', 'grilling', '\n Find kabob recipes.\n ', 'ambiguous', (TrecSubtopic(number='1', text='\n Find kabob recipes.\n ', type='nav'), TrecSubtopic(number='2', text='\n Find tips on grilling vegetables.\n ', type='inf'), TrecSubtopic(number='3', text='\n Find tips on grilling fish.\n ', type='inf'), TrecSubtopic(number='4', text='\n Find instructions for grilling chicken.\n ', type='inf'), TrecSubtopic(number='5', text='\n Find the Grilling Magazine website.\n ', type='nav'), TrecSubtopic(number='6', text='\n Find information on gas barbecue grills and cooking on a gas grill.\n ', type='inf'))), 49: TrecWebTrackQuery('200', 'ontario california airport', '\n Find flight information for the Ontario, CA airport.\n ', 'faceted', (TrecSubtopic(number='1', text='\n Find flight information for the Ontario, CA airport.\n ', type='inf'), TrecSubtopic(number='2', text='\n What hotels are near the Ontario, CA airport?\n ', type='inf'), TrecSubtopic(number='3', text='\n What services/facilities does the Ontario, CA airport offer?\n ', type='inf'), TrecSubtopic(number='4', text='\n What is the address of the Ontario, CA airport?\n ', type='nav'))), }) self._test_queries('clueweb09/trec-mq-2009', count=40000, items={ 0: GenericQuery('20001', '1:obama family tree'), 9: GenericQuery('20010', '1:cheap internet'), 
39999: GenericQuery('60000', '4:bird shingles'), }) def test_clueweb09_qrels(self): self._test_qrels('clueweb09/en/trec-web-2009', count=23601, items={ 0: TrecPrel('1', 'clueweb09-en0003-55-31884', 0, 0, 1.0), 9: TrecPrel('1', 'clueweb09-en0009-84-37392', 0, 1, 0.0136322534877696), 23600: TrecPrel('50', 'clueweb09-en0007-05-20194', 0, 1, 1.0), }) self._test_qrels('clueweb09/en/trec-web-2010', count=25329, items={ 0: TrecQrel('51', 'clueweb09-en0000-16-19379', 0, '0'), 9: TrecQrel('51', 'clueweb09-en0001-55-24197', 0, '0'), 25328: TrecQrel('99', 'clueweb09-enwp03-23-18429', 0, '0'), }) self._test_qrels('clueweb09/en/trec-web-2011', count=19381, items={ 0: TrecQrel('101', 'clueweb09-en0007-71-07471', 0, '0'), 9: TrecQrel('101', 'clueweb09-en0044-05-29808', 2, '0'), 19380: TrecQrel('150', 'clueweb09-en0003-86-25593', -2, '0'), }) self._test_qrels('clueweb09/en/trec-web-2012', count=16055, items={ 0: TrecQrel('151', 'clueweb09-en0000-00-03430', -2, '0'), 9: TrecQrel('151', 'clueweb09-en0000-00-04023', -2, '0'), 16054: TrecQrel('200', 'clueweb09-enwp03-49-00268', 0, '0'), }) self._test_qrels('clueweb09/catb/trec-web-2009', count=13118, items={ 0: TrecPrel('1', 'clueweb09-en0003-55-31884', 0, 0, 1.0), 9: TrecPrel('1', 'clueweb09-enwp01-17-09993', 2, 1, 1.0), 13117: TrecPrel('50', 'clueweb09-en0007-05-20194', 0, 1, 1.0), }) self._test_qrels('clueweb09/catb/trec-web-2010', count=15845, items={ 0: TrecQrel('51', 'clueweb09-en0000-16-19379', 0, '0'), 9: TrecQrel('51', 'clueweb09-en0001-55-24197', 0, '0'), 15844: TrecQrel('99', 'clueweb09-enwp03-23-18429', 0, '0'), }) self._test_qrels('clueweb09/catb/trec-web-2011', count=13081, items={ 0: TrecQrel('101', 'clueweb09-en0007-71-07471', 0, '0'), 9: TrecQrel('101', 'clueweb09-en0001-12-16652', 0, '0'), 13080: TrecQrel('150', 'clueweb09-en0003-86-25593', -2, '0'), }) self._test_qrels('clueweb09/catb/trec-web-2012', count=10022, items={ 0: TrecQrel('151', 'clueweb09-en0000-00-03430', -2, '0'), 9: TrecQrel('151', 
'clueweb09-en0000-00-04023', -2, '0'), 10021: TrecQrel('200', 'clueweb09-enwp03-49-00268', 0, '0'), }) self._test_qrels('clueweb09/trec-mq-2009', count=34534, items={ 0: TrecPrel('20001', 'clueweb09-en0003-55-31884', 0, 0, 1.0), 9: TrecPrel('20001', 'clueweb09-enwp01-17-09993', 2, 1, 1.0), 34533: TrecPrel('57118', 'clueweb09-en0010-39-07801', 0, 1, 0.0612006151868131), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/clueweb12.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.clueweb12 import TrecWebTrackQuery, NtcirQuery, MisinfoQrel, MisinfoQuery, EhealthQrel from ir_datasets.formats import TrecQrel, TrecSubtopic, GenericDoc, GenericQuery, WarcDoc, TrecSubQrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestClueWeb12(DatasetIntegrationTest): def test_clueweb12_docs(self): self._test_docs('clueweb12', items={ 0: WarcDoc('clueweb12-0000tw-00-00000', 'http://tsawer.net/2012/02/10/france-image-pool-2012-02-10-162252/', '2012-02-10T22:50:41Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 22:50:40 GMT\\\r\nServer: Apache/2\\.2\\.21 \\(Unix\\) mod_ssl/2\\.2\\.21 Op.{338}ortlink\\\r\nVary: Accept\\-Encoding,User\\-Agent\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Strict//EN"\\\r\n "http://www\\.w3\\.org/TR/xhtml1/DTD/x.{22239}p://tsawer\\.net/wpaggregator/wp\\-content/plugins/contact\\-form\\-7/scripts\\.js\\?ver=3\\.1\'></script>\n</html>\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb12-0000tw-00-00009', 'http://claywginn.com/2012/02/10/lessons-learned-from-a-week-on-vacation/', '2012-02-10T21:47:35Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:47:36 GMT\\\r\nServer: Apache\\\r\nX\\-Powered\\-By: PHP/5\\.2\\.17\\\r\nX\\-Pi.{45}: 
<http://wp\\.me/p1zQki\\-AT>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtm.{25532}f addLoadEvent != \'undefined\' \\) addLoadEvent\\(load_cmc\\);\n\\\telse load_cmc\\(\\);\n\\\t</script></body>\\\r\n</html>$', flags=16), 'text/html'), 1000: WarcDoc('clueweb12-0000tw-00-01002', 'http://beanpotscastiron.waffleironshapes.com/le-creuset-enameled-cast-iron-7-14-quart-round-french-oven-cherry-red-save-price-shopping-online/', '2012-02-10T21:55:43Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:55:42 GMT\\\r\nServer: Apache\\\r\nX\\-Pingback: http://beanpotscas.{70}waffleironshapes\\.com/\\?p=5>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtm.{27884}insertBefore\\(js, fjs\\);\\\r\n\\}\\(document, \'script\', \'facebook\\-jssdk\'\\)\\);</script>\\\r\n</div></body>\\\r\n</html>\\\r\n$', flags=16), 'text/html'), }) self._test_docs('clueweb12/b13', items={ 0: WarcDoc('clueweb12-0000tw-00-00013', 'http://cheapcosthealthinsurance.com/2012/01/25/what-is-hiv-aids/', '2012-02-10T21:51:20Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:51:22 GMT\\\r\nServer: Apache/2\\.2\\.21 \\(Unix\\) mod_ssl/2\\.2\\.21 Op.{213}ealthinsurance\\.com/\\?p=711>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(b'^\n<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xht.{71109}\\.js"></script>\n</body>\n\n</html>\n\n<!\\-\\-\nEnd of footer\\.php\n\\~\\~\\~ \\-\\->\n\\\t\n\n\n<!\\-\\-\nEnd of single\\.php \n\\~\\~\\~ \\-\\->\n$', flags=16), 'text/html'), 9: WarcDoc('clueweb12-0000tw-00-00139', 
'http://data-protection.safenet-inc.com/social-media/', '2012-02-10T21:56:06Z', re.compile(b'^HTTP/1\\.0 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:56:06 GMT\\\r\nServer: Apache/2\\.2\\.14 \\(Ubuntu\\)\\\r\nX\\-Powered\\-By: .{66}inc\\.com/xmlrpc\\.php\\\r\nVary: Accept\\-Encoding\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(b'^<!DOCTYPE html>\n<!\\-\\-\\[if IE 6\\]>\n<html id="ie6" dir="ltr" lang="en\\-US">\n<!\\[endif\\]\\-\\->\n<!\\-\\-\\[if IE 7\\]>\n<h.{13819}cs\\.com/ga\\.js";\n s\\.parentNode\\.insertBefore\\(g,s\\)\\}\\(document,"script"\\)\\);\n </script>\n\\\t</body>\n</html>$', flags=16), 'text/html'), 1000: WarcDoc('clueweb12-0000tw-00-14061', 'http://opinionator.blogs.nytimes.com/2006/01/12/this-wont-hurt-a-bit/', '2012-02-10T22:24:09Z', re.compile(b'^HTTP/1\\.0 200 OK\\\r\nDate: Fri, 10 Feb 2012 22:24:09 GMT\\\r\nServer: Apache\\\r\nVary: Cookie\\\r\nX\\-Pingback: http.{72}logs\\.nytimes\\.com/\\?p=30475>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(b'^<!DOCTYPE html PUBLIC "\\-//W3C//DTD XHTML 1\\.0 Transitional//EN" "http://www\\.w3\\.org/TR/xhtml1/DTD/xhtm.{105755},Spon3,ADX_CLIENTSIDE,SponLink2\\&pos=Bottom8\\&query=qstring\\&keywords=\\?"></a></noscript></body>\n</html>$', flags=16), 'text/html'), }) def test_clueweb12_docs_html(self): self._test_docs(ir_datasets.wrappers.HtmlDocExtractor(ir_datasets.load('clueweb12')), items={ 0: WarcDoc('clueweb12-0000tw-00-00000', 'http://tsawer.net/2012/02/10/france-image-pool-2012-02-10-162252/', '2012-02-10T22:50:41Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 22:50:40 GMT\\\r\nServer: Apache/2\\.2\\.21 \\(Unix\\) mod_ssl/2\\.2\\.21 Op.{338}ortlink\\\r\nVary: Accept\\-Encoding,User\\-Agent\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile('^\\\r\n\\\t\\\t\\\t France image Pool 2012\\-02\\-10 16:22:52\\\t \n \n \n \n \n rss § \n atom § 
\n rdf \n \n \n Photos aggregator.{736}essages\\. \n \n \n \n \n \n \n Based on Ocular Professor § Powered by WordPress \n \n \n \n \n \n \n \n \n \n \n \n \n $', flags=48), 'text/plain'), 9: WarcDoc('clueweb12-0000tw-00-00009', 'http://claywginn.com/2012/02/10/lessons-learned-from-a-week-on-vacation/', '2012-02-10T21:47:35Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:47:36 GMT\\\r\nServer: Apache\\\r\nX\\-Powered\\-By: PHP/5\\.2\\.17\\\r\nX\\-Pi.{45}: <http://wp\\.me/p1zQki\\-AT>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile('^Lessons learned from a week on vacation \\| claywginn\\.com \n \n \n \n \n Home \n About me \n Contact me \n \n.{5287} Words Posts: 21,458 Words \\(511 Avg\\.\\) \n Powered by WordPress \\| Designed by Elegant Themes \n \n $', flags=48), 'text/plain'), 1000: WarcDoc('clueweb12-0000tw-00-01002', 'http://beanpotscastiron.waffleironshapes.com/le-creuset-enameled-cast-iron-7-14-quart-round-french-oven-cherry-red-save-price-shopping-online/', '2012-02-10T21:55:43Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:55:42 GMT\\\r\nServer: Apache\\\r\nX\\-Pingback: http://beanpotscas.{70}waffleironshapes\\.com/\\?p=5>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile('^Le Creuset Enameled Cast\\-Iron 7\\-1/4\\-Quart Round French Oven, Cherry Red Save Price Shopping Online \\|.{4936}sites to earn advertising fees by advertising and linking to amazon\\.com Web Toolbar by Wibiya \n \n \n $', flags=48), 'text/plain'), }) extracted_text = { 'clueweb12-0000tw-00-00000': ' rss § atom § rdf\n\n Photos aggregator\n\n dynamic content\n\n Search:\n * \n * Add album/Contact us\n * News\n * Reviews\n\n shaggyshoo has added a photo to the pool:\n\n annecy. 
france.\n\n France image Pool 2012-02-10 16:22:52\n\n * February 10th, 2012\n\n * Tags cloud\n\n WP Cumulus Flash tag cloud by Roy Tanck and Luke Morton requires Flash Player 9 or better.\n\n * Twits from \'photobabble\'\n\n + No public Twitter messages.\n\n Based on Ocular Professor § Powered by WordPress\n', 'clueweb12-0000tw-00-00009': ' * Home\n * About me\n * Contact me\n Feb 10, 2012\n\n Posted by clay in Christ and culture, Writing | 0 Comments\n\n Lessons learned from a week on vacation\n\n It’s been quite a week. We have one more night here at Walt Disney World, then we are headed home. I’ll give you a bit of a preview for next week though. There are a few things I’ve noticed here that develop into great analogies for life lessons. Anything from a large, burly fellow with a pink Disney Princess backpack, to using the bathroom at the dinner table, to the lack of wisdom in crowds. I think you’ll enjoy these.\n\n However, today I’d like to share with a you a few lessons that have been forcefully taught to me this week. Some painful, some really painful, and some rather enjoyable.\n\n I love that I’ve been able to spend a week in a single hotel room with my wife and kids. However, I’m going crazy spending a week in a single hotel room with my kids. Bedtime is the absolute craziest time of day. I don’t know if it is the tired children, the tired parents, or the insanity of being in one room, but little ones do not go to sleep well this way, at least not ours. Now, this is a very first world problem, and I know that I should be thankful that I’m able to take trips like this with my children. Don’t misunderstand me, because I am very thankful for that. The simple fact remains that it is so far out of the ordinary for us as to be uncomfortable and even more tiring.\n\n The second thing I’ve learned is that you can tell how many days people have been here just by the look on their faces. 
There is an unspoken “I’ve sat through ;It’s A Small World’ so many times that i might just twist off at the next puppet I see” look to some faces. These are the people that are not hardcore Disney fans, but came mostly for their children. (An aside: you know what would make “Small World” so much better? Bucket o’ softballs. You’ll have to create a queue that would stretch all the way across the park for it. So many childhood doll nightmares erased in a hail of tossed balls. It’s a goldmine. I expect my royalty checks to pour in soon.)\n\n Too many people come here with the expectation that they can accomplish everything at WDW in a week. This is so far from the truth. My wife and I have been coming here since 1996, and we’ve still not seen or done it all. The likelihood is pretty slim that it will happen too. Too many things changing or disappearing to be able to conquer that mountain.\n\n The one lasting memory I have from our trips to WDW is the sense of awe that my children feel. Heck, I feel it too. Everything here is so much bigger, so much more perfect than the real world. Of course, if it weren’t that way, we wouldn’t be paying to visit. If someone built a Computer Progeammer World, I’d have to be drugged to be taken in to it. I don’t want my real life, I want a different reality. That might be good sometimes, but it definitely isn’t a way to live.\n\n Finally, it appears to me that while they are here, people become a clearer, more exact picture of themselves due to the money and time spent, and emotional and physical investment paid to be here. Vacations, especially big ones like WDW have a way of refining our personalities in a way that other experiences don’t. I’ve seen adults act like complete children and complete jerks. I’ve seen children be perfect angels and ceaseless terrors (and you should have seen everyone else’s kids, too). I’m sure that I exhibit some of this too. 
We are who we are, and exhaustion makes the pretensions go away.\n\n All in all, it’s been a great time here. I”m anxious to start home and return to my own reality though. Too many exciting things coming up in the next few weeks that I don’t want to miss. I hope you come back again next week for my Observations on the World series.\n\n Related Posts:\n\n * Next year’s model\n * Flummoxed: a week in review\n * What is bi-vocational ministry?\n * What’s in your back yard?\n * Where to go from here\n * Powered by Contextual Related Posts\n\n Leave a Reply\n\n Your email address will not be published. Required fields are marked *\n\n Name *\n\n Email *\n\n Website\n\n Comment\n\n You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>\n\n Notify me of follow-up comments by email.\n\n Notify me of new posts by email.\n\n Subscribe to claywginn.com via email!\n\n Don\'t want to come check out the site every day? Subscribe here and I\'ll send you an email every time a new post goes up.\n\n Join 7 other subscribers\n\n Popular Posts\n\n * “What kind of God wants you to be poor and miserable?” (33)\n * Humble pride (33)\n * Humble pride: a follow up (31)\n * The beginning (20)\n * Why do I write? (19)\n * Starting over… (18)\n * Lost in the woods (18)\n * Expectations (18)\n * What is your mission? (18)\n * What is bi-vocational ministry? 
(17)\n\n Categories\n\n * Apologetics (6)\n * Bi-vocational Ministry (1)\n * Christ and culture (15)\n * Discipleship (18)\n * Featured (3)\n * Mission (2)\n * Writing (19)\n\n Friends\n\n Sitemeter\n\n AdSense\n\n Word Count Statistics\n\n Total: 21,977 Words\n Posts: 21,458 Words (511 Avg.)\n\n Powered by WordPress | Designed by Elegant Themes\n', 'clueweb12-0000tw-00-01002': ' Bean Pots Cast Iron\n\n « Joyce Chen J90-0704 6-Quart Stoneware Chinese Cooking Pot, Black Cheapest Prices\n Le Creuset Heritage Collection Enameled Cast-Iron 2-Quart Legumier, Cobalt Online Shopping »\n\n Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Save Price Shopping Online\n\n ►►►Save Big on Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red\n\n Find Deals on Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red. We Offer Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red For Shopping Deals\n\n I read a lot of the Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red affordable price reviews before I bought this. Don’t spend more than you have to! I already done the research for you. Read Here where to buy Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red at The CRAZY Prices!\n\n Visit the bestsellers in Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red list for authoritative information on this product – current rank.\n\n read more\n\n Since 1925, Le Creuset has been handcrafting Enameled Cast Iron cookware, and particularly Round French Ovens (or Dutch Ovens), in Northern France. While this popular shape has been around for many centuries before that, the basic design has changed very little thus endorsing the cooking qualities that it provides. Generation after generation has come to cherish the Le Creuset Round French Oven’s quality, durability, and versatility, and it easily becomes the core piece in any well-equipped kitchen. 
The cast iron provides superb heat retention and distribution, and the enamel is hard-wearing and non-reactive, making the number of recipes that you can do in this pot endless: anything from savory rice to braised chicken to mouth-watering cake. Your imagination is the only limit!.\n\n » See More Images «\n\n Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red | Save Price Shopping Online at Shop Online\n\n Price List: $375.00\n GET DISCOUNT / FREE SHIPPING\n » See Best Price « \n \n\n Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Product Feature\n\n * 7-1/4-quart round-shaped French oven made of enameled cast iron\n * Cast-iron loop side handles; black, phenolic, stay-cool lid knob\n * Heavy, tight-fitting lid helps lock in heat, moisture, and flavor\n * Washing by hand recommended; oven-safe to 350 degrees F\n * Measures 11-3/5 by 14 by 7 inches; limited lifetime warranty\n\n I have tried to find information for Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red on the Internet. There are a lot affordable price as well. 
You can purchase a Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red in the best reasonable price of the web site, Amazon, you will not hesitate a moment.\n\n ★★★ HOT ITEM LIKE THIS TEND TO SELL OUT VERY QUICKLY ★★★\n\n\n\n Title\n\n\n 4\n 29\n\n\n Relate keywords : bean pots cast iron,ebay cast iron,cookware cast iron,crocks cast iron,bed bath and beyond cast iron,kohls cast iron,bean pots brass,bean pots gas,bean pots metal,bean pots porcelain,bean pots copper,bean pot recipes cast iron,walmart cast iron,bean pots cast aluminum,bean pots dutch oven,bean pots stainless steel,pottery cast iron,williams sonoma cast iron,bean pots steel,bean pots cast iron camp,bean pots cast iron covered\n\n Comments\n\n Tags: 714Quart, CastIron, Cherry, Creuset, Enameled, French, French CastIron, Online, Shopping, Shopping 714Quart\n\n Comments are closed.\n\n Recent Posts\n * Nordic Ware Grilling Essentials Cast Bean Pot Save Price with Promotion Today\n * Le Creuset Heritage Collection Enameled Cast-Iron 2-Quart Legumier, Cobalt Online Shopping\n * Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Save Price Shopping Online\n * Joyce Chen J90-0704 6-Quart Stoneware Chinese Cooking Pot, Black Cheapest Prices\n * LE CREUSET Enameled Cast Iron 4-1/4 Quart Soup Pot Blue Online Shopping\n * Bayou Classic 7448, 2.5-Qt. 
Cast Iron Bean Pot with Lid Online Shopping\n * Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Get Discount\n Tags\n 714Quart CastIron CastIron 714Quart Cherry Creuset Discount Enameled French Online Shopping\n\n Powered by WordPress and Created for Bean Pots Cast Iron | Waffle Iron Shapes\n\n http://beanpotscastiron.waffleironshapes.com is a participant in the Amazon Services LLC Associates Program, an affiliate advertising program designed to provide a means for sites to earn advertising fees by advertising and linking to amazon.com\n\n Web Toolbar by Wibiya', } self._test_docs(ir_datasets.wrappers.HtmlDocExtractor(ir_datasets.load('clueweb12'), extractor='inscriptis'), items={ 0: WarcDoc('clueweb12-0000tw-00-00000', 'http://tsawer.net/2012/02/10/france-image-pool-2012-02-10-162252/', '2012-02-10T22:50:41Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 22:50:40 GMT\\\r\nServer: Apache/2\\.2\\.21 \\(Unix\\) mod_ssl/2\\.2\\.21 Op.{338}ortlink\\\r\nVary: Accept\\-Encoding,User\\-Agent\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(re.escape(extracted_text['clueweb12-0000tw-00-00000']), flags=48), 'text/plain'), 9: WarcDoc('clueweb12-0000tw-00-00009', 'http://claywginn.com/2012/02/10/lessons-learned-from-a-week-on-vacation/', '2012-02-10T21:47:35Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:47:36 GMT\\\r\nServer: Apache\\\r\nX\\-Powered\\-By: PHP/5\\.2\\.17\\\r\nX\\-Pi.{45}: <http://wp\\.me/p1zQki\\-AT>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(re.escape(extracted_text['clueweb12-0000tw-00-00009']), flags=48), 'text/plain'), 1000: WarcDoc('clueweb12-0000tw-00-01002', 'http://beanpotscastiron.waffleironshapes.com/le-creuset-enameled-cast-iron-7-14-quart-round-french-oven-cherry-red-save-price-shopping-online/', '2012-02-10T21:55:43Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 
2012 21:55:42 GMT\\\r\nServer: Apache\\\r\nX\\-Pingback: http://beanpotscas.{70}waffleironshapes\\.com/\\?p=5>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(re.escape(extracted_text['clueweb12-0000tw-00-01002']), flags=48), 'text/plain'), }) def test_clueweb12_docstore(self): docstore = ir_datasets.load('clueweb12').docs_store() docstore.clear_cache() with _logger.duration('cold fetch'): docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516']) docstore.clear_cache() with _logger.duration('cold fetch (cleared)'): docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516']) with _logger.duration('warm fetch'): docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516']) docstore = ir_datasets.load('clueweb12').docs_store() with _logger.duration('warm fetch (new docstore)'): docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516']) with _logger.duration('cold fetch (nearby)'): docstore.get_many(['clueweb12-0000tw-05-00020', 'clueweb12-0000tw-05-12201', 'clueweb12-0106wb-18-19412']) with _logger.duration('cold fetch (earlier)'): docstore.get_many(['clueweb12-0000tw-05-00001', 'clueweb12-0106wb-18-08131']) docstore.clear_cache() with _logger.duration('cold fetch (earlier, cleared)'): docstore.get_many(['clueweb12-0000tw-05-00001', 'clueweb12-0106wb-18-08131']) def test_clueweb12_queries(self): self._test_queries('clueweb12/trec-web-2013', count=50, items={ 0: TrecWebTrackQuery('201', 'raspberry pi', '\n What is a raspberry pi?\n ', 'faceted', (TrecSubtopic(number='1', text='\n What is a raspberry pi?\n ', type='inf'), TrecSubtopic(number='2', text='\n What software does a raspberry pi use?\n ', type='inf'), TrecSubtopic(number='3', text='\n What are hardware options for a raspberry pi?\n ', type='inf'), 
TrecSubtopic(number='4', text='\n How much does a basic raspberry pi cost?\n ', type='nav'), TrecSubtopic(number='5', text='\n Find info about the raspberry pi foundation.\n ', type='inf'), TrecSubtopic(number='6', text='\n Find a picture of a raspberry pi.\n ', type='nav'))), 9: TrecWebTrackQuery('210', 'golf gps', '\n What is the best golf gps device?\n ', 'faceted', (TrecSubtopic(number='1', text='\n What is the best golf gps device?\n ', type='inf'), TrecSubtopic(number='2', text='\n Compare Bushnell, Callaway and Garmin golf gps systems.\n ', type='inf'), TrecSubtopic(number='3', text='\n Is there a golf gps app for the Iphone?\n ', type='nav'), TrecSubtopic(number='4', text='\n Find information on handheld golf gps devices.\n ', type='inf'), TrecSubtopic(number='5', text='\n Is there a golf gps system that can be used world wide?\n ', type='nav'), TrecSubtopic(number='6', text='\n Where can I get a used golf gps device?\n ', type='inf'))), 49: TrecWebTrackQuery('250', 'ford edge problems', '\n What problems have afflicted the Ford Edge car model?\n ', 'single', ()), }) self._test_queries('clueweb12/trec-web-2014', count=50, items={ 0: TrecWebTrackQuery('251', 'identifying spider bites', '\n \tFind data on how to identify spider bites.\n ', 'single', ()), 9: TrecWebTrackQuery('260', 'the american revolutionary', '\n \tFind a list of the major battles of the American Revolution.\n ', 'faceted', (TrecSubtopic(number='1', text='\n \tFind a list of the major battles of the American Revolution.\n ', type='nav'), TrecSubtopic(number='2', text='\n \tFind a time line of the American Revolution.\n ', type='nav'), TrecSubtopic(number='3', text='\n \tFind images of the American Revolution.\n ', type='inf'), TrecSubtopic(number='4', text='\n \tWhat were the causes of the American revolutionary war?\n ', type='inf'), TrecSubtopic(number='5', text='\n \tWhat is the history of the American revolutionary war?\n ', type='inf'))), 49: TrecWebTrackQuery('300', 'how to find the 
mean', '\n \tFind a page that explains how to compute the mean of a set of numbers.\n ', 'single', ()), }) self._test_queries('clueweb12/b13/ntcir-www-1', count=100, items={ 0: GenericQuery('0001', 'ascii code'), 9: GenericQuery('0010', 'Jurassic World'), 99: GenericQuery('0100', 'weight loss'), }) self._test_queries('clueweb12/b13/ntcir-www-2', count=80, items={ 0: NtcirQuery('0001', 'Halloween picture', 'Halloween is coming. You want to find some pictures about Halloween to introduce it to your children.'), 9: NtcirQuery('0010', 'career plan', 'You are an undergraduate student who is about to graduate. You want to search some information about how to plan your career.'), 79: NtcirQuery('0080', 'www.gardenburger.com', 'You want to find the website "www.gardenburger.com"'), }) self._test_queries('clueweb12/b13/ntcir-www-3', count=160, items={ 0: NtcirQuery('0001', 'Halloween picture', 'Halloween is coming. You want to find some pictures about Halloween to introduce it to your children.'), 9: NtcirQuery('0010', 'career plan', 'You are an undergraduate student who is about to graduate. You want to search some information about how to plan your career.'), 159: NtcirQuery('0180', 'quincy jones productions', 'You want a list of famous records produced by Quincy Jones.'), }) self._test_queries('clueweb12/b13/trec-misinfo-2019', count=51, items={ 0: MisinfoQuery('1', 'cranberries urinary tract infections', '10.1002/14651858.CD001321.pub5', 'Can cranberries prevent urinary tract infections?', 'Symptoms of a urinary tract infection (UTI) include burning while urinating and a persistent urge to urinate. Relevant documents should discuss the effectiveness of consuming cranberries or cranberry juice for prevention of UTIs. 
This topic is specifically about prevention rather than treatment of an existing infection.'), 9: MisinfoQuery('10', 'gene therapy sickle cell', '10.1002/14651858.CD007652.pub6', 'Can gene therapy prevent complications caused by sickle cell disease?', 'Sickle cell disease (SCD) is an inherited blood disorder that affects the development of healthy red blood cells and causes red blood cells to change their form from a normal round shape to a crescent and rigid shape. People with sickle cell disease have fewer healthy blood cells, which can affect their oxygen carrying capacity and lead to serious or life-threatening complications. Gene therapy, as a newly advanced field, is claimed to be helpful for this disease. A relevant document discusses using gene therapy for preventing the symptoms and complications of SCD.'), 50: MisinfoQuery('51', 'dehumidifiers asthma', '10.1002/14651858.CD003563.pub2', 'Can dehumidifiers be used to control asthma?', 'Dehumidification homes might improve lives of people with asthma. Dehumidifiers are electronic devices to control the level of humidity of environment which is suggested to contribute to factors that might affect asthma. 
A relevant document should discuss whether or not dehumidifiers can be used to control asthma symptoms or can improve lives of people with asthma.'), }) self._test_queries('clueweb12/b13/clef-ehealth', count=300, items={ 0: GenericQuery('101001', 'inguinal hernia repair laproscopic mesh benefits risks'), 9: GenericQuery('102004', '"anal" skin tags removal or treatments "recovery"'), 299: GenericQuery('150006', 'what causes painful erections after have a foley catheter'), }) self._test_queries('clueweb12/b13/clef-ehealth/cs', count=300, items={ 0: GenericQuery('101001-cs', 'korekce inguinální hernie laparoskopická síťka přínosy rizika'), 9: GenericQuery('102004-cs', 'odstranění kožních výrůstků v oblasti konečníku nebo zotavení se z léčby'), 299: GenericQuery('150006-cs', 'co způsobuje bolestivou erekci po zavedení Foleyova katétru'), }) self._test_queries('clueweb12/b13/clef-ehealth/de', count=300, items={ 0: GenericQuery('101001-de', 'Leistenbruch Reparatur laparoskopisch Netz Vorteile Risiken'), 9: GenericQuery('102004-de', 'anal "Hautauswuchs Entfernung oder Behandlungen" Heilung'), 299: GenericQuery('150006-de', 'was verursacht schmerzhafte Erektion nach einem Foley-Katheter'), }) self._test_queries('clueweb12/b13/clef-ehealth/fr', count=300, items={ 0: GenericQuery('101001-fr', 'avantages et risques du traitement des hernies inguinales par laparoscopie à maillage '), 9: GenericQuery('102004-fr', 'l\'élimination des balises anales de peau ou "la rémission" après les traitements'), 299: GenericQuery('150006-fr', 'quelle est la cause des érections douloureuses après avoir eu la sonde de Foley sur place'), }) self._test_queries('clueweb12/b13/clef-ehealth/hu', count=300, items={ 0: GenericQuery('101001-hu', 'lágyéksérv helyreállítás laparoszkópiás háló előnyök kockázatok'), 9: GenericQuery('102004-hu', 'anális" bőrfüggelékek eltávolítás or kezelések "gyógyulás'), 299: GenericQuery('150006-hu', 'mi okozza a fájdalmas erekciót foley katéterezést követően'), }) 
self._test_queries('clueweb12/b13/clef-ehealth/pl', count=300, items={ 0: GenericQuery('101001-pl', 'operacja laparoskopowa przepukliny pachwinowej z użyciem siatki korzyści ryzyko'), 9: GenericQuery('102004-pl', 'odbytowy "usunięcie brodawek miękkich skóry lub leczenie" powrót do zdrowia'), 299: GenericQuery('150006-pl', "co powoduje bolesne erekcje po cewnikowaniu cewnikiem Foley'a"), }) self._test_queries('clueweb12/b13/clef-ehealth/sv', count=300, items={ 0: GenericQuery('101001-sv', 'ljumskbråck reparation laparoskopisk nät fördelar risker'), 9: GenericQuery('102004-sv', 'anal" hudflikar borttagning eller behandlingar "återhämtning'), 299: GenericQuery('150006-sv', 'vad som orsakar smärtsamma erektioner efter att ha haft en Foley-kateter'), }) def test_clueweb12_qrels(self): self._test_qrels('clueweb12/trec-web-2013', count=14474, items={ 0: TrecQrel('201', 'clueweb12-0000tw-05-12114', 1, '0'), 9: TrecQrel('201', 'clueweb12-0108wb-22-26598', 0, '0'), 14473: TrecQrel('250', 'clueweb12-1914wb-21-25488', 0, '0'), }) self._test_qrels('clueweb12/trec-web-2013/diversity', count=46985, items={ 0: TrecSubQrel('201', 'clueweb12-0000tw-05-12114', 1, '1'), 9: TrecSubQrel('201', 'clueweb12-0108wb-22-26598', 0, '1'), 46984: TrecSubQrel('250', 'clueweb12-1914wb-21-25488', 0, '0'), }) self._test_qrels('clueweb12/trec-web-2014', count=14432, items={ 0: TrecQrel('251', 'clueweb12-0000tw-34-04382', 1, '0'), 9: TrecQrel('251', 'clueweb12-0000wb-90-35684', 1, '0'), 14431: TrecQrel('300', 'clueweb12-1911wb-40-07107', 0, '0'), }) self._test_qrels('clueweb12/trec-web-2014/diversity', count=43840, items={ 0: TrecSubQrel('251', 'clueweb12-0000tw-34-04382', 1, '0'), 9: TrecSubQrel('251', 'clueweb12-0000wb-90-35684', 1, '0'), 43839: TrecSubQrel('300', 'clueweb12-1911wb-40-07107', 0, '0'), }) self._test_qrels('clueweb12/b13/ntcir-www-1', count=25465, items={ 0: TrecQrel('0001', 'clueweb12-0000wb-16-36432', 3, '0'), 9: TrecQrel('0001', 'clueweb12-0002wb-06-22258', 1, '0'), 25464: 
TrecQrel('0100', 'clueweb12-1913wb-56-22315', 2, '0'), }) self._test_qrels('clueweb12/b13/ntcir-www-2', count=27627, items={ 0: TrecQrel('0001', 'clueweb12-0000wb-83-31594', 0, '0'), 9: TrecQrel('0001', 'clueweb12-0003wb-36-30766', 0, '0'), 27626: TrecQrel('0080', 'clueweb12-1910wb-52-19011', 0, '0'), }) self._test_qrels('clueweb12/b13/trec-misinfo-2019', count=22859, items={ 0: MisinfoQrel('1', 'clueweb12-0000wb-03-01030', 1, 2, 0), 9: MisinfoQrel('1', 'clueweb12-0002wb-45-16639', 0, -1, -1), 22858: MisinfoQrel('51', 'clueweb12-1913wb-78-31232', 0, -1, -1), }) self._test_qrels('clueweb12/b13/clef-ehealth', count=269232, items={ 0: EhealthQrel('101001', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'), 9: EhealthQrel('101004', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'), 269231: EhealthQrel('150006', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'), }) self._test_qrels('clueweb12/b13/clef-ehealth/cs', count=269232, items={ 0: EhealthQrel('101001-cs', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'), 9: EhealthQrel('101004-cs', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'), 269231: EhealthQrel('150006-cs', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'), }) self._test_qrels('clueweb12/b13/clef-ehealth/de', count=269232, items={ 0: EhealthQrel('101001-de', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'), 9: EhealthQrel('101004-de', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'), 269231: EhealthQrel('150006-de', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'), }) self._test_qrels('clueweb12/b13/clef-ehealth/fr', count=269232, items={ 0: EhealthQrel('101001-fr', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'), 9: EhealthQrel('101004-fr', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'), 269231: EhealthQrel('150006-fr', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'), }) self._test_qrels('clueweb12/b13/clef-ehealth/hu', count=269232, items={ 0: EhealthQrel('101001-hu', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'), 9: EhealthQrel('101004-hu', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'), 269231: 
EhealthQrel('150006-hu', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'),
        })
        # The translated CLEF eHealth qrels (pl, sv) mirror the English set:
        # same documents, counts, and grades — only the query-ID language
        # suffix differs.
        self._test_qrels('clueweb12/b13/clef-ehealth/pl', count=269232, items={
            0: EhealthQrel('101001-pl', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'),
            9: EhealthQrel('101004-pl', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'),
            269231: EhealthQrel('150006-pl', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'),
        })
        self._test_qrels('clueweb12/b13/clef-ehealth/sv', count=269232, items={
            0: EhealthQrel('101001-sv', 'clueweb12-0000tw-08-16795', 0, 0, 95, '0'),
            9: EhealthQrel('101004-sv', 'clueweb12-0000wb-06-29427', 0, 50, 99, '0'),
            269231: EhealthQrel('150006-sv', 'clueweb12-0504wb-17-23016', 0, 70, 80, '1'),
        })


if __name__ == '__main__':
    unittest.main()


================================================ FILE: test/integration/codec.py ================================================

import re
import unittest

from ir_datasets.datasets.codec import CodecQuery, CodecDoc
from ir_datasets.formats import TrecQrel

from .base import DatasetIntegrationTest


class TestCodec(DatasetIntegrationTest):
    # Integration tests for the CODEC dataset. Each test spot-checks the
    # first (index 0), tenth (index 9), and last (count - 1) item of the
    # docs / queries / qrels iterators against known fixture values.

    def test_docs(self):
        # Document bodies are checked via regex: literal head and tail with
        # the long middle elided by a counted wildcard (e.g. .{2099}).
        # flags=48 == re.DOTALL | re.UNICODE.
        self._test_docs('codec', count=729824, items={
            0: CodecDoc('5f8fddcc5a1a25d5fce9ec5e331ba8dd', "Apple's Tim Cook calls for calm heads on China, US trade", re.compile("^Apple's Tim Cook calls for calm heads on China, US trade\nApple CEO Tim Cook\\.Getty Images\nApple Chief.{2099}d last year when he called for China to increase trade and continue opening itself up to the world\\.\n$", flags=48), 'https://www.cnbc.com/2018/03/24/apples-tim-cook-calls-for-calm-heads-on-china-us-trade.html'),
            9: CodecDoc('ac2c854a75425587461307c506c48330', 'Buying the yen to protect against a trade war is ‘pretty peculiar,’ wealth manager says', re.compile('^Buying the yen to protect against a trade war is ‘pretty peculiar,’ wealth manager says\nVIDEO3:5803:.{2303}d that it would look to be exempted from the metal tariffs, though they came into effect on Friday\\.\n$', flags=48), 
'https://www.cnbc.com/2018/03/26/yen-to-protect-against-a-trade-war-is-wrong.html?__source=twitter%7Cmain'), 729823: CodecDoc('2dc726c7c5de68b9766e35336fa214a0', 'Damped vibration', re.compile('^Damped vibration\nThe simple harmonic oscillations discussed above continue forever, at constant ampl.{103}tors behave somewhat differently, however\\. Harmonic oscillations tend to die away as time goes on\\.…\n$', flags=48), 'https://www.britannica.com/science/damped-vibration'), }) def test_queries(self): self._test_queries('codec', count=42, items={ 0: CodecQuery('economics-1', "How has the UK's Open Banking Regulation benefited challenger banks?", 'finance', 'UK’s Open Banking regulation, which has parallels to the EU’s second payment service directive (PSD2), went live in January 2018. This piece of legislation “will require banks to open their payments infrastructure and customer data assets to third parties”. As a result, banks no longer have a monopoly on user data if clients grant permission. \n\nChallenger banks are small, recently created retail banks that compete directly with the longer-established banks in the UK. Specifically, seeking market share from the "big four" UK retail banks (Barclays, HSBC, Lloyds Banking Group, and NatWest Group). The banks distinguish themselves from the historic banks by modern financial technology practices, such as online-only operations, that avoid the costs and complexities of traditional banking. 
The largest UK-operating challenger banks include Atom Bank, Revolut, Starling, N26, and Tide.\n\nRelevant documents and entities will discuss how challenger banks have used open banking to develop new products or capture market share from traditional retail banks in the UK.'), 9: CodecQuery('economics-18', 'Was the crash that followed the dot-com bubble an overreaction considering the ultimate success of the internet?', 'finance', 'The dot-com buddle from 1995-2000 saw incredible growth in stocks that were considered to have anything to do with the internet. NASDAQ rose 400% only to fall 78% from its peak by 2002. Several high-profile failures included Pets.com, Boo.com, Woldcom, and Global Crossing. However, many internet-based companies survived and went on to thrive, i.e. eBay, Amazon, Qualcomm, Cisco Systems. Venture Capital pulled back from the internet space for a period.\n\nHowever, with FANNG stocks and incredible software/internet-based companies performance from the mid-2000s until the present day, it could be argued that the direction of the dot-com bubble was absolutely correct. Nonetheless, the magnitude of equity growth and capital allocation to companies with limited commercial value was irrational and required a market correction.'), 41: CodecQuery('politics-23', 'Is the rise of European populism a threat to the European Union?', 'politics', '"Populist" is a broad term that describes, typically a politician, who targets people who feel that established elite groups disregard their concerns. Some critics highlight negative connotations, including criticising foreign migration or minorities.\n\nIn recent years, populism has been stronger in Eastern Europe, i.e. Bulgaria, Hungary, Austrian, and Poland, etc. have seen the rise of populist politicians. Right-wing populist movements have also gained momentum in France, Spain, the United Kingdom and other parts of Europe. 
In Hungary and Poland, some critics argue this has led to an erosion of the rule of law, increased persecution, and authoritarianism.\n\nEuropean Union is a political union of democratic nations. However, radical right-wing politics reject what the EU stands for and how it works, i.e. against European supranational integration and push for national policies. Populists also criticise the EU\'s perceived bureaucracy and failures - common arguments during the Brexit Leave campaign. The EU is founded based on shared democratic values that countries need to be relatively ideologically aligned to function within a political union. There is also the threat that disillusioned Eastern European countries will turn away from the EU and toward Russia.'), }) self._test_queries('codec/economics', count=14, items={ 0: CodecQuery('economics-1', "How has the UK's Open Banking Regulation benefited challenger banks?", 'finance', 'UK’s Open Banking regulation, which has parallels to the EU’s second payment service directive (PSD2), went live in January 2018. This piece of legislation “will require banks to open their payments infrastructure and customer data assets to third parties”. As a result, banks no longer have a monopoly on user data if clients grant permission. \n\nChallenger banks are small, recently created retail banks that compete directly with the longer-established banks in the UK. Specifically, seeking market share from the "big four" UK retail banks (Barclays, HSBC, Lloyds Banking Group, and NatWest Group). The banks distinguish themselves from the historic banks by modern financial technology practices, such as online-only operations, that avoid the costs and complexities of traditional banking. 
The largest UK-operating challenger banks include Atom Bank, Revolut, Starling, N26, and Tide.\n\nRelevant documents and entities will discuss how challenger banks have used open banking to develop new products or capture market share from traditional retail banks in the UK.'), 9: CodecQuery('economics-18', 'Was the crash that followed the dot-com bubble an overreaction considering the ultimate success of the internet?', 'finance', 'The dot-com buddle from 1995-2000 saw incredible growth in stocks that were considered to have anything to do with the internet. NASDAQ rose 400% only to fall 78% from its peak by 2002. Several high-profile failures included Pets.com, Boo.com, Woldcom, and Global Crossing. However, many internet-based companies survived and went on to thrive, i.e. eBay, Amazon, Qualcomm, Cisco Systems. Venture Capital pulled back from the internet space for a period.\n\nHowever, with FANNG stocks and incredible software/internet-based companies performance from the mid-2000s until the present day, it could be argued that the direction of the dot-com bubble was absolutely correct. Nonetheless, the magnitude of equity growth and capital allocation to companies with limited commercial value was irrational and required a market correction.'), 13: CodecQuery('economics-23', 'Offering non-accounting services arguably creates a conflict of interest for the Big Four. Is this the reason for their inability to uncover recent financial scandals?', 'finance', 'The Big Four are the four largest global accounting firms that dominate corporate accounting, i.e. Deloitte, Ernst & Young (EY), PricewaterhouseCoopers (PwC), and Klynveld Peat Marwick Goerdeler (KPMG). As well as offering accounting services, these firms also over other many higher-margin services such as tax, consultancy, and technology services (~80% revenue). \n\nInternal controls (firewalls, departments, etc.) 
are meant to prohibit any cooperation between audit and non-audit services in winning customer contracts or favour. However, critics would argue that this does not go far enough and can lead to less rigorous audits, calling for the Big Four to be broken up. Critics highlight poor practices in recent accounting scandals, including Wirecard, Carillion, Satyam Computer Services, South Africa examples, etc. \n\nHowever, it should be highlighted that even if the Big Four were separated, this would not necessarily lead to no accounting scandals. Some fraudulent practices may be hard to catch even if objective and following best practices. Also, there are other areas of possible conflict of interest that non-accounting work, including how the corporate hires the auditor.'), }) self._test_queries('codec/history', count=14, items={ 0: CodecQuery('history-1', 'Would the United Kingdom have been ready for WWII without the time gained through Appeasement?', 'history', "Many argue Britain's army was depleted in the early 1930s and stretched across the globe. UK defence spending had fallen significantly during the 1920s, from over £700 million in 1919 to 100 million in 1931.\n\nBetween 1934 and 1939, the UK launched a substantial programme of re-arming, recognising that war with Hitler was becoming increasingly likely. Although Appeasement was also motivated by Chamberlain's desire to end war, some argue this meant that the UK was more prepared in 1939 when war eventually broke out. \n\nDespite these efforts, Germany was still better prepared for war under Hilter's single-minded preparation since he came to power in 1933. 
However, without Appeasement, the differential might have been much worse."), 9: CodecQuery('history-19', 'How close did the world come to nuclear war during the Cuban Missile Crisis?', 'history', 'During the Cuban Missile Crisis, leaders of the United States and the Soviet Union engaged in a 13-day political and military standoff in October 1962 over the installation of nuclear-armed Soviet missiles on Cuba. This was the peak of the Cold War and a high-stakes political and military situation, given the potential devastation of nuclear weapons. However, exactly how close we came to nuclear armageddon is still debated. \n\nThe political leaders within this crisis were JFK for the United States, Nikita Khrushchev of the Soviet Union, and Fidel Casto of Cuba. Many highlights that JFK and Khrushchev were measured in their leadership styles and understood nuclear war meant mutual destruction. Some highlight exchanging of letters and other communications to prevent a nuclear war. Nonetheless, many highlight the mistrust and fear between both sides and how a single false move could have led to a disaster. Both sides were actively preparing for a nuclear war, and some within each camp through nuclear strikes was likely.'), 13: CodecQuery('history-25', 'How responsible was Rasputin for the fall of the Romanov dynasty?', 'history', "The Russian Tsar, Nicholas II, abdicated from power in 1917, bringing the 300-year-old Romanov dynasty to an end. Some historians suggest that Grigori Rasputin's scandalous reputation helped discredit the Tsarist government and helped to lay the foundation for the Russian Revolution. \n\nHowever, many historians argue that although Rasputin was a useful propaganda tool against the Tsar, much larger factors were at play. For example, Nicholas II was viewed as a weak leader, the Russo-Japanese War, Bloody Sunday, Tsarina unpopularity, and WWI. 
There was also significant economic issues, including inflation and food shortages."), }) self._test_queries('codec/politics', count=14, items={ 0: CodecQuery('politics-1', 'Is Scottish Independence inevitable?', 'politics', "This questions focuses on the long-term political, economic and social reasoning behind whether Scotland will likely become independent. Short-term facts and opinions are less central to this question.\n\nSome argue that Scottish independence is inevitable given the surge of support towards SNP in recent decades. Labour has lost political weight in Scotland since the Blair era, and Conservatives historically struggle to penetrate a more left-wing Scottish demographic. Brexit further exacerbated this political unease, i.e. right-leaning Britain and left-leaning Scotland. While the younger demographic is more likely to be independence supporters and the older demographic is more likely to be pro-union.\n\nThere are several arguments that Scotland independence is not inevitable. Historically there has been a union for 300+ years covering a full range of circumstances (world wars, successes). Economically, Britain funds lots of Scottish spending through Barnett-based funding, Scotland's oil is less valuable than previously, and there is much economic uncertainly around currency and debt. Some also argue that Scotland has large political independence due to devolution; thus, independence is unnecessary."), 9: CodecQuery('politics-16', 'Why did Hilary Clinton lose the 2016 US presidential election?', 'politics', 'Hillary Clinton lost the 2016 US presidential election to Donald Trump in 2016. Political commentators highlight many reasons for Clinton\'s loss. \n\nFor example, Donald Trump managed to craft a strong populist message that resonated with many voters who were disenfranchised with current politics, particularly with the "political elite" who some felt Clinton represented. 
The Democratats were also divided, specifically far-left factions led by Bernie Sanders. Political gridlock under an Obama Administration that, rightly or wrongly, some felt the public wanting change. Hilary also had relatively low personal approval ratings, which were not helped by the FBI investigation into her use of email. Some also point to external factors, including Russian interference.'), 13: CodecQuery('politics-23', 'Is the rise of European populism a threat to the European Union?', 'politics', '"Populist" is a broad term that describes, typically a politician, who targets people who feel that established elite groups disregard their concerns. Some critics highlight negative connotations, including criticising foreign migration or minorities.\n\nIn recent years, populism has been stronger in Eastern Europe, i.e. Bulgaria, Hungary, Austrian, and Poland, etc. have seen the rise of populist politicians. Right-wing populist movements have also gained momentum in France, Spain, the United Kingdom and other parts of Europe. In Hungary and Poland, some critics argue this has led to an erosion of the rule of law, increased persecution, and authoritarianism.\n\nEuropean Union is a political union of democratic nations. However, radical right-wing politics reject what the EU stands for and how it works, i.e. against European supranational integration and push for national policies. Populists also criticise the EU\'s perceived bureaucracy and failures - common arguments during the Brexit Leave campaign. The EU is founded based on shared democratic values that countries need to be relatively ideologically aligned to function within a political union. 
There is also the threat that disillusioned Eastern European countries will turn away from the EU and toward Russia.'),
        })

    def test_qrels(self):
        # Qrels spot-checks for the full dataset and the per-domain subsets;
        # the subset counts sum to the full count (1970 + 2024 + 2192 = 6186).
        self._test_qrels('codec', count=6186, items={
            0: TrecQrel('economics-8', '089a22846f6ba15fb4ef4cca0a884dd4', 2, 'Q0'),
            9: TrecQrel('economics-8', '24de81ea95c7df32941e8bd200d3528a', 2, 'Q0'),
            6185: TrecQrel('politics-23', 'f7975664f7fc45416c7256d12d06fbdf', 1, 'Q0'),
        })
        self._test_qrels('codec/economics', count=1970, items={
            0: TrecQrel('economics-8', '089a22846f6ba15fb4ef4cca0a884dd4', 2, 'Q0'),
            9: TrecQrel('economics-8', '24de81ea95c7df32941e8bd200d3528a', 2, 'Q0'),
            1969: TrecQrel('economics-23', 'ebd36155b9808933bbbfc26af6d18dec', 1, 'Q0'),
        })
        self._test_qrels('codec/history', count=2024, items={
            0: TrecQrel('history-20', '00aa648a657bdf73369bcb093030cc41', 0, 'Q0'),
            9: TrecQrel('history-20', '0d239d8dea605cd079d1a0144aa6ed46', 1, 'Q0'),
            2023: TrecQrel('history-25', 'dba9433750c2505f2a6a69661d4eb2fd', 0, 'Q0'),
        })
        self._test_qrels('codec/politics', count=2192, items={
            0: TrecQrel('politics-12', '02d8176599da0bfc15caaf7e0b3bba6b', 3, 'Q0'),
            9: TrecQrel('politics-12', '0e23e45de303e5f440a5a17ccd7974ec', 1, 'Q0'),
            2191: TrecQrel('politics-23', 'f7975664f7fc45416c7256d12d06fbdf', 1, 'Q0'),
        })


if __name__ == '__main__':
    unittest.main()


================================================ FILE: test/integration/codesearchnet.py ================================================

import re
import unittest
from ir_datasets.datasets.codesearchnet import CodeSearchNetDoc, CodeSearchNetChallengeQrel
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel

from .base import DatasetIntegrationTest


class TestCodeSearchNet(DatasetIntegrationTest):
    # Integration tests for CodeSearchNet: spot-checks the first (0),
    # tenth (9), and last (count - 1) item of each iterator.

    def test_codesearchnet_docs(self):
        # Function bodies are matched via regex with the middle elided by a
        # counted wildcard; flags=48 == re.DOTALL | re.UNICODE.
        self._test_docs('codesearchnet', count=2070536, items={
            0: CodeSearchNetDoc('https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/youtube.py#L135-L143', 'soimort/you-get', 
'src/you_get/extractors/youtube.py', 'YouTube.get_vid_from_url', re.compile('^def get_vid_from_url\\(url\\):\n """Extracts video ID from URL\\.\n """\n return match1\\(.{210} parse_query_param\\(url, \'v\'\\) or \\\\\n parse_query_param\\(parse_query_param\\(url, \'u\'\\), \'v\'\\)$', flags=48), 'python'), 9: CodeSearchNetDoc('https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/sina.py#L54-L64', 'soimort/you-get', 'src/you_get/extractors/sina.py', 'sina_download_by_vkey', re.compile('^def sina_download_by_vkey\\(vkey, title=None, output_dir=\'\\.\', merge=True, info_only=False\\):\n """Dow.{229} info_only:\n download_urls\\(\\[url\\], title, \'flv\', size, output_dir = output_dir, merge = merge\\)$', flags=48), 'python'), 2070535: CodeSearchNetDoc('https://github.com/christophehurpeau/SpringbokJS/blob/bc1069baafc0785d361a33ff5a2fa604b8b3b454/src/browser/base/S.History.js#L72-L78', 'christophehurpeau/SpringbokJS', 'src/browser/base/S.History.js', '', re.compile('^function\\(fragmentOverride,state\\)\\{\n\\\t\\\t\\\tvar fragment = baseUrl\\+\\( this\\.fragment = this\\.getFragment\\(fragm.{32}\\\t\\\t\\\t\\\tvar a=\\$\\(\'a\\[href="\'\\+fragment\\+\'"\\]\'\\);\n\\\t\\\t\\\t\\\ta\\.length===0 \\? 
S\\.redirect\\(fragment\\) : a\\.click\\(\\);\n\\\t\\\t\\\t\\}\n\\\t\\\t\\}$', flags=48), 'javascript'), }) def test_codesearchnet_queries(self): self._test_queries('codesearchnet/train', count=1880853, items={ 0: GenericQuery('https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108', 'Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in source code to see train_dir example tree structure)\n\n Structure:\n <train_dir>/\n ├── <person1>/\n │ ├── <somename1>.jpeg\n │ ├── <somename2>.jpeg\n │ ├── ...\n ├── <person2>/\n │ ├── <somename1>.jpeg\n │ └── <somename2>.jpeg\n └── ...\n\n :param model_save_path: (optional) path to save model on disk\n :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree\n :param verbose: verbosity of training\n :return: returns knn classifier that was trained on the given data.'), 9: GenericQuery('https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/face_recognition/api.py#L135-L151', "Returns an 2d array of bounding boxes of human faces in a image using the cnn face detector\n If you are using a GPU, this can give you much faster results since the GPU\n can process batches of images at once. If you aren't using a GPU, you don't need this function.\n\n :param img: A list of images (each as a numpy array)\n :param number_of_times_to_upsample: How many times to upsample the image looking for faces. 
Higher numbers find smaller faces.\n :param batch_size: How many images to include in each GPU processing batch.\n :return: A list of tuples of found face locations in css (top, right, bottom, left) order"), 1880852: GenericQuery('https://github.com/nutella-framework/nutella_lib.js/blob/b3a3406a407e2a1ada6edcc503b70991f9cb249b/src/run_net_bin.js#L87-L102', 'Helper function This function uploads a file with a certain file name. If the upload is successful the first callback is executed, otherwise the second one is.'), }) self._test_queries('codesearchnet/valid', count=89154, items={ 0: GenericQuery('https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/deepq/deepq.py#L95-L333', 'Train a deepq model.\n\n Parameters\n -------\n env: gym.Env\n environment to train on\n network: string or a function\n neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models\n (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which\n will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)\n seed: int or None\n prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.\n lr: float\n learning rate for adam optimizer\n total_timesteps: int\n number of env steps to optimizer for\n buffer_size: int\n size of the replay buffer\n exploration_fraction: float\n fraction of entire training period over which the exploration rate is annealed\n exploration_final_eps: float\n final value of random action probability\n train_freq: int\n update the model every `train_freq` steps.\n set to None to disable printing\n batch_size: int\n size of a batched sampled from replay buffer for training\n print_freq: int\n how often to print out training progress\n set to None to disable printing\n checkpoint_freq: int\n how often to save the model. 
This is so that the best version is restored\n at the end of the training. If you do not wish to restore the best version at\n the end of the training set this variable to None.\n learning_starts: int\n how many steps of the model to collect transitions for before learning starts\n gamma: float\n discount factor\n target_network_update_freq: int\n update the target network every `target_network_update_freq` steps.\n prioritized_replay: True\n if True prioritized replay buffer will be used.\n prioritized_replay_alpha: float\n alpha parameter for prioritized replay buffer\n prioritized_replay_beta0: float\n initial value of beta for prioritized replay buffer\n prioritized_replay_beta_iters: int\n number of iterations over which beta will be annealed from initial value\n to 1.0. If set to None equals to total_timesteps.\n prioritized_replay_eps: float\n epsilon to add to the TD errors when updating priorities.\n param_noise: bool\n whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)\n callback: (locals, globals) -> None\n function called at every steps with state of the algorithm.\n If callback returns true training stops.\n load_path: str\n path to load the model from. (default: None)\n **network_kwargs\n additional keyword arguments to pass to the network builder.\n\n Returns\n -------\n act: ActWrapper\n Wrapper over act function. 
Adds ability to save it and load it.\n See header of baselines/deepq/categorical.py for details on the act function.'), 9: GenericQuery('https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/common/cmd_util.py#L21-L52', 'Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.'), 89153: GenericQuery('https://github.com/christophehurpeau/SpringbokJS/blob/bc1069baafc0785d361a33ff5a2fa604b8b3b454/src/browser/base/S.History.js#L72-L78', 'Attempt to load the current URL fragment.'), }) self._test_queries('codesearchnet/test', count=100529, items={ 0: GenericQuery('https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/youtube.py#L135-L143', 'Extracts video ID from URL.'), 9: GenericQuery('https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/sina.py#L54-L64', 'Downloads a Sina video by its unique vkey.\n http://video.sina.com/'), 100528: GenericQuery('https://github.com/dherges/grunt-bower-event/blob/ce7dc2303ef186ccf5eaa8d5b691102e13523076/tasks/lib/BowerTask.js#L24-L30', "Creates a new task.\n\n@param context Task function context (='this' inside a grunt task function)\n@param grunt Grunt object"), }) self._test_queries('codesearchnet/challenge', count=99, items={ 0: GenericQuery('1', 'convert int to string'), 9: GenericQuery('10', 'binomial distribution'), 98: GenericQuery('99', 'how to read .csv file in an efficient way?'), }) def test_codesearchnet_qrels(self): self._test_qrels('codesearchnet/train', count=1880853, items={ 0: TrecQrel('https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108', 'https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108', 1, '0'), 9: 
TrecQrel('https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/face_recognition/api.py#L135-L151', 'https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/face_recognition/api.py#L135-L151', 1, '0'), 1880852: TrecQrel('https://github.com/nutella-framework/nutella_lib.js/blob/b3a3406a407e2a1ada6edcc503b70991f9cb249b/src/run_net_bin.js#L87-L102', 'https://github.com/nutella-framework/nutella_lib.js/blob/b3a3406a407e2a1ada6edcc503b70991f9cb249b/src/run_net_bin.js#L87-L102', 1, '0'), }) self._test_qrels('codesearchnet/valid', count=89154, items={ 0: TrecQrel('https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/deepq/deepq.py#L95-L333', 'https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/deepq/deepq.py#L95-L333', 1, '0'), 9: TrecQrel('https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/common/cmd_util.py#L21-L52', 'https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/common/cmd_util.py#L21-L52', 1, '0'), 89153: TrecQrel('https://github.com/christophehurpeau/SpringbokJS/blob/bc1069baafc0785d361a33ff5a2fa604b8b3b454/src/browser/base/S.History.js#L72-L78', 'https://github.com/christophehurpeau/SpringbokJS/blob/bc1069baafc0785d361a33ff5a2fa604b8b3b454/src/browser/base/S.History.js#L72-L78', 1, '0'), }) self._test_qrels('codesearchnet/test', count=100529, items={ 0: TrecQrel('https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/youtube.py#L135-L143', 'https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/youtube.py#L135-L143', 1, '0'), 9: TrecQrel('https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/sina.py#L54-L64', 
'https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/sina.py#L54-L64', 1, '0'), 100528: TrecQrel('https://github.com/dherges/grunt-bower-event/blob/ce7dc2303ef186ccf5eaa8d5b691102e13523076/tasks/lib/BowerTask.js#L24-L30', 'https://github.com/dherges/grunt-bower-event/blob/ce7dc2303ef186ccf5eaa8d5b691102e13523076/tasks/lib/BowerTask.js#L24-L30', 1, '0'), }) self._test_qrels('codesearchnet/challenge', count=4006, items={ 0: CodeSearchNetChallengeQrel('71', 'https://github.com/tylertreat/BoomFilters/blob/611b3dbe80e85df3a0a10a43997d4d5784e86245/topk.go#L70-L85', '0', ''), 9: CodeSearchNetChallengeQrel('24', 'https://github.com/uber-go/ratelimit/blob/c15da02342779cb6dc027fc95ee2277787698f36/internal/clock/clock.go#L66-L76', '1', ''), 4005: CodeSearchNetChallengeQrel('45', 'https://github.com/conanite/aduki/blob/2e17522b9536fe0a12d2dd97ae601cabb2ee293e/lib/aduki.rb#L167-L176', '0', ''), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/cord19.py ================================================ import re import unittest from ir_datasets.datasets.cord19 import Cord19Doc, Cord19FullTextDoc, Cord19FullTextSection from ir_datasets.formats import TrecQrel, TrecQuery from .base import DatasetIntegrationTest class TestCord19(DatasetIntegrationTest): def test_cord19_docs(self): self._test_docs('cord19', count=192509, items={ 0: Cord19Doc('ug7v899j', 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', '10.1186/1471-2334-1-6', '2001-07-04', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48)), 9: Cord19Doc('jg13scgo', 'Technical Description of RODS: A Real-time Public Health Surveillance System', 
'10.1197/jamia.m1345', '2003-09-01', re.compile('^This report describes the design and implementation of the Real\\-time Outbreak and Disease Surveillan.{1077} be a resource for implementing, evaluating, and applying new methods of public health surveillance\\.$', flags=48)), 192508: Cord19Doc('pnl9th2c', 'Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', '10.1016/j.ejvs.2020.04.040', '2020-05-12', ''), }) self._test_docs('cord19/fulltext', count=192509, items={ 0: Cord19FullTextDoc('ug7v899j', 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', '10.1186/1471-2334-1-6', '2001-07-04', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48), (Cord19FullTextSection(title='Introduction', text='Mycoplasma pneumoniae is a common cause of upper and lower respiratory tract infections. It remains one of the most frequent causes of atypical pneumonia particularly among young adults.[1,2,3,4,5] Although it is highly transmissible, most infections caused by this organism are relatively minor and include pharyngitis, tracheobronchitis, bronchiolitis, and croup with one fifth of infections being asymptomatic.[6,7] Only 3 -10% of infected subjects develop symptoms consistent with bronchopneumonia and mortality from infection is rare.[6,7] The organism is fastidious and difficult to grow on cultures. Therefore, diagnosis of infections caused by this organism is usually confirmed with serological tests or polymerase chain reaction-gene amplification techniques. At King Abdulaziz University Hospital (KAUH), Jeddah, Saudi Arabia, the facility to perform Mycoplasma culture has been available since January 1997. As published information concerning M. 
pneumoniae infections in Saudi Arabia is scarce,[8,9,10] we wished to study the epidemiology and clinical features of culture-proven infections caused by this organism at this hospital.'), Cord19FullTextSection(title='Institution and patient population ::: Methods', text='KAUH is a tertiary care teaching hospital with a bed capacity of 265 beds and annual admissions of 18000 to 19000 patients. Patients with M. pneumoniae positive cultures from respiratory specimens were identified over a 24-months" period from January, 1997 through December, 1998 for this review.'), Cord19FullTextSection(title='Data collection ::: Methods', text="During the study period, respiratory specimens (sputum, nasopharyngeal aspiration, endotracheal secretion, and bronchoalveolar lavage) for M. pneumoniae culture were obtained from patients with upper or lower respiratory tract infections seen as inpatients or in the outpatient or emergency departments. Respiratory specimens were aslo Gram-stained and cultured for bacteria and viruses. M. pneumoniae serological tests for IgG or IgM were not available at KAUH during the study period. All positive culture results were obtained from the Microbiology laboratory records. Charts of patients were reviewed with standardized data collection. Information collected included patients' demographics, comorbidities, clinical manifestations, complications, and outcome."), Cord19FullTextSection(title='Microbiological methods ::: Methods', text='M. pneumoniae was cultured using the classic M. pneumoniae agar medium (M.P. agar) and the Pneumofast tray (Pneumofast®, International Microbio, Signes, France). Specimens were processed according to the instructions of the manufacturer. The M.P. agars and Pneumofast trays were incubated anaerobically at 37°C and inspected daily for 4 weeks. The organism was identified based on typical colonial morphology (granular colonies, rarely fried-egg-like, 10-150 ∝ in diameter) on the M.P. 
agar medium and the change in the Pneumofast broth color from red to orange then to yellow (glucose fermentation) in the absence of turbidity of the broth. Antibiotic sensitivity profile on the Pneumofast tray was also used for identification according to the instructions of the manufacturer. Bacterial and viral cultures were performed using standard methods.'), Cord19FullTextSection(title='Definitions ::: Methods', text='M. pneumoniae isolates were considered community-acquired if they were recovered from unhospitalized patients or within 72 hours of admission to the hospital, and nosocomial if they were recovered beyond that period.'), Cord19FullTextSection(title='Definitions ::: Methods', text='Pneumonia was diagnosed based on clinical symptoms and signs, along with radiographic evidence of pneumonia when possible. Severe pneumonia was defined as pneumonia associated with tachycardia (>140 /minute), tachypnoea (>30/minute), hypotension (Systolic blood pressure <90 mmHg), hypoxemia (arterial oxygen partial pressure <8 kPa or oxygen saturation <90%), and/or more than 2 areas of consolidation.'), Cord19FullTextSection(title='Definitions ::: Methods', text='Outcome of patients with M. pneumoniae infection was classified into 4 categories; uneventful recovery, recovery following complications, death due to M. pneumoniae infection, or death unrelated to M. pneumoniae infection.'), Cord19FullTextSection(title='Data Analysis', text="The Statistical Package for Social Sciences (SPSS) program was used for data analysis. Comparison of categorical data was by Chi-square statistic or Fisher's exact test for small expected values."), Cord19FullTextSection(title='Results', text="A total of 40 respiratory specimens from 40 patients were positive for M. pneumoniae over the 24-months study period. The demographic and epidemiological characteristics of the patients are summarized in Table 1. Of all isolates, 37 (92.5%) were community-acquired and 3 (7.5%) were nosocomial. 
Thirty-three (82.5%) patients required admission to the hospital and the remaining 7 (17.5%) were treated as outpatients. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Of the 24 cases of pneumonia, 21 were confirmed radiologically and the remaining 3 were diagnosed clinically. The two cases of bronchiolitis occurred in 2 children, one and three years old. Thirty-one patients (77.5%) had comorbidities. Eleven patients (27.5%) had cardiopulmonary comorbidities (asthma, 8, lung fibrosis, 1, congestive heart failure, 1, congenial heart disease, 1), 9 patients (22.5%) were immunocompromised (malignancy, 7, steroid therapy, 3, Human immunodeficiency virus infection, 1), and 11 patients (27.5%) had other comorbidities (premature newborns, 2, and one each of myelodysplastic syndrome, myeloproliferative disorder, sickle cell anemia, Evan's syndrome, Down syndrome, sarcoidosis, demyelinating disease, cerebral palsy, and spinal muscle atrophy). Organisms concomitantly isolated with M. pneumoniae from the respiratory tract included herpes simplex virus type 1 (2 occasions), adenovirus (2 occasions), cytomegalo virus (1 occasion), respiratory syncytial virus (1 occasion), and bacterial isolates (2 occasions: Acinetobacter species, 1, and Enter obacter cloacae, 1)."), Cord19FullTextSection(title='Results', text='Clinical manifestations associated with M. pneumoniae infections are summarized in Table 2. Pneumonia was more common than upper respiratory tract infections (57.5 % versus 27.5%, respectively). Immunocompromised patients were more likely to present with pneumonia as opposed to upper respiratory tract infection or bronchiolitis than non-immunocompromised patients (8/9 versus 16/31, P = 0.05). Similarly, there was a tendency for patients 60 years of age or older to present with pneumonia more frequently than those below 60 (4/4 versus 20/36, P = 0.1). 
Of the 24 patients with clinically or radiologically confirmed pneumonia, 19 (79.2%) had crepitations and only 6 (25%) had bronchial breath sounds on physical examination. Of the 16 patients in whom wheezes were detected, 9 (56.3%) were not known to have asthma or other obstructive airway disease.'), Cord19FullTextSection(title='Results', text='Complications and outcome of patients are shown in Table 3. Of the 24 patients with pneumonia, 21 (87.5%) were admitted to the hospital, and 20 (83.3%) had comorbidities. All patients with upper respiratory tract infections (11 patients) or bronchiolitis (2 patients) had uneventful recovery. Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications (acute respiratory distress syndrome, 2, respiratory failure, 1, septic shock, 1), 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities; one had congestive heart failure, the second had congenital heart disease, and the third was a 3-months old infant born prematurely at 32 weeks of gestation who previously had 3 episodes of pneumonia due to other pathogens.'), Cord19FullTextSection(title='Discussion', text='Mycoplasma pneumoniae is one of the most common causes of atypical pneumonia accounting for 5-23% of community-acquired pneumonia,[1,2,3,4,5] In a study of 511 children with acute respiratory tract infection in Riyadh, Saudi Arabia, Mycoplasma pneumoniae was found to be the second most common causative agent after Respiratory syncytial virus (RSV) accounting for 9% of all cases,[8] In a study of 112 adult patients with community acquired pneumonia admitted to a military hospital in Riyadh, Saudi Arabia, this organism accounted for 6% of all cases,[9] In another retrospective study of 567 pneumonic episodes in adult patients from Al-Qassim area, the organism accounted for 23% of all episodes,[10] 
The organism also causes other relatively minor infections such as pharyngitis, tracheobronchitis, bronchiolitis, and croup. It is transmitted from person-to-person by infected respiratory droplets during close contact. It is most common in school-aged children, military recruits, and college students.[11] Most cases occur singly or as family outbreaks. Larger outbreaks can also occur in closed populations such as military recruit camps or boarding schools,[12] Infection occurs most frequently during the fall and winter in temperate climates but may develop year-round,[13] The average incubation period is 3 weeks following exposure,[6] Although rare, complications are protean and may involve virtually any organ system such as the respiratory system (e.g.: pleurisy, pneumothorax, acute respiratory distress syndrome, lung abscess), the hematologic system (e.g.: hemolytic anemia, intravascular coagulation, thrombocytopenia), the dermatologic system (e.g.: maculopapular or urticarial rashes, erythema multiforme, erythema nodosum), the musculoskeletal system (e.g.: myalgias, arthralgias, arthritis), the cardiovascular system (e.g.: pericarditis, myocarditis), the nervous system (e.g.: meningoencephalitis, Guillain-Barre syndrome, neuropathies, acute psychosis), or the eye (optic disc edema, optic nerve atrophy, retinal exudation and hemorrhages).[6,7,14,15,16,17,18] Immunity following infection is not long lasting.[11]'), Cord19FullTextSection(title='Discussion', text='In our study, the infection affected all age groups but was most common in infants (32.5%) and preschool children (22.5%), and least common in adults aged 15 to 30 years (2.5%) and elderly above 70 years of age (5%). 
This contrasts with data from temperate countries where the infection is most common in school-aged children, and young adults.[11] One possible explanation for this difference is that infants and preschool children perhaps had more severe infections than did school-aged children, and young adults which prompted presentation of the former group to the hospital. The infection occurred year-round but was most common in the fall (35%), and spring (30%), and least common in the summer (10%). Most infections were community-acquired (92.5%).'), Cord19FullTextSection(title='Discussion', text='More than one half of patients (57.5%) presented with pneumonia, and about a third (27.5%) presented with upper respiratory tract infection, Immunocompromised patients and patients 60 years of age or older were more likely to present with pneumonia as opposed to upper respiratory tract infection than non-immunocompromised patients or those below 60 years of age. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common presenting symptoms. Cough was usually dry or slightly productive of white sputum and mild to moderate in severity. Most febrile patients had mild to moderate fever of 39°C or less; high-grade fever of more than 39°C was rare. Crepitations (60%), and wheezes (40%) were the most common signs. Wheezes were as common in patients with no history of obstructive airway disease (9 patients) as it was in those with such a history (7 patients). Bronchial breathing as a sign of consolidation was detected in only one fourth of patients with pneumonia, which is consistent with the known disparity between clinical and radiological signs of M pneumoniae pneumonia. Crepitations, however, were detected in the majority (79.2%) of patients. Pleuritic chest pain and pleural effusion were rare.'), Cord19FullTextSection(title='Discussion', text='More than half (56.5%) of the patients with pneumonia had uneventful recovery. Mortality from M. 
pneumoniae pneumonia was high (12.5%) and occurred only in patients with underlying comorbidities. None of the 9 patients with no underlying comorbidities died of M pneumoniae pneumonia. The relatively high complications rate (16.7%) and mortality (12.5%) related to M. pneumoniae pneumonia are likely due to selection bias as most patients with pneumonia were sick enough to require admission to the hospital (21/24 or 87.5%) and most of them had comorbidities (20/24 or 83.3%).'), Cord19FullTextSection(title='Discussion', text='In conclusion, our data shed some light on the epidemiology and clinical features of M pneumoniae infections in one of the Saudi tertiary care centers. The data are comparable to those of other countries except for the finding that infections were more common in infants and preschool children than in school children and young adults. Additionally, mortality attributable to M. pneumoniae pneumonia was relatively high in patients with comorbidities. It is hoped this information will assist clinicians in their approach and management of respiratory tract infections.'), Cord19FullTextSection(title='Pre-publication history', text='The pre-publication history for this paper can be accessed here:'), Cord19FullTextSection(title='Pre-publication history', text=''))), 9: Cord19FullTextDoc('jg13scgo', 'Technical Description of RODS: A Real-time Public Health Surveillance System', '10.1197/jamia.m1345', '2003-09-01', re.compile('^This report describes the design and implementation of the Real\\-time Outbreak and Disease Surveillan.{1077} be a resource for implementing, evaluating, and applying new methods of public health surveillance\\.$', flags=48), (Cord19FullTextSection(title='Public Health Surveillance ::: Background', text='The role of public health surveillance is to collect, analyze, and interpret data about biological agents, diseases, risk factors, and other health events and to provide timely dissemination of collected information to decision 
makers.17 Conventionally, public health surveillance relies on manual operations and off-line analysis.'), Cord19FullTextSection(title='Syndromic Surveillance ::: Background', text="Existing syndromic surveillance systems include the CDC's drop-in surveillance systems,8 Early Notification of Community-based Epidemics (ESSENCE),10,18 the Lightweight Epidemiology Advanced Detection and Emergency Response System (LEADERS),19 the Rapid Syndrome Validation Project (RSVP),20 and the eight systems discussed by Lober et al.11"), Cord19FullTextSection(title='Syndromic Surveillance ::: Background', text="Lober et al. summarized desirable characteristics of syndromic surveillance systems and analyzed the extent to which systems that were in existence in 2001 had those characteristics.11 A limitation of most systems (e.g., ESSENCE,10 Children's Hospital in Boston,11 University of Washington11) was batch transfer of data, which may delay detection by as long as the time interval (periodicity) between batch transfers. For example, a surveillance system with daily batch transfer may delay by one day the detection of an outbreak."), Cord19FullTextSection(title='Syndromic Surveillance ::: Background', text="Some systems required manual data input (e.g., CDC's drop-in surveillance systems, RSVP,20 and LEADERS19), which is labor-intensive and, in the worst case, requires round-the-clock staffing. 
Manual data input is not a feasible mid- or long-term solution even if the approach is to add items to existing encounter forms (where the items still may be ignored by busy clinicians)."), Cord19FullTextSection(title='Syndromic Surveillance ::: Background', text='A third limitation for existing surveillance systems is that the systems may not exploit existing standards or communication protocols like Heath Level 7 (HL7) even when they are available.'), Cord19FullTextSection(title='Syndromic Surveillance ::: Background', text='The data type most commonly used among surveillance systems is symptoms or diagnoses of patients from ED and/or physician office visits. Other types of data identified in that study include emergency call center and nurse advice lines. Other types of data being used include sales of over-the-counter health care products, prescriptions, telephone call volumes to health care providers and drug stores, and absenteeism. We have conducted studies demonstrating that the free-text chief complaint data that we use correlate with outbreaks.21,22'), Cord19FullTextSection(title='Design Objectives', text="The overall design objective for RODS is similar to that of an early warning system for missile defense; namely, to collect whatever data are required to achieve early detection from as wide an area as necessary and to analyze the data in a way that they can be used effectively by decision makers. It is required that this analysis be done in close to real time. This design objective is complex and difficult to operationalize because of the large number of organisms and the even larger number of possible routes of dissemination all requiring potentially different types of data for their detection, different algorithms, and different time urgencies. For this reason, our focus since beginning the project in 1999 has been on the specific problem of detecting a large-scale outbreak due to an outdoor (outside buildings) aerosol release of anthrax. 
Additional design objectives were adherence to NEDSS standards to ensure future interoperability with other types of public health surveillance systems, scalability, and that the system could not rely on manual data entry, except when it was done in a focused way in response to the system's own analysis of passively collected data."), Cord19FullTextSection(title='Overview ::: Technical Description', text="RODS uses clinical data that are already being collected by health care providers and systems during the registration process. When a patient arrives at an ED (or an InstaCare in Utah), the registration clerk or triage nurse elicits the patient's reason for visit (i.e., the chief complaint), age, gender, home zip code, and other data and enter the data in a registration computer. The registration computer then generates an HL7 ADT (admission, discharge, and transfer) message and transmits it to the health system's HL7 message router (also called an integration engine). There usually is only one message router per health system even if there are many hospitals and facilities. These processes are all routine existing business activities and do not need to be created de novo for public health surveillance."), Cord19FullTextSection(title='Overview ::: Technical Description', text="▶ shows the flow of clinical data to and within RODS. The hospital's HL7 message router, upon receipt of an HL7 message from a registration computer, deletes identifiable information from the message and then transmits it to RODS over a secure virtual private network (VPN), or a leased line, or both (during the 2002 Winter Olympics we utilized both types of connections to each facility for fault tolerance). The RODS HL7 listener maintains the connection with the health system's message router and parses the HL7 message as described in more detail below. 
It then passes the chief complaint portion of the message to a Bayesian text classifier that assigns each free-text chief complaint to one of seven syndromic categories (or to an eighth category, other). The database stores the category data, which then are used by applications such as detection algorithms and user interfaces. "), Cord19FullTextSection(title='Overview ::: Technical Description', text='Data about sales of OTC health care products are processed separately by the National Retail Data Monitor, which is discussed in detail in another article in this issue of JAMIA.23 The processing was kept separate intentionally because, in the future, the servers for the National Retail Data Monitor may operate in different physical locations than RODS. The RODS user interfaces can and do display sales of OTC health care products as will be discussed, but other user interfaces can be connected to the National Retail Data Monitor as well.'), Cord19FullTextSection(title='Data Sharing Agreements ::: Data Level ::: Technical Description', text='Prior to September 2001, RODS received data only from hospitals associated with the UPMC Health System, and efforts to recruit other hospitals met with resistance. After the terrorist attacks (including anthrax) in the Fall of 2001, other hospitals agreed to participate. Although data in this project are de-identified, certain information such as the number of ED visits by zip code were considered proprietary information by some health systems. Health Insurance Portability and Accountability Act (HIPAA) concerns also were very prominent in the discussions. Data-sharing agreements were executed with every participating health system that addressed these concerns. 
As an additional precaution, all RODS project members meet annually with University of Pittsburgh council to review obligations and are required to sign an agreement every year stating that they understand the terms of the data-sharing agreements and agree to abide by the terms. RODS began as a research project at the University of Pittsburgh in 1999 and has functioned with IRB approvals since that time.'), Cord19FullTextSection(title='Data Types ::: Data Level ::: Technical Description', text='Health care facilities send admission, discharge, and transfer (ADT) HL7 messages to RODS for patient visits in EDs and walk-in clinics. A minimal data set is sent, as shown in ▶, which qualifies as a HIPAA Limited Data Set.24 Currently the data elements are age (without date of birth), gender, home zip code, and free-text chief complaint. '), Cord19FullTextSection(title='Data Transmission ::: Data Level ::: Technical Description', text='The HL7 listener receives HL7 messages from the message routers located in each health system. The HL7 listener then passes the received HL7 message to the HL7 parser bean, an Enterprise JavaBean (EJB) in the RODS business logic tier. The HL7 parser bean uses regular expressions to parse the fields in an HL7 message. The HL7 parser bean then stores the parsed elements into a database through a managed database connection pool.'), Cord19FullTextSection(title='Data Transmission ::: Data Level ::: Technical Description', text="Although nearly all health systems utilize the HL7 messaging standard, the location of individual data elements in an HL7 message may differ from health system to health system. For example, some care providers' systems record free-text chief complaint in the DG1 segment instead of the PV2 segment of an HL7 message. 
To resolve this mapping problem, a configuration file written in eXtensible Markup Language (XML), a standard protocol often used to define hierarchical data elements, defines where each of the data elements can be found in the HL7 message. When an HL7 listener starts up, it reads the hospital-dependent configuration file and passes the configuration information to the parser bean."), Cord19FullTextSection(title='Data Transmission ::: Data Level ::: Technical Description', text='We also use this configuration file to define the database table and field in which the HL7 parser bean should store each data element. This approach is useful because it allows the HL7 data to be stored to an external database. We anticipate that health departments with existing NEDSS or other public health surveillance databases may wish to use just this component of RODS for real-time collection of clinical data.'), Cord19FullTextSection(title='Data Transmission ::: Data Level ::: Technical Description', text='For hospitals that do not have HL7 message routers (two of approximately 60 in our experience to date), RODS accepts ED registration data files through either a secure Web-based data upload interface or a secure file transfer protocol. In general, these types of data transfers are technically trivial and for that reason are used by many groups but do not have the reliability of a HL7 connection (and have very undesirable time latencies).'), Cord19FullTextSection(title='Data Integrity ::: Data Level ::: Technical Description', text='RODS checks the integrity of the data in the HL7 messages that it receives. This processing is necessary because hospital data flows may have undesirable characteristics such as duplicates. RODS identifies and deletes duplicates by using a database trigger that creates a composite primary key before inserting the data. 
RODS also filters out scheduling messages, which are identified by the fact that they have future admitted date and time.'), Cord19FullTextSection(title='Data Integrity ::: Data Level ::: Technical Description', text="RODS monitors all data feeds to ensure continuous connections with health systems. If RODS does not receive data for six hours, it sends an alert to the RODS administrator and the sending health system's administrator. Because the commercial message routers that hospitals use queue up HL7 messages when encountering networking or system problems, data integrity is preserved."), Cord19FullTextSection(title='Database ::: Data Level ::: Technical Description', text='RODS uses an Oracle8i database to store ED registration data. (Oracle, Redwood Shores, CA). To ensure fast response for an online query (e.g., the daily counts of respiratory syndrome in a county for the past six months), we developed a cache table scheme that pre-aggregates counts and refreshes them every 30 minutes.25'), Cord19FullTextSection(title='Network Level ::: Technical Description', text='The communications network between RODS and health care systems consists of virtual private networks (VPN) and leased lines. RODS uses multivendor site-to-site Internet Protocol Security (IPSEC) VPNs to receive HL7 messages. During the Winter Olympics, we exclusively used leased lines for the primary connection because of concerns about possible communications interruptions due to Internet traffic related to the games. The leased lines consisted of a redundant pair of 128k fractional T1 lines. After the Olympics, we returned to use of VPNs, and RODS has operated reliably using VPNs in both Utah and Pennsylvania. 
The leased-line modality is used only to connect the Siemens Medical Systems Data Center with RODS for the transmission of data from nine health systems that are hosted by Siemens.'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='For connectivity with the HL7 message routers, we utilize hardware-based routers. The VPN router is a Cisco PIX 501 and the leased-line routers are a pair of Cisco 2600s (Cisco Systems, Inc., San Jose, CA).'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='All of the RODS processes can be run on a single computer, but in our current implementation—serving Pennsylvania and Utah as an application service provider—we use five dedicated servers: firewall, database, Web server, a geographic information system (GIS) server, and computation. The processes are written in Java code and can run on most platforms, but here we describe the specific platforms we use to indicate approximate sizing and processing requirements.'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='The database server is a Sun Microsystems Enterprise 250 configured with two Ultrasparc II 400Mhz processors, 2 gigabytes of RAM, and 36 gigabytes of mirrored hard drive space running an Oracle 8.1.7 (database) on Solaris 8 (Sun Microsystems, Inc., Santa Clara, CA).'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='The Web server is a Dell Poweredge 1550 configured with two 1Ghz Pentium III processors, 1 gigabyte of RAM, and 36 gigabytes of Redundant Arrays of Inexpensive Disk 5 (RAID-5) storage running Apache 1.3.24 (Web server), and Jboss 3.0 (described below in Fault Tolerance) on Redhat Linux 7.1 (Dell Computer Corporation, Round Rock, TX; Jboss Group, Atlanta, GA; Red Hat, Raleigh, NC).'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='The GIS server is a Dell Poweredge 350 configured with one 1Ghz Pentium III 
processor, 512 megabytes of RAM, and 18 gigabytes of storage running ArcIMS 4.0 (ESRI, Inc., Redlands, CA), an Internet-enabled geographic information system on Redhat Linux 7.3.'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='The computation server is a Penguin Computing server configured with dual Athlon MP 2400s, 1 gigabyte of RAM, and 750 gigabytes of RAID-5 storage running Oracle 9i on Redhat Linux 7.3.'), Cord19FullTextSection(title='System Hardware ::: Technical Description', text='Backup is performed nightly on all machines using a Sun StoreEdge L9 Tape Autoloader attached to the database server and Veritas Netbackup software (Veritas, Mountain View, CA).'), Cord19FullTextSection(title='Natural Language Processing ::: Application Level ::: Technical Description', text='RODS uses a naive Bayesian classifier called Complaint Coder (CoCo) to classify free-text chief complaints into one of the following syndromic categories: constitutional, respiratory, gastrointestinal, neurological, botulinic, rash, hemorrhagic, and other. CoCo computes the probability of each category, conditioned on each word in a free-text chief complaint and assigns a patient to the category with the highest probability.27 The probability distributions used by CoCo are learned from a manually created training set. CoCo can be retrained with local data, and it can be trained to detect a different set of syndromes than we currently use. CoCo runs as a local process on the RODS database server. CoCo was developed at the University of Pittsburgh and is available for free download at <http://health.pitt.edu/rods/sw>.'), Cord19FullTextSection(title='Detection Algorithms ::: Application Level ::: Technical Description', text='Over the course of the project, RODS has used two detection algorithms. 
These algorithms have not been formally field tested because the emphasis of the project to date has been on developing the data collection infrastructure more than field testing of algorithms.'), Cord19FullTextSection(title='Detection Algorithms ::: Application Level ::: Technical Description', text='The Recursive-Least-Square (RLS) adaptive filter28 currently runs every four hours, and alerts are sent to public health officials in Utah and Pennsylvania. RLS, a dynamic autoregressive linear model, computes an expected count for each syndrome category for seven counties in Utah and 16 counties in Pennsylvania as well as for the combined counts for each state. We use RLS because it has a minimal reliance on historical data for setting model parameters and a high sensitivity to rapid increases in a time series e.g., a sudden increase in daily counts. RLS triggers an alert when the current actual count exceeds the 95% confidence interval for the predicted count.'), Cord19FullTextSection(title='Detection Algorithms ::: Application Level ::: Technical Description', text="During the 2002 Olympics we also used the What's Strange About Recent Events (WSARE 1.0) algorithm.29 WSARE performs a heuristic search over combinations of temporal and spatial features to detect anomalous densities of cases in space and time. Such features include all aspects of recent patient records, including syndromal categories, age, gender, and geographical information about patients. The criteria used in the past for sending a WSARE 1.0 alert was that there has been an increase in the number of patients with specific characteristics relative to the counts on the same day of the week during recent weeks and the p-value after careful adjustment for multiple testing for the increase was ≤0.05. 
Version 3.0 of WSARE, which will incorporate a Bayesian model for computing expected counts rather than using unadjusted historical counts currently, is under development."), Cord19FullTextSection(title='Alert Notification ::: Application Level ::: Technical Description', text="When an algorithm triggers an alert based on the above criteria, RODS sends e-mail and/or page alerts to its users. RODS uses an XML-based configuration file to define users' e-mail and pager addresses. The e-mail version of the alert includes a URL link to a graph of the time series that triggered the alarm with two comparison time series: total visits for the same time period and normalized counts."), Cord19FullTextSection(title='User Interface ::: Application Level ::: Technical Description', text="RODS has a password-protected, encrypted Web site at which users can review health care registration and sales of OTC health care products on epidemic plots and maps. When a user logs in, RODS will check the user's profile and will display data only for his or her health department's jurisdiction. The interface comprises three screens—Main, Epiplot, and Mapplot."), Cord19FullTextSection(title='User Interface ::: Application Level ::: Technical Description', text='The main screen alternates views automatically among each of the available data sources (currently health care registrations and OTC products in Pennsylvania and Utah and OTC sales only for other states). The view alternates every two minutes as shown in ▶. The clinic visits view shows daily total visits and seven daily syndromes for the past week. The OTC data view shows daily sales for five product categories and the total, also for the past week. Users also can set the view to a specific county in a state. If the normalize control box is checked, the counts in the time series being displayed will be divided by (normalized by) the total daily sales of OTC health care products or ED visits for the region. 
'), Cord19FullTextSection(title='User Interface ::: Application Level ::: Technical Description', text='The Epiplot screen provides a general epidemic plotting capability. The user can simultaneously view a mixture of different syndromes and OTC product categories for any geographic region (state, county, or zip code), and for any time interval. The user also can retrieve case details as shown in ▶. The Get Cases button queries the database for the admission date, age, zip code, and chief complaint (verbatim, not classified into syndrome category) of all patients in the time interval and typically is used to examine an anomalous density (spike) of cases. The Download Data button will download data as a compressed comma separated file for further analyses. '), Cord19FullTextSection(title='User Interface ::: Application Level ::: Technical Description', text='The Mapplot screen is an interface to ArcIMS, an Internet-enabled GIS product developed by Environmental Systems Research Institute, Inc. Mapplot colors zip code regions to indicate the proportion of patients presenting with a particular syndrome. The GIS server also can overlay state boundaries, county boundaries, water bodies, hospital locations, landmarks, streets, and highways on the public health data as shown in ▶. Similar to Epiplot, Mapplot also can display case details for a user-selected zip code. '), Cord19FullTextSection(title='Fault Tolerance ::: Technical Description', text='RODS has been in operation for four years and, like most production systems, has acquired many fault-tolerant features. For example, at the software level, HL7 listeners continue to receive messages and temporarily store the messages when the database is off-line. A data manager program runs every ten minutes and, on finding such a cache, it loads the unstored messages to the database when the database is back on-line. In addition, the data manager program monitors and restarts HL7 listeners as necessary. 
The database uses “archive log” mode to log every transaction to ensure that the database can recover from a system failure.'), Cord19FullTextSection(title='Fault Tolerance ::: Technical Description', text='The hardware architecture also is fault tolerant. All servers have dual power supplies and dual network cards. All hard drives use Redundant Arrays of Inexpensive Disk configurations. In addition to dual power supplies, all machines are connected to an uninterrupted power supply that is capable of sending an e-mail alert to the RODS administrator when the main power is down.'), Cord19FullTextSection(title='Health System Resident Component ::: Technical Description', text='An important component of RODS that currently is used only at the UPMC Health System in Pittsburgh is the Health System Resident Component (HSRC). The HSRC is located within the firewall of a health system and connects directly to the HL7 message router. The HSRC currently receives a diverse set of clinical data from the HL7 message router including culture results, radiology reports, and dictated emergency room notes. Its purpose is to provide additional public health surveillance functions that would not be possible if it were located outside of the firewall due to restrictions on the release of identifiable clinical data. The HSRC uses patient identifiers to link laboratory and radiology information to perform case detection. In the past, we have used HSRC to monitor for patients with both a gram-positive rod in a preliminary microbiology culture report and “mediastinal widening” in a radiology report. 
The HSRC is a case detector in a distributed outbreak detection system that is capable of achieving much higher specificity of patient diagnostic categorization through access to more information.'), Cord19FullTextSection(title='Health System Resident Component ::: Technical Description', text="HSRC also removes identifiable information before transmitting data to the RODS system, a function provided by the health system's message router in other hospitals that connect to RODS."), Cord19FullTextSection(title='Health System Resident Component ::: Technical Description', text='The HSRC at UPMC Health System functions as an electronic laboratory reporting system, although the state and local health departments are not yet ready to receive real-time messaging from the system. Currently, it sends email alerts to the director of the laboratory and hospital infection control group about positive cultures for organisms that are required to be reported to public health in the state of Pennsylvania.30 It also sends messages to hospital infection control when it detects organisms that cause nosocomial infections. These organisms include Clostridium difficile, methicillin-resistant Staphylococcus aureus, and vancomycin-resistant Enterococcus.'), Cord19FullTextSection(title='Health System Resident Component ::: Technical Description', text='We have been able in HSRC to prototype one additional feature, which is a “look-back” function that facilitates very rapid outbreak investigations by providing access to electronic medical records to public health investigators as shown in ▶. This feature requires a token that can be passed to a hospital information system that can uniquely identify a patient, and the reason we have prototyped this feature in the HSRC and not in RODS is simply that HSRC runs within the firewall so an unencrypted token can be used. 
The look-back is accomplished as follows: when a public health user identifies an anonymous patient record of interest (e.g., one of 20 patients with diarrhea today from one zip code), HSRC calls the UPMC Health System Web-based electronic medical record system and passes it the patient identifier. UPMC Health System then requests the user to log in using the UPMC-issued password before providing access to the record directly from its own secure Web site. This approach is not intended to be implemented in HSRC, but rather in the RODS system outside of the firewall of a health system. It is intended to use encrypted identifiers that the health system would decrypt to retrieve the correct record. The HSRC could provide the encryption-decryption service or it could be provided by another data system in the hospital. We estimate that the prevalence of health systems that have Web-based results review in the United States is 30% to 50% and growing so that this approach could very quickly improve the efficiency of outbreak investigations. '), Cord19FullTextSection(title='Current Status', text='RODS has been in operation in Pennsylvania since 1999 and in Utah since January 2002. In Utah, RODS receives data from two health systems: Intermountain Heath Care, including nine EDs and 18 acute care facilities, and the University of Utah Health Sciences Center, with one ED.24 Together, these facilities serve about 70% of the population of Utah. In Pennsylvania, RODS receives data from 20 health systems comprising 38 hospitals. Two health systems (each with one hospital) send plain text files to RODS on a daily basis. In Pennsylvania, RODS covers 80% of ED visits in Allegheny County (population, 1.3 M) where Pittsburgh is located; 50% of visits in the 13-county Metropolitan Medical Response Area centered on Pittsburgh (population 3.0 M); and more than 70% coverage of three other counties, including Dauphin County where Harrisburg, the capital of Pennsylvania, is located. 
The Commonwealth of Pennsylvania is funding a large project to connect the remaining hospitals in the Commonwealth with RODS over the next two years (approximately an additional 170 hospitals).'), Cord19FullTextSection(title='Current Status', text='In December 2002, the RODS laboratory released version 1.1 of the RODS software to the public. The release includes all of the components necessary to deploy RODS for clinic visits surveillance. RODS is free for noncommercial use and can be downloaded at <http://www.health.pitt.edu/rods/sw/>. Although the software has been downloaded in excess of 170 times, we are aware of only a few successful efforts at deployment. These kinds of systems require network engineers, Oracle database administrators, and interface engineers, and very few health departments have access to that skills set.'), Cord19FullTextSection(title='Current Status', text='For these reasons, we have moved to an application service provider model for dissemination in which we encourage state and local health departments to form coalitions to support shared services. We also have been fortunate to have sufficient grant funding from the Commonwealth of Pennsylvania to be able to support these services on an interim basis while sustainable funding models evolve.'), Cord19FullTextSection(title='Discussion', text='Our original design objectives for RODS were real-time collection of data with sufficient geographic coverage and sampling density to provide early syndromic warning of a large-scale aerosol release of anthrax. Although we have not achieved all of our initial design objectives, progress has been substantial. The research identified two types of data—free-text chief complaints and sales of OTC health care products—that can be obtained in real time or near real time at sampling levels of 70% or higher for most of the United States. 
These results were obtained through large-scale deployments of RODS in Pennsylvania and Utah and through building the National Retail Data Monitor described in the accompanying article in this issue of JAMIA. The deployments also provided insights about organizational and technical success factors that would inform an effort to scale the project nationally.'), Cord19FullTextSection(title='Discussion', text='The project established the importance of HL7 message routers (also known as integration engines) for public health surveillance. HL7 message routers are a mature, highly prevalent technology in health care. We demonstrated that free-text triage chief complaints can be obtained in real time from most U.S. hospitals through message routers and that these data represent early syndromal information about disease. Many other clinical data of value to public health are transmitted using the HL7 standard (e.g., orders for diagnostic tests, especially microbiology tests, reports of chest radiographs, medications, and test results) and can be integrated into RODS or other surveillance systems capable of receiving HL7 messages.'), Cord19FullTextSection(title='Discussion', text='As a result of our efforts to disseminate this technology by giving it away, we have learned that most health departments do not have the technical resources to build and maintain real-time electronic disease surveillance systems. Our application service provider model has been much more successful, and we now recommend that states form coalitions to share the costs of such services.'), Cord19FullTextSection(title='Discussion', text="The project very early identified the need for a computing component to reside within the firewall of a health system, connected to the hospital's HL7 message router. This component would function as a case detector in a distributed public health surveillance scheme linking laboratory and radiology data to increase the specificity of case detection. 
It has proven very difficult to disseminate this technology, perhaps due to the complexity of the idea. Nevertheless, the threat of bioterrorism has created a need for such technology, and this approach, or something with equivalent function, must be deployed."), Cord19FullTextSection(title='Discussion', text="Adherence to NEDSS architectural standards was an early design objective that we have met. RODS 1.5 closely follows NEDSS architectural, software, messaging, and data specifications. Our success is a strong validation of those standards. We will gain further understanding of the standards as we attempt to use RODS components including HL7 listeners, natural language parsers, message parsers, databases, user interfaces, notification subsystems, and detection algorithms with other NEDSS compliant systems. An ongoing project will use RODS to collect chief complaints and integrate them into the Utah Department of Health's planned NEDSS system."), Cord19FullTextSection(title='Discussion', text='We have demonstrated the ability to rapidly deploy RODS in a special event with the added advantage that the system persisted after the event. This experience suggests strongly that RODS or similar systems be considered an alternative to drop-in surveillance.'), Cord19FullTextSection(title='Discussion', text='Our future plans are to meet our initial design objective to develop early-warning capability for a large, outdoor release of anthrax, especially ensuring that the data and analysis produced by RODS are reviewed by public health. This goal will require improvements in the interfaces and the detection algorithms to reduce false alarms and to vastly improve the efficiency with which anomalies are evaluated by use of multiple types of data, better interfaces, and implementation of the look-back function. 
We would like to enlarge as quickly as possible the application service provider to include more states and more types of clinical data so that states will be in a position to prospectively evaluate the detection performance from different types of data on naturally occurring outbreaks.'), Cord19FullTextSection(title='Discussion', text='Our long-term goals are to add additional disease scenarios to the design objectives such as detection of in-building anthrax release, vector-borne disease, food-borne disease, and a communicable disease such as severe acute respiratory syndrome (SARS).'), Cord19FullTextSection(title='Conclusion', text='RODS is a NEDSS-compliant public health surveillance system that focuses on real-time collection and analysis of data routinely collected for other purposes. RODS is deployed in two states and was installed quickly in seven weeks for the 2002 Olympics. Our experience demonstrates the feasibility of such a surveillance system and the challenges involved.'), Cord19FullTextSection(title='Conclusion', text='Outbreaks, emerging infections, and bioterrorism have become serious threats. It is our hope that the front-line of public health workers, astute citizens, and health care workers will detect outbreaks early enough so that systems such as RODS are not needed. However, timely outbreak detection is too important to be left to human detection alone. 
The notion that public health can operate optimally without timely electronic information is as unwise as having commercial airline pilots taking off without weather forecasts and radar.'))), 192508: Cord19FullTextDoc('pnl9th2c', 'Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', '10.1016/j.ejvs.2020.04.040', '2020-05-12', '', (Cord19FullTextSection(title='', text='The first reported case in Spain was on 31 January, and since then the rapid spread of the virus has been demonstrated by the numbers of confirmed diagnoses (> 188 000), and deaths (nearly 20 000) which have increased dramatically every day.'), Cord19FullTextSection(title='', text="The vascular patient is at a higher risk of developing a severe form of the disease due to the nature of its association with several comorbid states and thus, vascular surgery communities from many countries have tried to stratify patients into those requiring urgent care, such as > 70 mm abdominal aortic aneurysm (AAA) or ruptured AAA, critical limb ischaemia (CLI), and symptomatic carotid disease. However, the setting is very dynamic with a clear trend to worsening. Today's circumstances do not fit any previous protocol guidance. An overwhelmed health system, with a lack of personal protection equipment (PPE) and ventilatory support, has led to a global sanitary collapse; with a 90% in hospital occupancy of COVID-19 disease alone, one might correctly guess that the response to remaining diseases, including all cardiovascular ones, is underdiagnosis and undertreatment."), Cord19FullTextSection(title='', text='Moreover, patients are quite afraid (understandably) of coming to hospital, resulting in an unprecedented and unpredictable scenario where surgical arterial practice has severely dropped to an historic minimum: reaching, in total, between 15% and 20% of the usual numbers, mostly related to vascular emergencies (mainly CLI). 
All ambulatory consultations and imaging follow up (duplex and computed tomography scans) have been rescheduled and consultation by telephone is provided for those awaiting any arterial surgical repair.'), Cord19FullTextSection(title='', text='In addition, for those arriving with any medical or surgical pathology, "war like" decisions are driven by insufficient support to take care of everybody and, therefore, breaking the state of wellbeing that has been largely achieved in most European countries.'), Cord19FullTextSection(title='', text='Therefore, we want to focus on four points that we have faced so far and might help in improving care of, particularly, vascular patients.'), Cord19FullTextSection(title='', text='First, although the aim of protocols is to provide the best treatment for specific situations, they are based mostly on patient assessment. Despite the extensive use of telematic consultation with the aim of identifying patients at risk of clinical worsening (i.e. amputation risk in CLI patients), these "assessment based protocols" are far from working in this unusual scenario. People are simply not coming to hospital as they are extremely afraid; in addition, the first line of disease identification (family medicine) is nearly absent, as family doctors have already been distributed to emergency departments to aid in pulmonary care. Thus, a change in current protocol design considering these novel issues should be created.'), Cord19FullTextSection(title='', text='Second, nearly 20% of COVID-19 positive people are healthcare workers. This is of extreme importance, 3 as the lack of an unaffected workforce could be avoided with adequate and fast COVID testing, 4 receiving specific disease education and by using PPE (unfortunately not adequate in Spain). 
Moreover, the health professional (potentially infected and working actively with patients) can maintain the negative circle of contagion for those coming to hospital with other kinds of disease and their coworkers. This may lead to the organisation of specific COVID free surgical teams; however, this surely deserves a more in depth discussion in each centre.'), Cord19FullTextSection(title='', text='Third, due to the "extreme" situation, we are forced to optimise surgical care.'), Cord19FullTextSection(title='', text='What does this mean? Not necessarily provide the best care. It actually means "… keep in mind there is no mechanical post-operative support nor ICU beds for your patient …". One might think that an "endovascular first approach" is the best solution to deal with this issue. Protocols suggest treating ruptured AAA by endovascular aneurysm repair if anatomically feasible in patients with long life expectancy. The same may be considered for CLI. However, one may consider the global scenario where all has changed: we are not working in the usual operating room (it has been converted into an ICU for COVID-19 patients); we are without our usual nursing staff (they have been distributed according to hospital needs); and room availability is pretty scarce (as our > 800 bed hospital now shares only four or five COVID free surgical rooms for all surgical specialities plus one for identified COVID-19 patients).'), Cord19FullTextSection(title='', text='One case representing such a scenario is described in Fig. 1 . It is really hard to decide whether an endovascular or open approach seems best for our patients.'), Cord19FullTextSection(title='', text='Moreover, COVID patients are more prone to post-operative complications, as some of them present with lymphopenia, which reflects their immunodeficiency state. 
5 We hypothesise that an endovascular first approach (counterbalancing the global aforementioned picture) might be considered, as every patient hospitalised for any type of care should be considered potentially infected and therefore, the fewer wounds and grafts, the less chance of infection.'), Cord19FullTextSection(title='', text='It seems that COVID-19 might present with a procoagulant state (mostly in the inflammatory response and some days after it), as we have observed a higher frequency of deep vein and arterial thrombosis (very much like antiphospholipid syndrome).'), Cord19FullTextSection(title='', text='Further data are required to reach stronger conclusions and, certainly to provide any recommendation for aggressive anticoagulation therapy.'), Cord19FullTextSection(title='', text="We encourage the vascular community to spread any new knowledge related to this new disease. We do not have the answer yet for the best organisation or strategy and therefore, it should be adapted for the hospital/centre's resources. There is a lot of uncertainty now, but also in the future when the COVID-19 pandemic is overcome. Our weakened health system will need to deal with a scenario that might again exceed the available resources. Two weeks later, the patient presented with graft infection and COVID symptoms; he is receiving specific treatment and waiting for explantation and new revascularisation."), Cord19FullTextSection(title='Figure 1', text='A, anterior; and B, lateral view.'))), }) self._test_docs('cord19/trec-covid/round1', count=51078, items={ 0: Cord19Doc('xqhn0vbp', 'Airborne rhinovirus detection and effect of ultraviolet irradiation on detection by a semi-nested RT-PCR assay', '10.1186/1471-2458-3-5', '2003-01-13', re.compile('^BACKGROUND: Rhinovirus, the most common cause of upper respiratory tract infections, has been implic.{1492}ools\\. 
This method, however, cannot distinguish UV inactivated virus from infectious viral particles\\.$', flags=48)), 9: Cord19Doc('1ul8owic', "Air pollution and case fatality of SARS in the People's Republic of China: an ecologic study", '10.1186/1476-069x-2-15', '2003-11-20', re.compile('^BACKGROUND: Severe acute respiratory syndrome \\(SARS\\) has claimed 349 lives with 5,327 probable cases.{1763}etrimental effect of air pollution on the prognosis of SARS patients deserves further investigation\\.$', flags=48)), 51077: Cord19Doc('1peoakxt', 'The pathogenic role of virus-specific antibody secreting cells in the CNS of rats resistant and susceptible to coronavirus-induced encephalitis', '10.1016/0165-5728(91)91228-5', '1991-12-31', ''), }) self._test_docs('cord19/trec-covid/round2', count=59887, items={ 0: Cord19Doc('zjufx4fo', 'Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis', '10.1093/emboj/20.24.7220', '2001-12-17', re.compile('^Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to .{968} of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis\\.$', flags=48)), 9: Cord19Doc('3oxzzxnd', 'SseG, a virulence protein that targets Salmonella to the Golgi network', '10.1093/emboj/cdg517', '2003-10-01', re.compile('^Intracellular replication of the bacterial pathogen Salmonella enterica occurs in membrane\\-bound com.{1028} is dependent on simultaneous and selective interactions with both endocytic and secretory pathways\\.$', flags=48)), 59886: Cord19Doc('jxbr6hbk', 'Identifying spatio-temporal dynamics of Ebola in Sierra Leone using virus genomes', '10.1098/rsif.2017.0583', '2017-11-29', re.compile('^Containing the recent West African outbreak of Ebola virus \\(EBOV\\) required the deployment of substan.{1158}ffering operationally relevant guidance for surveillance and sampling strategies during an epidemic\\.$', flags=48)), }) 
self._test_docs('cord19/trec-covid/round3', count=128492, items={ 0: Cord19Doc('ug7v899j', 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', '10.1186/1471-2334-1-6', '2001-07-04', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48)), 9: Cord19Doc('i0zym7iq', 'Discontinuous and non-discontinuous subgenomic RNA transcription in a nidovirus', '10.1093/emboj/cdf635', '2002-12-01', re.compile('^Arteri\\-, corona\\-, toro\\- and roniviruses are evolutionarily related positive\\-strand RNA viruses, unit.{1021}ortant implications for the understanding of the mechanism and evolution of nidovirus transcription\\.$', flags=48)), 128491: Cord19Doc('clmtwq4v', 'The determinants of the 1999 and 2007 Chinese Golden Holiday System: A content analysis of official documentation', '10.1016/j.tourman.2009.10.003', '2009-11-06', re.compile('^This study examines the factors that lay behind the development of the Golden Week holiday system in.{587} in 2007\\. 
The theoretical contributions and practical implications of this study are also addressed\\.$', flags=48)), }) self._test_docs('cord19/trec-covid/round4', count=158274, items={ 0: Cord19Doc('ug7v899j', 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', '10.1186/1471-2334-1-6', '2001-07-04', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48)), 9: Cord19Doc('jg13scgo', 'Technical Description of RODS: A Real-time Public Health Surveillance System', '10.1197/jamia.m1345', '2003-09-01', re.compile('^This report describes the design and implementation of the Real\\-time Outbreak and Disease Surveillan.{1077} be a resource for implementing, evaluating, and applying new methods of public health surveillance\\.$', flags=48)), 158273: Cord19Doc('zvop8bxh', 'Antiviral RNAi therapy: emerging approaches for hitting a moving target', '10.1038/sj.gt.3302645', '2005-09-22', re.compile('^The field of directed RNA interference \\(RNAi\\) has rapidly developed into a highly promising approach.{1184}NAi targets and discuss strategies for translating these findings into effective clinical therapies\\.$', flags=48)), }) self._test_docs('cord19/trec-covid/round5', count=192509, items={ 0: Cord19Doc('ug7v899j', 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', '10.1186/1471-2334-1-6', '2001-07-04', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48)), 9: Cord19Doc('jg13scgo', 'Technical Description of RODS: A Real-time Public Health Surveillance System', 
'10.1197/jamia.m1345', '2003-09-01', re.compile('^This report describes the design and implementation of the Real\\-time Outbreak and Disease Surveillan.{1077} be a resource for implementing, evaluating, and applying new methods of public health surveillance\\.$', flags=48)), 192508: Cord19Doc('pnl9th2c', 'Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', '10.1016/j.ejvs.2020.04.040', '2020-05-12', ''), }) def test_cord19_queries(self): self._test_queries('cord19/trec-covid', count=50, items={ 0: TrecQuery('1', 'coronavirus origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 49: TrecQuery('50', 'mRNA vaccine coronavirus', 'what is known about an mRNA vaccine for the SARS-CoV-2 virus?', 'Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.'), }) self._test_queries('cord19/fulltext/trec-covid', count=50, items={ 0: TrecQuery('1', 'coronavirus origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 49: TrecQuery('50', 'mRNA vaccine coronavirus', 'what is known about an mRNA 
vaccine for the SARS-CoV-2 virus?', 'Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.'), }) self._test_queries('cord19/trec-covid/round1', count=30, items={ 0: TrecQuery('1', 'coronavirus origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 29: TrecQuery('30', 'coronavirus remdesivir', 'is remdesivir an effective treatment for COVID-19', 'seeking specific information on clinical outcomes in COVID-19 patients treated with remdesivir'), }) self._test_queries('cord19/trec-covid/round2', count=35, items={ 0: TrecQuery('1', 'coronavirus origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 34: TrecQuery('35', 'coronavirus public datasets', 'What new public datasets are available related to COVID-19?', 'Seeking articles that specifically release new data related to SARS-CoV-2 or COVID-19, including genomic data, patient data, public health data, etc. 
Articles that reference previously existing datasets are not relevant.'), }) self._test_queries('cord19/trec-covid/round3', count=40, items={ 0: TrecQuery('1', 'coronavirus origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 39: TrecQuery('40', 'coronavirus mutations', 'What are the observed mutations in the SARS-CoV-2 genome and how often do the mutations occur?', 'Looking for studies that describes the emergence of genomic diversity of the coronavirus due to recurrent mutations which explore the potential genomic site of the mutation, mechanisms and its potential or observed clinical implications in the pathogenicity of the virus'), }) self._test_queries('cord19/trec-covid/round4', count=45, items={ 0: TrecQuery('1', 'coronavirus origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 44: TrecQuery('45', 'coronavirus mental health impact', 'How has the COVID-19 pandemic impacted mental health?', 'Includes increasing/decreasing rates of depression, anxiety, panic disorder, and other psychiatric and mental health conditions.'), }) self._test_queries('cord19/trec-covid/round5', count=50, items={ 0: TrecQuery('1', 'coronavirus 
origin', 'what is the origin of COVID-19', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"), 9: TrecQuery('10', 'coronavirus social distancing impact', 'has social distancing had an impact on slowing the spread of COVID-19?', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"), 49: TrecQuery('50', 'mRNA vaccine coronavirus', 'what is known about an mRNA vaccine for the SARS-CoV-2 virus?', 'Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.'), }) def test_cord19_qrels(self): self._test_qrels('cord19/trec-covid', count=69318, items={ 0: TrecQrel('1', '005b2j4b', 2, '4.5'), 9: TrecQrel('1', '05vx82oo', 0, '3'), 69317: TrecQrel('50', 'zz8wvos9', 1, '5'), }) self._test_qrels('cord19/fulltext/trec-covid', count=69318, items={ 0: TrecQrel('1', '005b2j4b', 2, '4.5'), 9: TrecQrel('1', '05vx82oo', 0, '3'), 69317: TrecQrel('50', 'zz8wvos9', 1, '5'), }) self._test_qrels('cord19/trec-covid/round1', count=8691, items={ 0: TrecQrel('1', '010vptx3', 2, '0.5'), 9: TrecQrel('1', '0pbjttv4', 0, '1'), 8690: TrecQrel('30', 'zn87f1lk', 1, '0.5'), }) self._test_qrels('cord19/trec-covid/round2', count=12037, items={ 0: TrecQrel('1', '08efpohc', 0, '1.5'), 9: TrecQrel('1', '1bvsn9e8', 2, '2'), 12036: TrecQrel('35', 'zzmfhr2s', 0, '2'), }) self._test_qrels('cord19/trec-covid/round3', count=12713, items={ 0: TrecQrel('1', '0194oljo', 1, '2.5'), 9: TrecQrel('1', '0qpfoh5t', 0, '3'), 12712: TrecQrel('40', 'zsx7wfyj', 1, '3'), }) self._test_qrels('cord19/trec-covid/round4', count=13262, items={ 0: TrecQrel('1', '00fmeepz', 1, '4'), 9: TrecQrel('1', '1c47w4q5', 2, '4'), 13261: TrecQrel('45', 'zzrsk1ls', 2, '4'), }) self._test_qrels('cord19/trec-covid/round5', count=23151, 
items={ 0: TrecQrel('1', '005b2j4b', 2, '4.5'), 9: TrecQrel('1', '1ag9jkk6', 0, '4.5'), 23150: TrecQrel('50', 'zz8wvos9', 1, '5'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/cranfield.py ================================================ import re import unittest from ir_datasets.datasets.cranfield import CranfieldDoc from ir_datasets.formats import TrecQrel, GenericQuery from .base import DatasetIntegrationTest class TestCranfield(DatasetIntegrationTest): def test_docs(self): self._test_docs('cranfield', count=1400, items={ 0: CranfieldDoc('1', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .', re.compile('^experimental investigation of the aerodynamics of a\nwing in a slipstream \\.\n an experimental study o.{710}cal evaluation of the destalling effects was made for\nthe specific configuration of the experiment \\.$', flags=48), 'brenckman,m.', 'j. ae. scs. 25, 1958, 324.'), 9: CranfieldDoc('10', 'the theory of the impact tube at low pressure .', re.compile('^the theory of the impact tube at low pressure \\.\n a theoretical analysis has been made for an impact.{131}sures \\.\nit is shown that the results differ appreciably from the\ncorresponding continuum relations \\.$', flags=48), 'chambre,p.l. and schaaf,s.a.', 'j. ae. scs. 
15, 1948, 735.'), 1399: CranfieldDoc('1400', 'the buckling shear stress of simply-supported infinitely\nlong plates with transverse stiffeners .', re.compile('^the buckling shear stress of simply\\-supported infinitely\nlong plates with transverse stiffeners \\.\n .{466}lete range\nof stiffnesses, for panels with ratios of width to stiffener spacing of\ngraphical forms \\.$', flags=48), 'kleeman,p.w.', 'arc r + m.2971, 1953.'), }) def test_queries(self): self._test_queries('cranfield', count=225, items={ 0: GenericQuery('1', 'what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .'), 9: GenericQuery('10', 'are real-gas transport properties for air available over a wide range of\nenthalpies and densities .'), 224: GenericQuery('225', 'what design factors can be used to control lift-drag ratios at mach\nnumbers above 5 .'), }) def test_qrels(self): self._test_qrels('cranfield', count=1837, items={ 0: TrecQrel('1', '184', 2, '0'), 9: TrecQrel('1', '57', 2, '0'), 1836: TrecQrel('225', '1188', -1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/csl.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel, ExctractedCCNoReportQuery from ir_datasets.datasets.csl import CslDoc from .base import DatasetIntegrationTest class TestCsl(DatasetIntegrationTest): def test_docs(self): self._test_docs('csl/trec-2023', count=395927, items={ 0: CslDoc('csl-387565', '谈若敖氏', re.compile('^"\'若敖之鬼馁而\',也是一件人生的大哀",这句话是说阿Q需要一个女人的念头不仅合符礼制:所谓"不孝有三,无后为大";而且也是十分现实的事情,即自己饿死了尚不要紧,连祖先也没有人供奉了则是一件大事\\.其实.{7}冠冕堂皇,阿Q私心至多不过这么想:若没有一个女人,"断子绝孙便没有人供一碗饭"\\.自然这就同"若敖之鬼馁而"的境遇一样了\\.当然无论阿Q是否知书识礼,此话用在阿Q身上就成了歪理,不过歪理对于阿Q就是真理\\.$', flags=48), ['女人', '真理', '人生', '礼制', '境遇'], '文学', 'Literature', '中国语言文学', 'Chinese Literature'), 9: CslDoc('csl-153711', '图书馆的"计算机2000年问题"及解决措施', 
re.compile('^"2000年问题"是由于早期编程人员为了节省存储空间,用年代的后两位数字代替四位计年而造成的\\.就是说随着人类迈入2000年,某些计算机系统将会出现一片混乱\\.首先软件计数将会转化成"00",这和代表19.{17}可能会自动初始化\\.例如1997年用97表示,简单方便,但1900年用00表示,而2000年的表示方式仍然是"00",这就导致了系统在处理与日期相关的问题时,出现一系列的错误\\.这就是"2000年问题"\\.$', flags=48), ['图书馆', '计算机系统', '2000年问题', '存储空间', '表示方式', '初始化', '转化', '硬件', '四位', '数字', '软件', '人员', '计数', '处理', '编程'], '管理学', 'Management', '图书馆、情报与档案管理', 'Library Information and Archives Management'), 395926: CslDoc('csl-042248', '少数民族理科学习困境的因素分析', re.compile('^__摘_理科教育质量事关少数民族学生认知发展、就业前途以及社会和谐与稳定。本文分析影响少数民族理科教育质量的主要因素:民族地区语言-教学模式处于新的探索阶段,理科教师的语言转换能力难度较大,理科课程标.{63}教育需定位于培养学生的基本科学素养,以及解决生产生活问题的实际能力。建议开设“语言与文化适宜的教学法”等课程,提高民族地区教师的教学能力和语言转化能力;加强基于教育与心理的实证研究与理科教学实践探索。$', flags=48), ['少数民族', '语言-教学模式', '理科学习', '语言和文化适宜'], '教育学', 'Pedagogy', '教育学', 'Pedagogy'), }) def test_queries(self): self._test_queries('csl/trec-2023', count=41, items={ 0: ExctractedCCNoReportQuery('1', 'human reproductive system', 'I am looking for articles describing the human reproductive system.', 'Articles describing the human reproductive system in biological terms from the perspective of a modern biologist are considered relevant. Articles describing the overall system such as a general description of the components or a lower level description such as specific cellular or molecular pathways involved in either the early development or adult stage maintenance of the systemo are relevant. Descriptions of mammalian reproductive system in general are considere somewhat valuable. Articles describing the reproductive system of a species outside of the class of mammalia are not relevant. 
Articles talking about human reproduction from other perspectives outside that of the biological sciences, such as from a historical or political perspective are not relevant.', '人类生殖系统', '我希望找到关于人类生殖系统的文章。', '', '人类生殖系统', '我正在寻找描述人类生殖系统的文章。', '从现代生物学家的角度用生物学术语描述人类生殖系统的文章被认为是相关的。描述整个系统的文章(例如组件的一般描述)或较低级别的描述(例如涉及系统早期发育或成年阶段维护的特定细胞或分子途径)是相关的。一般而言,对哺乳动物生殖系统的描述被认为具有一定的价值。描述哺乳动物纲以外物种的生殖系统的文章不相关。从生物科学之外的其他角度(例如从历史或政治角度)讨论人类生殖的文章是不相关的。', 'zh'), 9: ExctractedCCNoReportQuery('10', 'Internet improve rural economy', 'How does Internet availability improve the rural economy?', 'Relevant articles should focus on the process of Internet availability contributing to the rural economy, but not on the result that what the government should do to utilize the Internet to improve the rural economy. Focusing on the influence on the city economy is also deemed irrelevant. Documents talking about how technology/ some specific components of the Internet improves the rural economy/urbanization would be considered somewhat valuable.', '互联网 提升 乡村的 经济', '互联网普及如何提升乡村经济?', '', '互联网带动农村经济', '互联网的普及如何改善农村经济?', '相关文章应该关注互联网对农村经济的贡献过程,而不是政府应该如何利用互联网来改善农村经济的结果。关注对城市经济的影响也被认为是无关紧要的。讨论技术/互联网的某些特定组成部分如何改善农村经济/城市化的文件将被认为是有价值的。', 'zh'), 40: ExctractedCCNoReportQuery('41', 'Pure gold nanoparticle production', 'I am looking for articles discussing the production of pure gold nanoparticles.', 'Only articles related to the production process of pure gold nano particles are considered relevant. Documents discussing the production of nanoparticles made off materials other than pure gold are not considered relevant. 
Additionally, documents focusing on the property or applications of the pure gold nano particles rather than their production are not considered relevant.', '纯金纳米颗粒的制备', '与纯金纳米颗粒的制作工艺相关的文章。', '', '纯金纳米颗粒生产', '我正在寻找讨论纯金纳米颗粒生产的文章。', '只有与纯金纳米粒子的生产过程相关的文章才被认为是相关的。讨论由纯金以外的材料制成的纳米颗粒的生产的文件被认为不相关。此外,关注纯金纳米颗粒的特性或应用而不是其生产的文件被认为不相关。', 'zh'), }) def test_qrels(self): self._test_qrels('csl/trec-2023', count=11291, items={ 0: TrecQrel('1', 'csl-017102', 0, '0'), 9: TrecQrel('1', 'csl-213347', 0, '0'), 11290: TrecQrel('41', 'csl-166443', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/disks45.py ================================================ import re import unittest from ir_datasets.datasets.cranfield import CranfieldDoc from ir_datasets.formats import TrecQrel, TrecParsedDoc, TrecQuery from .base import DatasetIntegrationTest class TestDisks45(DatasetIntegrationTest): def test_docs(self): self._test_docs('disks45/nocr', count=528155, items={ 0: TrecParsedDoc('FBIS3-1', 'FORMER YUGOSLAV REPUBLIC OF MACEDONIA: OPINION POLLS ON', re.compile('^POLITICIANS, PARTY PREFERENCES\nSummary: Newspapers in the Former Yugoslav Republic of\nMacedonia have.{5410}S, PLEASE CALL CHIEF,\nBALKANS BRANCH AT \\(703\\) 733\\-6481\\)\nELAG/25 February/POLCHF/EED/DEW 28/2023Z FEB$', flags=48), re.compile(b'^<DOC>\n<DOCNO> FBIS3\\-1 </DOCNO>\n<HT> "cr00000011094001" </HT>\n\n\n<HEADER>\n<H2> March Reports </H2>\n.{7224} \nBALKANS BRANCH AT \\(703\\) 733\\-6481\\) \n\nELAG/25 February/POLCHF/EED/DEW 28/2023Z FEB \n\n</TEXT>\n\n</DOC>$', flags=16)), 9: TrecParsedDoc('FBIS3-10', 'Vietnam-Libya', re.compile('^Hanoi Finds New Outlet for Surplus Labor\nJudging by a 1 March VNA report, Hanoi has found new\nopport.{751}COMMENTS, PLEASE CALL CHIEF,\nASIA DIVISION ANALYSIS TEAM, \\(703\\) 733\\-6534\\.\\)\nEAG/BIETZ/ta 07/2051z mar$', flags=48), re.compile(b'^<DOC>\n<DOCNO> FBIS3\\-10 </DOCNO>\n<HT> "cr00000011994001" 
</HT>\n\n\n<HEADER>\n<DATE1> 9 March 1994.{1274}L CHIEF, \nASIA DIVISION ANALYSIS TEAM, \\(703\\) 733\\-6534\\.\\) \nEAG/BIETZ/ta 07/2051z mar \n\n</TEXT>\n\n</DOC>$', flags=16)), 528154: TrecParsedDoc('LA123190-0134', "SHORT TAKES;\nTAMMY SEES COUNTRY'S REBIRTH", re.compile('^December 31, 1990, Monday, P\\.M\\. Final\nTammy Wynette says a new generation of performers has helped p.{470}is, Ricky Van Shelton,\nClint Black, Patty Loveless and Garth Brooks as among those making an impact\\.$', flags=48), re.compile(b'^<DOC>\n<DOCNO> LA123190\\-0134 </DOCNO>\n<DOCID> 329701 </DOCID>\n<DATE>\n<P>\nDecember 31, 1990, Monday, .{972}th Brooks as among those making an impact\\. \n</P>\n</TEXT>\n<TYPE>\n<P>\nBrief; Wire \n</P>\n</TYPE>\n</DOC>$', flags=16)), }) def test_queries(self): self._test_queries('disks45/nocr/trec-robust-2004', count=250, items={ 0: TrecQuery('301', 'International Organized Crime', 'Identify organizations that participate in international criminal\nactivity, the activity, and, if possible, collaborating organizations\nand the countries involved.', 'A relevant document must as a minimum identify the organization and the\ntype of illegal activity (e.g., Columbian cartel exporting cocaine).\nVague references to international drug trade without identification of\nthe organization(s) involved would not be relevant.'), 9: TrecQuery('310', 'Radio Waves and Brain Cancer', 'Evidence that radio waves from radio towers or car phones affect\nbrain cancer occurrence.', 'Persons living near radio towers and more recently persons using\ncar phones have been diagnosed with brain cancer. The argument \nrages regarding the direct association of one with the other.\nThe incidence of cancer among the groups cited is considered, by\nsome, to be higher than that found in the normal population. 
A \nrelevant document includes any experiment with animals, statistical \nstudy, articles, news items which report on the incidence of brain \ncancer being higher/lower/same as those persons who live near a \nradio tower and those using car phones as compared to those in the \ngeneral population.'), 249: TrecQuery('700', 'gasoline tax U.S.', 'What are the arguments for and against an increase in gasoline\ntaxes in the U.S.?', 'Relevant documents present reasons for or against raising gasoline taxes\nin the U.S. Documents discussing rises or decreases in the price of\ngasoline are not relevant.'), }) self._test_queries('disks45/nocr/trec-robust-2004/fold1', count=50, items={ 0: TrecQuery('302', 'Poliomyelitis and Post-Polio', 'Is the disease of Poliomyelitis (polio) under control in the\nworld?', 'Relevant documents should contain data or outbreaks of the \npolio disease (large or small scale), medical protection \nagainst the disease, reports on what has been labeled as \n"post-polio" problems. Of interest would be location of \nthe cases, how severe, as well as what is being done in \nthe "post-polio" area.'), 9: TrecQuery('341', 'Airport Security', 'A relevant document would discuss how effective\ngovernment orders to better scrutinize passengers\nand luggage on international flights and to step\nup screening of all carry-on baggage has been.', 'A relevant document would contain reports on what\nnew steps airports worldwide have taken to better \nscrutinize passengers and their luggage on \ninternational flights and to step up screening of\nall carry-on baggage. With the increase in \ninternational terrorism and in the wake of the\nTWA Flight 800 disaster, articles on airport \nsecurity relating in particular to additional\nsteps taken by airports to increase flight safety\nwould be relevant. The mere mention of enhanced \nsecurity does not constitute relevance. 
Additional\nsteps refer to something beyond just passenger\nand carry-on screening using the normal methods.\nExamples of new steps would be additional personnel, \nsophisticated monitoring and screening devices, \nand extraordinary measures to check luggage in the \nbaggage compartment.'), 49: TrecQuery('700', 'gasoline tax U.S.', 'What are the arguments for and against an increase in gasoline\ntaxes in the U.S.?', 'Relevant documents present reasons for or against raising gasoline taxes\nin the U.S. Documents discussing rises or decreases in the price of\ngasoline are not relevant.'), }) self._test_queries('disks45/nocr/trec-robust-2004/fold2', count=50, items={ 0: TrecQuery('301', 'International Organized Crime', 'Identify organizations that participate in international criminal\nactivity, the activity, and, if possible, collaborating organizations\nand the countries involved.', 'A relevant document must as a minimum identify the organization and the\ntype of illegal activity (e.g., Columbian cartel exporting cocaine).\nVague references to international drug trade without identification of\nthe organization(s) involved would not be relevant.'), 9: TrecQuery('349', 'Metabolism', 'Document will discuss the chemical reactions \nnecessary to keep living cells healthy and/or\nproducing energy.', 'A relevant document will contain specific information\non the catabolic and anabolic reactions of the metabolic\nprocess. 
Relevant information includes, but is not \nlimited to, the reactions occurring in metabolism, \nbiochemical processes (Glycolysis or Krebs cycle for\nproduction of energy), and disorders associated with \nthe metabolic rate.'), 49: TrecQuery('698', 'literacy rates Africa', 'What are literacy rates in African countries?', 'A relevant document will contain information about the\nliteracy rate in an African country.\nGeneral education levels that do not specifically include literacy rates\nare not relevant.'), }) self._test_queries('disks45/nocr/trec-robust-2004/fold3', count=50, items={ 0: TrecQuery('306', 'African Civilian Deaths', 'How many civilian non-combatants have been killed in \nthe various civil wars in Africa?', 'A relevant document will contain specific casualty \ninformation for a given area, country, or region. \nIt will cite numbers of civilian deaths caused \ndirectly or indirectly by armed conflict.'), 9: TrecQuery('354', 'journalist risks', 'Identify instances where a journalist has been put at risk (e.g.,\nkilled, arrested or taken hostage) in the performance of his work.', 'Any document identifying an instance where a journalist or \ncorrespondent has been killed, arrested or taken hostage in the \nperformance of his work is relevant.'), 49: TrecQuery('693', 'newspapers electronic media', 'What has been the effect of the electronic media on the newspaper\nindustry?', 'Relevant documents must explicitly attribute effects to the electronic\nmedia: information about declining readership is irrelevant unless\nit attributes the cause to the electronic media.'), }) self._test_queries('disks45/nocr/trec-robust-2004/fold4', count=50, items={ 0: TrecQuery('320', 'Undersea Fiber Optic Cable', "Fiber optic link around the globe (Flag) will be\nthe world's longest undersea fiber optic cable.\nWho's involved and how extensive is the technology\non this system. 
What problems exist?", 'Relevant documents will reference companies involved\nin building the system or the technology needed for\nsuch an endeavor. Of relevance also would be information\non the link up points of FLAG or landing sites or \ninterconnection with other telecommunication cables.\nRelevant documents may reference any regulatory problems\nwith the system once constructed. A non-relevant \ndocument would contain information on other fiber optic\nsystems currently in place.'), 9: TrecQuery('355', 'ocean remote sensing', 'Identify documents discussing the development and application of\nspaceborne ocean remote sensing.', 'Documents discussing the development and application of spaceborne \nocean remote sensing in oceanography, seabed prospecting and \nmining, or any marine-science activity are relevant. Documents \nthat discuss the application of satellite remote sensing in \ngeography, agriculture, forestry, mining and mineral prospecting \nor any land-bound science are not relevant, nor are references \nto international marketing or promotional advertizing of any \nremote-sensing technology. Synthetic aperture radar (SAR) \nemployed in ocean remote sensing is relevant.'), 49: TrecQuery('697', 'air traffic controller', 'What are working conditions and pay for U.S. air traffic controllers?', 'Relevant documents tell something about working conditions\nor pay for American controllers. Documents about foreign\ncontrollers or an individual controller are not relevant.'), }) self._test_queries('disks45/nocr/trec-robust-2004/fold5', count=50, items={ 0: TrecQuery('304', 'Endangered Species (Mammals)', 'Compile a list of mammals that are considered to be endangered,\nidentify their habitat and, if possible, specify what threatens them.', 'Any document identifying a mammal as endangered is relevant. \nStatements of authorities disputing the endangered status would also\nbe relevant. 
A document containing information on habitat and\npopulations of a mammal identified elsewhere as endangered would also\nbe relevant even if the document at hand did not identify the species\nas endangered. Generalized statements about endangered species \nwithout reference to specific mammals would not be relevant.'), 9: TrecQuery('339', "Alzheimer's Drug Treatment", "What drugs are being used in the treatment of \nAlzheimer's Disease and how successful are they?", "A relevant document should name a drug used in \nthe treatment of Alzheimer's Disease and also \nits manufacturer, and should give some indication \nof the drug's success or failure."), 49: TrecQuery('699', 'term limits', 'What are the pros and cons of term limits?', 'Relevant documents reflect an opinion on the value of term limits\nwith accompanying reason(s). Documents that cite the status of term\nlimit legislation or opinions on the issue sans reasons for the opinion\nare not relevant.'), }) self._test_queries('disks45/nocr/trec7', count=50, items={ 0: TrecQuery('351', 'Falkland petroleum exploration', 'What information is available on petroleum exploration in \nthe South Atlantic near the Falkland Islands?', 'Any document discussing petroleum exploration in the\nSouth Atlantic near the Falkland Islands is considered\nrelevant. Documents discussing petroleum exploration in \ncontinental South America are not relevant.'), 9: TrecQuery('360', 'drug legalization benefits', 'What are the benefits, if any, of drug legalization?', 'Relevant documents may contain information on perceived\nbenefits of drug legalization, such as crime reduction, \nimproved treatment using monies which otherwise would \nhave gone for crime fighting, reduced drug addiction, \nand increased governmental income. 
Documents that \ndiscuss drug legalization and whether legalization\nis or is not perceived to be beneficial are relevant.'), 49: TrecQuery('400', 'Amazon rain forest', 'What measures are being taken by local South\nAmerican authorities to preserve the Amazon\ntropical rain forest?', 'Relevant documents may identify: the official \norganizations, institutions, and individuals\nof the countries included in the Amazon rain\nforest; the measures being taken by them to\npreserve the rain forest; and indications of\ndegrees of success in these endeavors.'), }) self._test_queries('disks45/nocr/trec8', count=50, items={ 0: TrecQuery('401', 'foreign minorities, Germany', 'What language and cultural differences impede the integration \nof foreign minorities in Germany?', 'A relevant document will focus on the causes of the lack of\nintegration in a significant way; that is, the mere mention of\nimmigration difficulties is not relevant. Documents that discuss\nimmigration problems unrelated to Germany are also not relevant.'), 9: TrecQuery('410', 'Schengen agreement', 'Who is involved in the Schengen agreement to eliminate border\ncontrols in Western Europe and what do they hope to accomplish?', 'Relevant documents will contain any information about the\nactions of signatories of the Schengen agreement such as:\nmeasures to eliminate border controls (removal of traffic\nobstacles, lifting of traffic restrictions); implementation\nof the information system data bank that contains unified\nvisa issuance procedures; or strengthening of border controls\nat the external borders of the treaty area in exchange for \nfree movement at the internal borders. 
Discussions of border \ncrossovers for business purposes are not relevant.'), 49: TrecQuery('450', 'King Hussein, peace', 'How significant a figure over the years was the late\nJordanian King Hussein in furthering peace in the \nMiddle East?', "A relevant document must include mention of Israel;\nKing Hussein himself as opposed to other Jordanian\nofficials; discussion of the King's on-going, previous \nor upcoming efforts; and efforts pertinent to the peace \nprocess, not merely Jordan's relationship with other \nmiddle-east countries or the U.S."), }) def test_qrels(self): self._test_qrels('disks45/nocr/trec-robust-2004', count=311410, items={ 0: TrecQrel('301', 'FBIS3-10082', 1, '0'), 9: TrecQrel('301', 'FBIS3-10635', 0, '0'), 311409: TrecQrel('700', 'LA123090-0137', 0, '0'), }) self._test_qrels('disks45/nocr/trec-robust-2004/fold1', count=62789, items={ 0: TrecQrel('302', 'FBIS3-10615', 0, '0'), 9: TrecQrel('302', 'FBIS3-22470', 0, '0'), 62788: TrecQrel('700', 'LA123090-0137', 0, '0'), }) self._test_qrels('disks45/nocr/trec-robust-2004/fold2', count=63917, items={ 0: TrecQrel('301', 'FBIS3-10082', 1, '0'), 9: TrecQrel('301', 'FBIS3-10635', 0, '0'), 63916: TrecQrel('698', 'LA123190-0100', 0, '0'), }) self._test_qrels('disks45/nocr/trec-robust-2004/fold3', count=62901, items={ 0: TrecQrel('306', 'FBIS3-1010', 0, '0'), 9: TrecQrel('306', 'FBIS3-13331', 0, '0'), 62900: TrecQrel('693', 'LA122789-0115', 0, '0'), }) self._test_qrels('disks45/nocr/trec-robust-2004/fold4', count=57962, items={ 0: TrecQrel('320', 'FBIS3-10291', 0, '0'), 9: TrecQrel('320', 'FBIS3-20327', 0, '0'), 57961: TrecQrel('697', 'LA122589-0068', 0, '0'), }) self._test_qrels('disks45/nocr/trec-robust-2004/fold5', count=63841, items={ 0: TrecQrel('304', 'FBIS3-1584', 0, '0'), 9: TrecQrel('304', 'FBIS3-37947', 0, '0'), 63840: TrecQrel('699', 'LA123190-0008', 0, '0'), }) self._test_qrels('disks45/nocr/trec7', count=80345, items={ 0: TrecQrel('351', 'FBIS3-10411', 0, '0'), 9: TrecQrel('351', 
'FBIS3-11107', 1, '0'), 80344: TrecQrel('400', 'LA123190-0051', 0, '0'), }) self._test_qrels('disks45/nocr/trec8', count=86830, items={ 0: TrecQrel('401', 'FBIS3-10009', 0, '0'), 9: TrecQrel('401', 'FBIS3-11424', 0, '0'), 86829: TrecQrel('450', 'LA123190-0061', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/dpr_w100.py ================================================ import re import unittest from ir_datasets.datasets.dpr_w100 import DprW100Doc, DprW100Query from ir_datasets.formats import TrecQrel, TrecQuery from .base import DatasetIntegrationTest class TestDprW100(DatasetIntegrationTest): def test_docs(self): self._test_docs('dpr-w100', count=21015324, items={ 0: DprW100Doc('1', re.compile('^"Aaron Aaron \\( or ; ""Ahärôn""\\) is a prophet, high priest, and the brother of Moses in the Abrahamic.{412} brother\'s spokesman \\(""prophet""\\) to the Pharaoh\\. Part of the Law \\(Torah\\) that Moses received from"$', flags=48), 'Aaron'), 9: DprW100Doc('10', re.compile('^"families some time in Israel\'s past\\. Others argue that the story simply shows what can happen if th.{397}ho affirmed Moses\' uniqueness as the one with whom the spoke face to face\\. Miriam was punished with"$', flags=48), 'Aaron'), 21015323: DprW100Doc('21015324', re.compile('^"committee was established before the building was opened\\. 
It is the District Nursing base for North.{425}ontains 81 extra care apartments two GP surgeries, a public library, a community café, an optician,"$', flags=48), '"Limelight centre"'), }) def test_queries(self): self._test_queries('dpr-w100/natural-questions/train', count=58880, items={ 0: DprW100Query('0', 'big little lies season 2 how many episodes', ('seven',)), 9: DprW100Query('9', 'who is in charge of enforcing the pendleton act of 1883', ('United States Civil Service Commission',)), 58879: DprW100Query('58879', 'who plays the army guy in pitch perfect 3', ('Matt Lanter', 'Troy Ian Hall')), }) self._test_queries('dpr-w100/natural-questions/dev', count=6515, items={ 0: DprW100Query('0', 'who sings does he love me with reba', ('Linda Davis',)), 9: DprW100Query('9', 'what is the name of wonder womans mother', ('Queen Hippolyta',)), 6514: DprW100Query('6514', 'girl from the shut up and dance video', ('Lauren Taft',)), }) self._test_queries('dpr-w100/trivia-qa/train', count=78785, items={ 0: DprW100Query('0', 'Who was President when the first Peanuts cartoon was published?', ('Presidency of Harry S. Truman', 'Hary truman', 'Harry Shipp Truman', "Harry Truman's", 'Harry S. Truman', 'Harry S.Truman', 'Harry S Truman', 'H. S. Truman', 'President Harry Truman', 'Truman administration', 'Presidency of Harry Truman', 'Mr. Citizen', 'HST (president)', 'H.S. Truman', 'Mary Jane Truman', 'Harry Shippe Truman', 'S truman', 'Harry Truman', 'President Truman', '33rd President of the United States', 'Truman Administration', 'Harry Solomon Truman', 'Harold Truman', 'Harry truman', 'H. 
Truman')), 9: DprW100Query('9', 'Which was the first European country to abolish capital punishment?', ('Norvège', 'Mainland Norway', 'Norway', 'Norvege', 'Noregur', 'NORWAY', 'Norwegian state', 'Etymology of Norway', 'Noruega', 'Norwegen', 'ISO 3166-1:NO', 'Noreg', 'Republic of Norway', 'Norwegian kingdom', 'Kongeriket Noreg', 'Name of Norway', 'Kongeriket Norge', 'Noorwegen', 'Kingdom of Norway', 'Sport in Norway', 'Norwegia', 'Royal Kingdom of Norway')), 78784: DprW100Query('78784', 'According to the Bart Simpsons TV ad, Nobody better lay a finger on my what??', ('Butterfingers Snackerz', 'Butterfinger (ice cream)', 'Butterfinger Crisp', 'Nestlé Butterfinger', 'Butterfinger Snackerz', 'Butterfinger Ice Cream Bars', "Butterfinger BB's", 'Butterfinger', 'The Butterfinger Group')), }) self._test_queries('dpr-w100/trivia-qa/dev', count=8837, items={ 0: DprW100Query('0', 'The VS-300 was a type of what?', ('🚁', 'Helicopters', 'Civilian helicopter', 'Pescara (helicopter)', 'Cargo helicopter', 'Copter', 'Helecopter', 'List of deadliest helicopter crashes', 'Helichopper', 'Helocopter', 'Cargo Helicopter', 'Helicopter', 'Helicoptor', 'Anatomy of a helicopter')), 9: DprW100Query('9', 'Who wrote The Turn Of The Screw in the 19th century and The Ambassadors in the 20th?', ('The Finer Grain', 'Henry james', 'James, Henry', 'Henry James')), 8836: DprW100Query('8836', 'Name the artist and the title of this 1978 classic that remains popular today: We were at the beach Everybody had matching towels Somebody went under a dock And there they saw a rock It wasnt a rock', ('Rock Lobster by the B-52s',)), }) def test_qrels(self): self._test_qrels('dpr-w100/natural-questions/train', count=8856662, items={ 0: TrecQrel('0', '18768923', 2, '0'), 9: TrecQrel('0', '928112', 0, '0'), 8856661: TrecQrel('58879', '14546521', -1, '0'), }) self._test_qrels('dpr-w100/natural-questions/dev', count=979893, items={ 0: TrecQrel('0', '11828866', 2, '0'), 9: TrecQrel('0', '9446572', 0, '0'), 979892: 
TrecQrel('6514', '11133390', -1, '0'), }) self._test_qrels('dpr-w100/trivia-qa/train', count=7878500, items={ 0: TrecQrel('0', '525858', 0, '0'), 9: TrecQrel('0', '16254256', 0, '0'), 7878499: TrecQrel('78784', '5674041', 0, '0'), }) self._test_qrels('dpr-w100/trivia-qa/dev', count=883700, items={ 0: TrecQrel('0', '7108855', 1, '0'), 9: TrecQrel('0', '10764863', 0, '0'), 883699: TrecQrel('8836', '9491145', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/dummy.py ================================================ import re import os import shutil import unittest import ir_datasets from ir_datasets.formats import GenericQuery, GenericDoc, TrecQrel from .base import DatasetIntegrationTest class TestDummy(DatasetIntegrationTest): def test_dummy_docs(self): dataset = ir_datasets.create_dataset( docs_tsv='test/dummy/docs.tsv', queries_tsv='test/dummy/queries.tsv', qrels_trec='test/dummy/qrels' ) self._test_docs(dataset, count=15, items={ 0: GenericDoc('T1', 'CUT, CAP AND BALANCE. TAXED ENOUGH ALREADY!'), 9: GenericDoc('T10', 'Perhaps this is the kind of thinking we need in Washington ...'), 14: GenericDoc('T15', "I've been visiting Trump Int'l Golf Links Scotland and the course will be unmatched anywhere in the world. 
Spectacular!"), }) def test_dummy_queries(self): dataset = ir_datasets.create_dataset( docs_tsv='test/dummy/docs.tsv', queries_tsv='test/dummy/queries.tsv', qrels_trec='test/dummy/qrels' ) self._test_qrels(dataset, count=55, items={ 0: TrecQrel('1', 'T1', 0, '0'), 9: TrecQrel('1', 'T11', 0, '0'), 54: TrecQrel('4', 'T15', 0, '0'), }) def test_dummy_qrels(self): dataset = ir_datasets.create_dataset( docs_tsv='test/dummy/docs.tsv', queries_tsv='test/dummy/queries.tsv', qrels_trec='test/dummy/qrels' ) self._test_queries(dataset, count=4, items={ 0: GenericQuery('1', 'republican party'), 3: GenericQuery('4', 'media'), }) def tearDown(self): if os.path.exists('test/dummy/docs.tsv.pklz4'): shutil.rmtree('test/dummy/docs.tsv.pklz4') if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/gov.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.gov import GovWeb02Query, GovDoc from ir_datasets.formats import TrecQrel, TrecQuery, GenericQuery from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestGov(DatasetIntegrationTest): def test_docs(self): self._test_docs('gov', count=1247753, items={ 0: GovDoc('G00-00-0000000', 'http://www.aspe.hhs.gov', 'HTTP/1.0 200 OK\r\nDate: Wed, 30 Jan 2002 17:00:23 GMT\r\nServer: WebSitePro/3.0.37\r\nAccept-ranges: bytes\r\nContent-type: text/html\r\nLast-modified: Fri, 18 Jan 2002 19:04:17 GMT\r\nContent-length: 8228\r\n', re.compile(b'^<!DOCTYPE HTML PUBLIC "\\-//w3c//dtd html 4\\.0 transitional//en" "http://www\\.w3\\.org/TR/REC\\-html40/loose.{8029} \n\\\t\\\t<P> <FONT SIZE="\\-1">Last updated on January 18, 2002\\.</FONT>\n\\\t\\\t </P></CENTER> </BODY>\n</HTML>\n\n$', flags=16), 'text/html'), 9: GovDoc('G00-00-0066311', 'http://www.oso.noaa.gov', re.compile('^HTTP/1\\.1 200 OK\\\r\nServer: Microsoft\\-IIS/4\\.0\\\r\nContent\\-Location: 
http://www\\.oso\\.noaa\\.gov/Index\\.htm\\\r\nDat.{80}\nLast\\-Modified: Mon, 07 Jan 2002 16:16:17 GMT\\\r\nETag: "28b3d1a29697c11:3a83"\\\r\nContent\\-Length: 12431\\\r\n$', flags=48), re.compile(b'^<html>\\\r\n\\\r\n<head lang="en\\-US">\\\r\n<meta http\\-equiv="Content\\-Type" content="text/html; charset=windows\\-1.{12232}tranpix\\.GIF" width="1" height="1" alt=""></td>\\\r\n </tr>\\\r\n </table>\\\r\n</div>\\\r\n\\\r\n</body>\\\r\n</html>\\\r\n\n$', flags=16), 'text/html'), 1247752: GovDoc('G46-12-4054845', 'http://www.tfhrc.gov/pavement/ltpp/pdf/01-024d.pdf', re.compile('^HTTP/1\\.1 200 OK\\\r\nDate: Mon, 04 Feb 2002 01:07:07 GMT\\\r\nServer: Apache/1\\.3\\.6 \\(Unix\\)\\\r\nLast\\-Modified: We.{82}Content\\-Length: 977004\\\r\nConnection: close\\\r\nContent\\-Type: application/pdf\\\r\nX\\-Pad: avoid browser bug\\\r\n$', flags=48), re.compile(b'^ Mean\\(Neg Area\\)\n\n 0 \\.001 \\.01 \\.05\\.10 \\..{101813} 0\\.0% 0\n\n Moments\n\n Mean 10722\\.77\n Std Dev \n$', flags=16), 'application/pdf'), }) def test_queries(self): self._test_queries('gov/trec-web-2002', count=50, items={ 0: TrecQuery('551', 'intellectual property', 'Find documents related to laws or regulations that protect\nintellectual property.', 'Relevant documents describe legislation or federal regulations that\nprotect authors or composers from copyright infringement, or from\npiracy of their creative work. These regulations may also be related\nto fair use or to encryption.'), 9: TrecQuery('560', 'Symptoms of diabetes', 'Find documents that list and explain the danger signals of Type I and\nType II diabetes.', 'Relevant documents give the symptoms/danger signals that\nconsumers need to recognize as warnings of the onset of Type I and\nType II diabetes. 
Exclude documents directed at medical personnel.'), 49: TrecQuery('600', 'highway safety', 'Find documents related to improving highway safety in the U.S.', 'Relevant documents include those related to the improvement of safety\nof all vehicles driven on highways, including cars, trucks, vans, and\ntractor trailers. Ways to reduce accidents through legislation,\nvehicle checks, and drivers education programs are all relevant.'), }) self._test_queries('gov/trec-web-2002/named-page', count=150, items={ 0: GenericQuery('1', "America's Century Farms"), 9: GenericQuery('10', "FBI's most wanted list"), 149: GenericQuery('150', 'employee access FDIC DC'), }) self._test_queries('gov/trec-web-2003', count=50, items={ 0: GovWeb02Query('1', 'mining gold silver coal', 'What can be learned about the location of mines\nin the U.S., about the extent of mineral resources,\nand about careers in the mining industry?'), 9: GovWeb02Query('10', 'Physical Fitness', 'Information on Physical Fitness'), 49: GovWeb02Query('50', 'anthrax', 'Info regarding prevention and treatment of the disease anthrax.'), }) self._test_queries('gov/trec-web-2003/named-page', count=300, items={ 0: GenericQuery('151', 'ADA Enforcement'), 9: GenericQuery('160', 'NSF Fact Sheet January 2002'), 299: GenericQuery('450', 'Surgeon General report on schizophrenia'), }) self._test_queries('gov/trec-web-2004', count=225, items={ 0: GenericQuery('1', 'Electoral College'), 9: GenericQuery('10', 'well water contamination'), 224: GenericQuery('225', 'Japanese surrender document'), }) def test_gov2_qrels(self): self._test_qrels('gov/trec-web-2002', count=56650, items={ 0: TrecQrel('551', 'G14-77-3709129', 0, '0'), 9: TrecQrel('551', 'G22-94-3703003', 1, '0'), 56649: TrecQrel('600', 'G12-45-0115310', 0, '0'), }) self._test_qrels('gov/trec-web-2002/named-page', count=170, items={ 0: TrecQrel('1', 'G00-04-3805407', 1, '0'), 9: TrecQrel('8', 'G00-90-0514219', 1, '0'), 169: TrecQrel('150', 'G00-48-3849824', 1, '0'), }) 
self._test_qrels('gov/trec-web-2003', count=51062, items={ 0: TrecQrel('1', 'G00-00-0681214', 0, '0'), 9: TrecQrel('1', 'G00-01-0682299', 0, '0'), 51061: TrecQrel('50', 'G45-99-1311180', 0, '0'), }) self._test_qrels('gov/trec-web-2003/named-page', count=352, items={ 0: TrecQrel('151', 'G02-86-0432155', 1, '0'), 9: TrecQrel('160', 'G29-09-0183573', 1, '0'), 351: TrecQrel('450', 'G44-36-3557956', 1, '0'), }) self._test_qrels('gov/trec-web-2004', count=88566, items={ 0: TrecQrel('1', 'G00-00-2869955', 0, '0'), 9: TrecQrel('1', 'G00-06-0365563', 0, '0'), 88565: TrecQrel('225', 'G45-68-2130931', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/gov2.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.gov2 import Gov2Doc from ir_datasets.formats import TrecQrel, TrecQuery, GenericQuery, TrecPrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestGov2(DatasetIntegrationTest): def test_gov2_docs(self): self._test_docs('gov2', items={ 0: Gov2Doc('GX000-00-0000000', 'http://sgra.jpl.nasa.gov', re.compile('^HTTP/1\\.1 200 OK\\\r\nDate: Tue, 09 Dec 2003 21:21:33 GMT\\\r\nServer: Apache/1\\.3\\.27 \\(Unix\\)\\\r\nLast\\-Modified: T.{45}6\\-3ca0cae9"\\\r\nAccept\\-Ranges: bytes\\\r\nContent\\-Length: 614\\\r\nConnection: close\\\r\nContent\\-Type: text/html\\\r\n$', flags=48), re.compile(b'^<html>\n\n<head>\n<title>\nJPL Sgra web Site\n\n\n\n\n\n
\n

\n\n\n\n
\nLast updated: Thu Sep 16 17:24:48 PDT 1999 \n
\n\n\n\n$', flags=16), 'text/html'), 9: Gov2Doc('GX000-00-0109156', 'http://gsbca2.gsa.gov', re.compile('^HTTP/1\\.1 200 OK\\\r\nServer: Netscape\\-Enterprise/3\\.6 SP3\\\r\nDate: Sun, 15 Jun 2003 01:36:47 GMT\\\r\nContent\\-t.{6}ext/html\\\r\nLast\\-modified: Wed, 16 Apr 2003 19:30:56 GMT\\\r\nContent\\-length: 5063\\\r\nAccept\\-ranges: bytes\\\r\n$', flags=48), re.compile(b'^\n\n\n \n\n\n\n$', flags=16), 'text/html'), }) def test_gov2_docstore(self): docstore = ir_datasets.load('gov2').docs_store() docstore.clear_cache() with _logger.duration('cold fetch'): docstore.get_many(['GX269-06-1933735', 'GX269-06-16539507', 'GX002-04-0481202']) with _logger.duration('warm fetch'): docstore.get_many(['GX269-06-1933735', 'GX269-06-16539507', 'GX002-04-0481202']) docstore = ir_datasets.load('gov2').docs_store() with _logger.duration('warm fetch (new docstore)'): docstore.get_many(['GX269-06-1933735', 'GX269-06-16539507', 'GX002-04-0481202']) with _logger.duration('cold fetch (nearby)'): docstore.get_many(['GX269-06-16476479', 'GX269-06-1939325', 'GX002-04-0587205']) with _logger.duration('cold fetch (earlier)'): docstore.get_many(['GX269-06-0125294', 'GX002-04-0050816']) def test_gov2_queries(self): self._test_queries('gov2/trec-tb-2004', count=50, items={ 0: TrecQuery('701', 'U.S. oil industry history', 'Describe the history of the U.S. oil industry', 'Relevant documents will include those on historical exploration and\ndrilling as well as history of regulatory bodies. Relevant are history\nof the oil industry in various states, even if drilling began in 1950\nor later.'), 9: TrecQuery('710', 'Prostate cancer treatments', 'What are the various treatments for prostate cancer?', 'Relevant cancer treatments include radiation therapy, radioactive\npellets, hormonal therapy and surgery. 
"Watchful waiting" is also\nconsidered relevant.'), 49: TrecQuery('750', 'John Edwards womens issues', "What are Senator John Edwards' positions on women's issues such as pay\nequity, abortion, Title IX and violence against women.", "Relevant documents will indicate Senator John Edwards' stand on issues\nconcerning women, such as pay parity, abortion rights, Title IX, and\nviolence against women. Lists of press releases are relevant when the\nheadlines show he is voting for or against bills on women's\nissues. Not relevant are Edwards' positions on issues not exclusively\nconcerning women."), }) self._test_queries('gov2/trec-tb-2005', count=50, items={ 0: TrecQuery('751', 'Scrabble Players', 'Give information on Scrabble players, when and where Scrabble is\nplayed, and how popular it has been.', "Give information on the social aspects of the game Scrabble. Scrabble\nplayers may be named or described as a group. Both real and fictional\nplayers are relevant. Mention of a scheduled Scrabble game is\nrelevant. Scrabble's popularity is relevant. An account of a\nparticular game is relevant. Descriptions of variants on the Scrabble\ngame are not relevant. Use of Scrabble tiles for other purposes are\nnot relevant. Scrabble software is not relevant unless there is\nmention of its users. Titles of Scrabble-related books (dictionaries,\nglossaries, rulebooks) are not relevant."), 9: TrecQuery('760', 'american muslim mosques schools', 'Statistics regarding American Muslims, mosques, and schools.', 'Relevant documents should provide some count or proportion of mosques,\nMuslim-affiliated schools, or population. With regard to population,\nspecific age groupings, sexes, or other categorizations are\nacceptable. The statistics can be pertinent to a specific geographic\narea, such as Fulton County, the state of California, or the\nNortheast. 
There is no restriction as to time period (for example\n2005 versus 1987).'), 49: TrecQuery('800', 'Ovarian Cancer Treatment', 'The remedies and treatments given to lesson or stop effects of ovarian\ncancer.', 'Relevant documents must include names of chemicals or medicines used\nto fight ovarian cancer. Studies of new treatments that are being\ntried are valid, even if they have not reached a conclusion as to\neffectiveness.'), }) self._test_queries('gov2/trec-tb-2006', count=50, items={ 0: TrecQuery('801', 'Kudzu Pueraria lobata', 'Describe the origin, nature, extent of spread and means of controlling\nkudzu.', 'Identification of kudzu as an invasive species with description of how\nit spreads and grows is relevant. A document which is simply a list\nheaded "invasive species" or "noxious weeds" including kudzu is not\nrelevant. A statement that kudzu is present in a specific location is\nnot relevant unless it relates to its spread. Features of kudzu such\nas its use as a treatment for alcoholism or its function as a haven\nfor plant pathogens describe its nature and are relevant.'), 9: TrecQuery('810', 'timeshare resales', 'Provide information regarding timeshare resales.', 'Relevant documents will include those describing the prospects of\nreselling a timeshare and the pitfalls one should be aware of when\nselling a timeshare. Real estate legislature regarding the resale of\ntimeshares is not relevant.'), 49: TrecQuery('850', 'Mississippi River flood', 'How frequently does the Mississippi River flood its banks?', 'Flooding is a relative term which implies water overflowing its\ncontainer and causing damage to the surrounding ares. Documents are\nrelevant if they describe Mississippi River events which are commonly\nconsidered to be floods. Relevant documents may also show how such\nevents have led to the introduction of controls to lessen the\nfrequency of damaging floods of this river. 
Relevant documents\ninclude different levels of flooding, not only the major ones.\nDocuments are not relevant if they are essentially forecasts or\nroutine reports of water levels. They are also not relevant if they\nare purely bibliographies or lists of sources for relevant documents.\nPhotos and videos of floods alone are not relevant.'), }) self._test_queries('gov2/trec-tb-2005/named-page', count=252, items={ 0: GenericQuery('601', 'metallurgy division world war history'), 9: GenericQuery('610', 'united states vs david j. kaiser transcript court appeal'), 251: GenericQuery('872', 'medical advisory committee memorandum a rule to exclude idet'), }) self._test_queries('gov2/trec-tb-2005/efficiency', count=50000, items={ 0: GenericQuery('1', 'pierson s twin lakes marina'), 9: GenericQuery('10', 'hotel meistertrunk'), 49999: GenericQuery('50000', 'senator durbin'), }) self._test_queries('gov2/trec-tb-2006/named-page', count=181, items={ 0: GenericQuery('901', 'CCAP advance case search'), 9: GenericQuery('910', 'HS project "It\'s not easy being green"'), 180: GenericQuery('1081', 'Colleges in PA'), }) self._test_queries('gov2/trec-tb-2006/efficiency', count=100000, items={ 0: GenericQuery('1', 'commissioner of revenue orange county virginia'), 9: GenericQuery('10', 'terrorism policies in history'), 99999: GenericQuery('100000', 'cervical flexion extension injury'), }) self._test_queries('gov2/trec-tb-2006/efficiency/10k', count=10000, items={ 0: GenericQuery('1', 'commissioner of revenue orange county virginia'), 9: GenericQuery('10', 'terrorism policies in history'), 9999: GenericQuery('10000', 'gdp of international business in bermuda'), }) self._test_queries('gov2/trec-tb-2006/efficiency/stream1', count=25000, items={ 0: GenericQuery('1', 'commissioner of revenue orange county virginia'), 9: GenericQuery('10', 'terrorism policies in history'), 24999: GenericQuery('25000', 'organized crime in columbus ohio'), }) 
# NOTE(review): continuation of TestGov2.test_gov2_queries — the remaining
# efficiency query streams and the Million Query (MQ) track topic sets.
        # streams 2-4 are consecutive 25k-topic slices of the 100k efficiency
        # set: stream2 covers ids 25001-50000, stream3 50001-75000,
        # stream4 75001-100000.
        self._test_queries('gov2/trec-tb-2006/efficiency/stream2', count=25000, items={
            0: GenericQuery('25001', 'pea ridge national park'),
            9: GenericQuery('25010', 'nylon concrete'),
            24999: GenericQuery('50000', 'mark wallace bush and u.n.'),
        })
        self._test_queries('gov2/trec-tb-2006/efficiency/stream3', count=25000, items={
            0: GenericQuery('50001', "dept veteran's affairs connecticut"),
            9: GenericQuery('50010', 'nuclear & missile cold war'),
            24999: GenericQuery('75000', 'the role and responsibilities of the u.s. senate'),
        })
        self._test_queries('gov2/trec-tb-2006/efficiency/stream4', count=25000, items={
            0: GenericQuery('75001', 'united states office of personel management'),
            9: GenericQuery('75010', 'percentage of youth tobacco smokers'),
            24999: GenericQuery('100000', 'cervical flexion extension injury'),
        })
        # MQ-2007 uses topic ids 1-10000; MQ-2008 continues with 10001-20000.
        # (topic text typos such as 'californa' are in the source data — do not "fix".)
        self._test_queries('gov2/trec-mq-2007', count=10000, items={
            0: GenericQuery('1', 'after school program evaluation'),
            9: GenericQuery('10', 'qualifications for a senator'),
            9999: GenericQuery('10000', 'californa mission'),
        })
        self._test_queries('gov2/trec-mq-2008', count=10000, items={
            0: GenericQuery('10001', 'comparability of pay analyses'),
            9: GenericQuery('10010', 'in in 2015 will the u.s military be fighting iran and north korea'),
            9999: GenericQuery('20000', 'manchester city hall'),
        })

    def test_gov2_qrels(self):
        """Spot-check qrels for the GOV2 TREC Terabyte datasets.

        Each call verifies the total judgment count and exact TrecQrel
        tuples at fixed positions (first, tenth, last) in iteration order.
        """
        self._test_qrels('gov2/trec-tb-2004', count=58077, items={
            0: TrecQrel('701', 'GX000-00-13923627', 0, '0'),
            9: TrecQrel('701', 'GX000-25-2008761', 1, '0'),
            58076: TrecQrel('750', 'GX272-82-4931834', 0, '0'),
        })
        self._test_qrels('gov2/trec-tb-2005', count=45291, items={
            0: TrecQrel('751', 'GX000-00-13125308', 0, '0'),
            9: TrecQrel('751', 'GX000-47-11993633', 0, '0'),
            45290: TrecQrel('800', 'GX272-48-8680401', 1, '0'),
        })
        self._test_qrels('gov2/trec-tb-2006', count=31984, items={
            0: TrecQrel('801', 'GX000-01-2722311', 0, '0'),
            9: TrecQrel('801', 'GX001-46-11521081', 1, '0'),
            31983: TrecQrel('850', 'GX272-67-14117174', 0, '0'),
        })
# NOTE(review): remainder of TestGov2.test_gov2_qrels, the gov2.py __main__
# guard, and the head of the next dump file (test/integration/hc4.py).
        self._test_qrels('gov2/trec-tb-2005/named-page', count=11729, items={
            0: TrecQrel('601', 'GX000-06-6013381', 1, '0'),
            9: TrecQrel('606', 'GX001-80-15356704', 1, '0'),
            11728: TrecQrel('872', 'GX270-03-12329248', 1, '0'),
        })
        # the 2005 efficiency qrels reuse the trec-tb-2005 judgments (same
        # count, 45291, and same doc ids) under remapped query ids.
        self._test_qrels('gov2/trec-tb-2005/efficiency', count=45291, items={
            0: TrecQrel('1192', 'GX000-00-13125308', 0, '0'),
            9: TrecQrel('1192', 'GX000-47-11993633', 0, '0'),
            45290: TrecQrel('49462', 'GX272-48-8680401', 1, '0'),
        })
        self._test_qrels('gov2/trec-tb-2006/named-page', count=2361, items={
            0: TrecQrel('901', 'GX123-98-3885901', 1, '0'),
            9: TrecQrel('902', 'GX078-80-12349004', 1, '1'),
            2360: TrecQrel('1081', 'GX136-71-9506712', 1, '0'),
        })
        # likewise, the 2006 efficiency qrels (and the stream3 subset, which
        # holds all judged topics) remap the trec-tb-2006 judgments.
        self._test_qrels('gov2/trec-tb-2006/efficiency', count=31984, items={
            0: TrecQrel('62937', 'GX000-01-2722311', 0, '0'),
            9: TrecQrel('62937', 'GX001-46-11521081', 1, '0'),
            31983: TrecQrel('71136', 'GX272-67-14117174', 0, '0'),
        })
        self._test_qrels('gov2/trec-tb-2006/efficiency/stream3', count=31984, items={
            0: TrecQrel('62937', 'GX000-01-2722311', 0, '0'),
            9: TrecQrel('62937', 'GX001-46-11521081', 1, '0'),
            31983: TrecQrel('71136', 'GX272-67-14117174', 0, '0'),
        })
        # MQ tracks use 5-field TrecPrel rows — presumably (relevance, method,
        # inclusion probability) per the MTC/statAP sampling; confirm against
        # ir_datasets.formats.TrecPrel.
        self._test_qrels('gov2/trec-mq-2007', count=73015, items={
            0: TrecPrel('10', 'GX253-98-16418961', 0, 1, 0.165904710473544),
            9: TrecPrel('10', 'GX225-79-9870332', 1, 1, 0.568969759822883),
            73014: TrecPrel('9999', 'GX237-19-4725226', 1, 1, 0.659520100607628),
        })
        self._test_qrels('gov2/trec-mq-2008', count=15211, items={
            0: TrecPrel('10002', 'GX037-06-11625428', 0, 1, 0.0031586555555558),
            9: TrecPrel('10032', 'GX010-65-7921994', 0, 1, 0.00137811889937823),
            15210: TrecPrel('19997', 'GX257-71-11550035', 0, 1, 0.00107614156144011),
        })
if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/hc4.py ================================================ import unittest from ir_datasets.formats import ExctractedCCDoc, ExctractedCCQuery, TrecQrel from .base import DatasetIntegrationTest class
TestHC4(DatasetIntegrationTest): def test_hc4_zh_docs(self): self._test_docs('hc4/zh', count=646305, items={ 0: ExctractedCCDoc(doc_id='a2b32b07-a008-438a-a8c3-b8331a1e306d', title='云南网,云南省重点新闻网站,世界看云南的窗口,云南看世界的视角', text='今年夏季,一片浪漫花海在抚仙湖边横空出世,既美化了抚仙湖,又为澄江的旅游带来了旺盛的人气。', url='http://www.yunnan.cn', time=None, cc_file='crawl-data/CC-NEWS/2016/08/CC-NEWS-20160826124520-00000.warc.gz'), 20000: ExctractedCCDoc(doc_id='407c5f73-0a81-4da7-8294-496c8dc7c493', title='學測指考一例一休?勞動部:不影響', text='我要評比\n\n一例一休上路,大考中心認為學測、指考均利用周末舉行,恐無法負擔加班費衍生成本,考慮調漲報名費。勞動部對此回應「不受影響」,監考人員受僱於大考中心,監考天數若未達連續6天,就不會產生休息日加班費的問題。\n\n至於入闈出題者,勞動部指出,依勞動部7休1函釋,入闈者經勞資雙方合意,可挪移例假,最多可以連上12天班,不算違法,也可以安排休息日。\n\n(中時)', url='http://www.chinatimes.com/realtimenews/20170109005061-260405', time='2017-01-09T17:25:51+08:00', cc_file='crawl-data/CC-NEWS/2017/01/CC-NEWS-20170109103449-00080.warc.gz'), 646304: ExctractedCCDoc(doc_id='5eaf1d5b-1591-4b42-9e6f-2c878b721d5e', title='六福村徵鬼!當鬼王陪玩笑傲飛鷹 讓你選擇笑死還是嚇死', text='▲應徵上鬼王還能陪遊客一起搭乘笑傲飛鷹。(圖/六福村提供,下同)\n\n\n\n記者陳凱力/新竹報導\n\n中國人怕鬼,西洋人也怕鬼,全世界的人都怕鬼!鬼門要關了,但六福村主題遊樂園不關門,迎接即將到來的萬聖節,六福村下達急急如律令,號召全台敢秀、愛熱鬧、腦中還有1000種大膽、充滿創意、千奇百怪嚇朋友鬼點子的「鬼王」,一起到村子裡來群魔亂舞開趴吧!\n\n請繼續往下閱讀...\n\n▲六福村開出100名鬼缺。\n\n\n\n徵的是鬼!徵的是鬼!六福村徵的就是鬼!猛鬼團隊限額招募100個名額,教你畫上血淋淋、恐怖指數破表的的鬼妝穿梭在園區,工作內容就是要把遊客嚇得「吱吱叫」,讓園區充滿高分貝驚聲尖叫,一邊當鬼王還能暢玩遊樂設施,像是鬼王陪遊客搭上宛如屍速列車的笑傲飛鷹,神出鬼沒鬼吼軌叫,從白天惡靈狂歡派對到夜晚最恐怖的萬聖遊行惡靈夜驚魂,整個園區都是鬼王舞台,絕對是北台灣下半年度最恐怖又最有趣的打工。\n\n六福村行銷經理黃淑芳說,六福村連續三年推出的萬聖節慶典活動「墓碑鎮」令遊客十分驚艷,面對今年即將到來的萬聖節,延續好評投入更多心力物力準備,猛鬼們將會出沒在10月1日起跑的六福萬聖節慶,滿滿一個月的主題活動,而這項「鬼」工作適合鬼靈精怪的朋友來挑戰,經審查錄取後,將有專業特殊化妝老師教你如何變成一副鬼模樣,並安排豐富的肢體訓練、驚嚇訓練等課程,工作期間更提供免費餐點。園方指出,通過履歷第一階段的朋友將接獲面試邀請,接受「驚嚇測試」項目包含情境演技、角色投入和劇本演繹等測試。\n\n▲100名猛鬼們將會出沒在10月1日起跑的六福萬聖節慶。\n\n這是一份擁有充分舞台的工作,戲劇系、表演科系學生將優先錄取,通過履歷第一階段的朋友將接獲面試邀請,接受「驚嚇測試」項目包含情境演技、角色投入和劇本演繹等測試,擁有其他特殊專長如體操、特技更是加分條件。你,準備好要挑戰當鬼王了嗎?我們不見不散!\n\n更多活動詳情及優惠,歡迎至六福村官網 、六福村官方線上購票 、FB粉絲專頁或猛鬼人力銀行網站查詢。', url='https://www.ettoday.net/news/20190828/1523501.htm', time='2019-08-28T15:00:00', cc_file='crawl-data/CC-NEWS/2019/08/CC-NEWS-20190828113707-01025.warc.gz') }) def 
test_hc4_fa_docs(self): self._test_docs('hc4/fa', count=486486, items={ 0: ExctractedCCDoc(doc_id='9064520f-bc4d-4118-a30e-7d99f5adc612', title='گشن\u200c شاخ پارسی در گردباد تعصب تباری', text='پذيرش > دیدگاه > گشن\u200c شاخ پارسی در گردباد تعصب تباری\n\nزبان و ادبیات پارسی یک هویت فرهنگی است. مرز هویت فرهنگی را نمی\u200cتوان با مرزهای سیال سیاسی و تباری اشتباه گرفت. فرهنگ تعریفش را نه از سیاست می\u200cگیرد و نه از تبار و قبیله. فرهنگ مجموعه داشته\u200cهایی است خیلی فراتر از این مقوله\u200cهای تقلیل\u200cگرا. این سیاست و تبار است که در فرایند تاریخ و تحول، در درون فرهنگ شکل می\u200cگیرند. بنابراین، سیاست و تبار برساخته\u200cهای فرهنگ اند و نه عکس آن. از این\u200cرو، نمی\u200cتوان هیچ سیاست و تبار خاصی را از میان چندین خرده\u200cفرهنگ و کتله\u200cهای تباری برساخته\u200cشده، وارث اصلی و یگانه\u200cی فرهنگ مادر خواند. سهمیه\u200cبندی این میراث بر مبنای تفکیک "خلف" از "ناخلف"، چیزی نیست جز یک انتحار فرهنگی و سلاخی پیکر واحد و تنومند فرهنگ. انتحار فرهنگی صرفن می\u200cتواند از دست یک سیاست فاشیستی و عصبیت کور تباری و قبیلوی برآمده باشد! وارث به\u200cحق و شایسته\u200cی فرهنگ مادر در هر زمانی، صرف نظر از نسبت تباری و قبیلوی و سیاسی، کسانی یا جمعی می\u200cتوانند بود که در جهت استمرار و تعالی آن فرهنگ کاری کرده و خون دلی خورده باشند. 
لذا، این\u200cکه شخصی با یک روی\u200cکرد کماکان ملاعمری از یک آدرس فرهنگی آمده می\u200cگوید من در انتخابات گذشته به فلانی رأی ندادم اما اگر بسمدانی کاندید می\u200cکرد، به او رأی می\u200cدادم، مگر جز پیروی از یک سیاست فاشیستی و حیوانیت تباری، چیز دیگری را هم می\u200cتواند به نمایش گذارد؟ آقا، تویی که سنگ میراث\u200cداری از کلیت زبان و ادبیات پارسی در یک حوزه\u200cی تمدنی را به سینه می\u200cزنی، اگر خوره\u200cی حیوانیت تباری در مغز جانت خانه نکرده، دیگر این جیره\u200cبندی\u200cها به چی معنا؟ اصلن این یعنی چی که داد از زبان و ادبیات پارسی می\u200cزنیم، اما حرکات و رفتارمان نشان از یک عصبیت افراطی ناسیونالیستی دارند و در حقیقت، در سمت تکه-پاره شدن این پارسی گام بر می\u200cداریم؟ این نعل وارونه دیگر به چی منظور که بر گرده\u200cی یک حوزه تمدنی بزرگ، تعصب تباری\u200cمان را سوار می\u200cکنیم که باری اگر حرف بر سر حقانیت مسأله باشد، این گل سرسبد تبار شاید کم\u200cترین سهم و نقش را در دامن قند پارسی داشته؟ (نابغه\u200cی دیگری از این\u200cسو در یک شبه\u200cتوطئه\u200cی ناسیونالیستی و ضد فرهنگی رفته بابت تمام بی\u200cفرهنگی\u200cهای مهاجرین افغانستانی در ایران(!!) از آن کشور معذرت تاریخی خواسته!!)\n\n\n\nفرهنگی که امروزه یگانه مشخصه و معرف آن زبان و ادبیات دیرینه\u200cسال پارسی است، حوزه\u200cی تمدنی\u200cیی را شامل می\u200cشود که اگر این مرزکشی\u200cهای سیاسی چند سال پسین را در نظر بگیریم افغانستان، تاجیکستان و ایران به یک اندازه در آن سهم دارند و در درون هریک از این کشورها نیز خرده\u200cفرهنگ\u200cها و تبارهایی زیست دارند که هیج\u200cیک بر دیگری ارجحیت و برتری ندارند تا به تنهایی "وارث خلف" آن حوزه\u200cی تمدنی و تاریخش قلمداد شود، جز آنی که در راستای بقا و بالندگی این حوزه\u200cی تمدنی و افتخاراتش، کار و پیکار کرده اند. حوزه\u200cی تمدنی زبان و ادبیات پارسی، ملک طلق هیچ سیاست و تبار یگانه و خاصی نیست! یک خاوری ساکن مشهد همانقدر می\u200cتواند وارث این حوزه\u200cی تمدنی و افتخاراتش باشد که یک ایرانی ساکن تهران و... اساس اشتراک این خرده\u200cفرهنگ\u200cها و تبارها در درون این حوزه\u200cی تمدنی واحد، زبان و ادبیات تاریخی\u200cشان است. 
فقط زبان و ادبیات در این معنا می\u200cتواند برای هریک از این خرده\u200cهویت\u200cهای شناور و سرگردان، یک هویت واحد باشد و به تعبیر هایدگر: «خانه\u200cی وجودشان». بلی، برای تصاحب شدن و مصادره کردن پارسی که هرگز تصاحب پذیر نمی\u200cتواند بود، نه سکه\u200cی رنگ\u200cرفته\u200cی تبار ارزشی دارد و نه سیاست\u200cهای فاشیستی هتلروار! فقط یک هویت واحد (زبان) است که می\u200cتواند به قول ویتگنشتاین: «مرزهای جهان\u200cمان را تعیین کند» و بس!\n\nبیایید با عصبیت\u200cهای زشت و عق\u200cآور تباری و سیاسی، دامن فرهنگ گران\u200cسنگ پارسی را لکه\u200cدار نکنیم. وگرنه آن بزرگانی که از جان و زندگی\u200cشان مایه گذاشته این کاخ بلند را بنا گذاشته اند و ما امروزه به طرز بی\u200cشرمانه\u200cیی آن\u200cها را نیز به نفع جنون مبتذل تباری خود مصادره می\u200cکنیم، بر روی\u200cمان تف خواهند انداخت!\n\nبه امید بقا و درخشش مدام پارسی و شرمنده باد آنانی که با انتحار فرهنگی\u200cشان می\u200cخواهند این کاخ بلندمان را ویران و کلبه\u200cی حقیرانه\u200cی تباری و سیاسی\u200cشان را بر آوار آن بنا کنند!', url='http://kabulpress.org/article239985.html', time=None, cc_file='crawl-data/CC-NEWS/2016/08/CC-NEWS-20160826124520-00000.warc.gz'), 20000: ExctractedCCDoc(doc_id='67d086f7-0d09-44e5-b3b8-92caf7930827', title='ورزشکاران حرفه\u200cای در مردم شوق ورزش ایجاد کنند', text='یزد ـ استاندار یزد با اشاره به اینکه مردم به ورزشکاران علاقمند هستند و به آنها توجه نشان می دهند، گفت: شایسته است ورزشکاران حرفه\u200cای در مردم شوق ورزش ایجاد کنند.\n\nبه گزارش خبرنگار مهر، سیدمحمد میرمحمدی در مراسم افتتاحیه شانزدهمین صعود بین المللی کوهنوردان سراسر کشور به شیرکوه در سالن همایش های هلال احمر استان یزد اظهار داشت: مردم از ورزشکاران حرفه ای و محبوب الگوگیری می کنند و خوب است ورزشکاران از این امر به عنوان یک فرصت برای جذب مردم به ورزش استفاده کنند.\n\nوی با اشاره به اینکه حضور کوهنوردان نخبه کشور در یزد در مردم شور و شعف ایجاد می کند، افزود: این حضور و همدلی ها در مردم برای انجام ورزش اشتیاق ایجاد می کند.\n\nاستاندار یزد با شاره به اینکه همواره از گذشته بر این موضوع که عقل سالم در بدن سالم است، تاکید شده است، عنوان کرد: اگر 
مردم را به ورزش کردن و تحرک تشویق کنیم به طور قطع از بار مشکلات و بیماری های موجود در جامعه کاسته می شود ضمن اینکه مردم انگیزه و نشاط بیشتری برای ادامه زندگی خواهند داشت.\n\nمیرمحمدی با بیان اینکه از ورزش کوهنوردی لذت می برم، افزود: یکی از زیبایی\u200cها و جذابیت\u200cهای کوهنوردی صعود است و در صعود روح تعالی وجود دارد.\n\nوی خاطرنشان کرد: اخلاق ورزشی در ورزش کوهنوردی شکل و صورت دیگری دارد زیرا اخلاق ورزشی از همین همدلی ها و دور هم بودن ها ناشی می شود.\n\nاستاندار یزد از حضور قهرمانان بین المللی نظیر عظیم قیچی\u200cساز، همچنین هشت نفر دیگر از کوهنوردان بین المللی و آقایان عطری و گلشن نیا قهرمانان کوهنوردی استان یزد در این همایش ابراز خرسندی کرد و افزود: حضور ۴۰۰ کوهنورد حرفه ای از سراسر کشور افتخاری برای استان یزد است.\n\nوی اظهار امیدواری کرد: سفر به استان یزد و صعود به شیرکوه خاطرات خوشایندی برای کوهنوردان کشور از یزد بر جای بگذارد.\n\nبه گزارش مهر، در این مراسم از کوهنوردان برتر استان یزد تجلیل شد و از سوی رئیس فدراسیون کوهنوردی نیز پرچمی به استاندار یزد اهدا شد.', url='http://www.mehrnews.com/news/3895020/%D9%88%D8%B1%D8%B2%D8%B4%DA%A9%D8%A7%D8%B1%D8%A7%D9%86-%D8%AD%D8%B1%D9%81%D9%87-%D8%A7%DB%8C-%D8%AF%D8%B1-%D9%85%D8%B1%D8%AF%D9%85-%D8%B4%D9%88%D9%82-%D9%88%D8%B1%D8%B2%D8%B4-%D8%A7%DB%8C%D8%AC%D8%A7%D8%AF-%DA%A9%D9%86%D9%86%D8%AF', time='2017-02-02T08:04:10+00:00', cc_file='crawl-data/CC-NEWS/2017/02/CC-NEWS-20170202041952-00044.warc.gz'), 486485: ExctractedCCDoc(doc_id='40ab1e8e-a937-4c9e-89a9-9495a021e47e', title='بهره\u200cبرداری از طرح آبرسانی تهران به سد تنظیمی کرج تا خرداد ۹۹', text='به گزارش خبرگزاری مهر به نقل از وزارت نیرو، سید حسن رضوی در حاشیه بازدید از تصفیه\u200cخانه ششم تهران از افزایش سرعت عملیات اجرایی کانال سرریز تصفیه\u200cخانه ششم پایتخت خبر داد و اظهار داشت: به\u200cدنبال رفع موانع اجرایی کانال سرریز تصفیه\u200cخانه ششم تهران واقع در اراضی لتمال کن، از جمله پرداخت هزینه باقی\u200cمانده و تغییر پیمانکار مربوطه، عملیات اجرایی این بخش که حدود ۱۵۰۰ متر کانال بتنی با مقطع پنج متر در پنج متر است، سرعت بیشتری به خود گرفت.\n\nوی افزود: بعد از شروع 
عملیات اجرایی از تیر ماه امسال در محدوده یاد شده، ۶۰ هزار متر مکعب خاکبرداری و ۱۱۰۰ متر مکعب بتن\u200cریزی انجام شد و انتظار می\u200cرود با افزایش سرعت کار، تا پایان سال جاری بخش عمده کانال مذکور اجرا و وارد مرحله آب\u200cاندازی شود.\n\nمدیرعامل شرکت آب منطقه\u200cای تهران افزود: طرح آبرسانی به تهران از سد تنظیمی کرج در مجموع حدود ۹۶ درصد پیشرفت دارد که با تدابیر اتخاذ شده و با تزریق به\u200cموقع منابع مالی تا خرداد ماه سال آینده به مرحله بهره\u200cبرداری می\u200cرسد.\n\nگفتنی است، کانال سرریز تصفیه\u200cخانه ششم تهران، بخش انتهایی طرح آبرسانی به تهران از سد تنظیمی امیرکبیر است که سرریز تصفیه\u200cخانه ششم تهران را به رودخانه کن هدایت می\u200cکند. بهره\u200cبرداری از تونل انتقال آب کرج – تهران و تصفیه\u200cخانه ششم، منوط به تکمیل این کانال است.', url='https://www.mehrnews.com/news/4704991/%D8%A8%D9%87%D8%B1%D9%87-%D8%A8%D8%B1%D8%AF%D8%A7%D8%B1%DB%8C-%D8%A7%D8%B2-%D8%B7%D8%B1%D8%AD-%D8%A2%D8%A8%D8%B1%D8%B3%D8%A7%D9%86%DB%8C-%D8%AA%D9%87%D8%B1%D8%A7%D9%86-%D8%A8%D9%87-%D8%B3%D8%AF-%D8%AA%D9%86%D8%B8%DB%8C%D9%85%DB%8C-%DA%A9%D8%B1%D8%AC-%D8%AA%D8%A7-%D8%AE%D8%B1%D8%AF%D8%A7%D8%AF-%DB%B9%DB%B9', time='2019-08-28T10:56:28+00:00', cc_file='crawl-data/CC-NEWS/2019/08/CC-NEWS-20190828113707-01025.warc.gz') }) def test_hc4_ru_docs(self): self._test_docs('hc4/ru', count=4721064, items={ 0: ExctractedCCDoc(doc_id='b899e0e9-61e6-40e9-b9e1-1f19cd2559c6', title='IE 6 style', text='Сегодня мы отмечаем один из главных праздников Республики, знамен', url='http://ugo-osetia.ru/index.php/joomla-pages-iii/category-list/23-ie-6-style', time=None, cc_file='crawl-data/CC-NEWS/2016/08/CC-NEWS-20160826124520-00000.warc.gz'), 20000: ExctractedCCDoc(doc_id='c5481f83-c357-4ce5-9570-5d74a8efc442', title='Киев кошмарит итальянских политиков из-за намеченной ими поездки в Крым', text='Члены региональных советов нескольких областей Италии, которые запланировали поездку в Крым, получили письма с угрозами от консульства Украины. 
Об этом журналистам сообщили итальянские политики.\n\n«Письмо было направлено губернаторам. В тексте указывалось, что наши советники нарушают статью уголовного кодекса Украины со всеми вытекающими последствиями. Настоящие угрозы. Это неприемлемо, чтобы третья страна вмешивалась в политическую деятельность представителей другого государства», — заявил советник регионального совета Венето Лучано Сандона (в этот регион входят такие крупные города, как Венеция, Падуя, Верона и Виченца). По словам Сандоны, кроме политиков из региональных советов Венето, Ломбардии, Лигурии, Эмилии-Романьи и Тосканы в делегацию вошли также крупные предприниматели Италии.\n\nНапомним: многочисленная делегация итальянских депутатов и бизнесменов намерена посетить Крым 13—16 октября. Делегаты планируют ознакомиться с бизнес-условиями и инвестиционными возможностями полуострова.', url='http://anna-news.info/node/65960', time=None, cc_file='crawl-data/CC-NEWS/2016/10/CC-NEWS-20161011083804-00021.warc.gz'), 4721063: ExctractedCCDoc(doc_id='39000c4d-17ea-4b71-9859-09fd60c866c9', title='Три экстремала прощались с летом, прыгая с моста в центре Петербурга', text='Школьники прыгали в воду, проделывая в воздухе различные кульбиты.\n\nУтром 28 августа на Красном мосту в Петербурге заметили трех молодых людей. На записи, которую сделал один из пользователей интернета, видно, как школьники прыгают с высоты в воду, проделывая в воздухе различные кульбиты.\n\nОдин из юношей, прыгая с моста в реку Мойку, кричал: "Света, прощай!". Его примеру последовали два товарища, которые разделись до трусов и бесстрашно кинулись в воду.\n\nАвтор ролика предположил, что таким образом молодые люди прощались с летом. 
Другой в ответ написал, что "с мозгами они попрощались чуть раньше".\n\nРанее мы писали, что после падения на рельсы на станции метро "Гражданский проспект" женщина осталась без рук.\n\nВидео: ВКонтакте / Признавашки ДТП и ЧП Санкт-Петербург', url='https://piter.tv/event/Tri_molodih_cheloveka_prigali_s_Krasnogo_mosta_v_Mojku/', time=None, cc_file='crawl-data/CC-NEWS/2019/08/CC-NEWS-20190828113707-01025.warc.gz') }) def test_hc4_zh_query(self): self._test_queries('hc4/zh/train', count=23, items={ 0: ExctractedCCQuery(query_id='1001', title='Extremist covert communication advancements', description='How are extremist groups advancing covert communication capabilities with technology?', ht_title='极端分子隐匿通讯的进步', ht_description='极端分子组织如何利用科技来发展隐匿通讯能力?', mt_title='Extremist covert communication advancements', mt_description='EXtremist groups advert communication capabwith technology?', narrative_by_relevance={'very_valuable': '', 'somewhat_valuable': '', 'not_that_valuable': '', 'non_relevant': ''}, report='Even though older communication methods like radio are still used, the revolution in communication technology over the past 10–15 years has dramatically changed how terrorist organizations communicate. E-mails, fax transmissions, websites, cell phones, and satellite telephones have made it possible for organizations to contemplate a global strategy. However, too great a reliance on this new technology leaves organizations vulnerable to sophisticated monitoring of communication and triangulation of its source. 
When Osama bin Laden found out that his satellite phone conversations were being intercepted, he ceased using this method to communicate.', report_url='https://en.wikipedia.org/w/index.php?title=Tactics_of_terrorism&oldid=834204722#Communications', report_date='2018-04-04', translation_lang='zh'), 3: ExctractedCCQuery(query_id='1004', title='Urban fire with multiple fatalities', description='Details on large urban fires leading to multiple lives lost', ht_title='多人死亡的城市火灾', ht_description='大型城市火灾导致多人死亡的细节', mt_title='城市火灾,有多种脂肪', mt_description='关于大量城市火灾的细节,以减少损失', narrative_by_relevance={'very_valuable': '', 'somewhat_valuable': '', 'not_that_valuable': '', 'non_relevant': ''}, report="A conflagration is a large and destructive fire that threatens human life, animal life, health, and/or property. It may also be described as a blaze or simply a (large) fire. A conflagration can begin accidentally, be naturally caused (wildfire), or intentionally created (arson). Arson can be for fraud, murder, sabotage or diversion, or due to a person's pyromania. A very large fire can produce a firestorm, in which the central column of rising heated air induces strong inward winds, which supply oxygen to the fire. 
Conflagrations can cause casualties including deaths or injuries from burns, trauma due to collapse of structures and attempts to escape, and smoke inhalation.", report_url='https://en.wikipedia.org/w/index.php?title=Conflagration&oldid=872678379', report_date='2018-12-08', translation_lang='zh') }) self._test_queries('hc4/zh/dev', count=10, items={ 4: ExctractedCCQuery(query_id='6', title="Gibraltar's Sovereignty After Brexit", description="How will Gibraltar's sovereignty be impacted by Brexit negotiations between Spain and the UK?", ht_title='直布罗陀在脱欧以后主权的情况', ht_description='西班牙与英国之间的脱欧谈判对直布罗陀主权有什么影响?', mt_title='Gustar的累累累累累累累累累累累累累累累累累累累累累', mt_description='在英国和英国的Brexit negotiations中,G é altar的soverignty将如何受到影响?', narrative_by_relevance={'very_valuable': 'Very valuable documents discuss the negotiations between the UK and Spain specifically regarding the status of Gibraltar, and the different possible scenarios for Gibraltar. Very valuable information also includes what residents of Gibraltar prefer, proposed plans between the UK and Spain for Gibraltar, and a brief history of the disagreement between the UK and Spain over Gibraltar and the main areas of disagreement specifically regarding Brexit.', 'somewhat_valuable': "Somewhat valuable information references Gibraltar in the UK's Brexit negotiations and that the UK has a disagreement with Spain, but doesn't go into detail about it and doesn't mention what the implications are for Gibraltar if the UK exits the EU.", 'not_that_valuable': 'Information discussed Brexit negotiations between the UK and EU, but did not mention the issue of Gibraltar or disagreements with Spain.', 'non_relevant': 'Gibraltar was mentioned in regards to Brexit, but no details were provided and it was not the main part of the article.'}, report='Sovereignty\nSee also: Disputed status of Gibraltar\n\nThe day after the result, Spain\'s acting Foreign Minister, José Manuel García-Margallo, renewed calls for joint Spanish–British control 
of the peninsula. These calls were strongly rebuffed by Gibraltar\'s Chief Minister. After the result Spain reiterated its position that it wanted to jointly govern Gibraltar with the United Kingdom and said it would seek to block Gibraltar from participating in talks over future deals between the UK and EU.\n\nIn April 2017, British Prime Minister Theresa May reiterated that "the UK would seek the best possible deal for Gibraltar as the UK exits the EU, and there would be no negotiation on the sovereignty of Gibraltar without the consent of its people.” \n\nIn April 2018, Spanish Foreign Minister Alfonso Dastis announced that Spain hoped to sign off a bilateral agreement with Britain over Gibraltar before October so as not to hinder a Brexit transition deal. Talks between London and Madrid had progressed well. While reiterating the Spanish long-term aim of "recovering" Gibraltar, he said that Spain would not hold Gibraltar as a "hostage" to the EU negotiations. ', report_url='https://en.wikipedia.org/w/index.php?title=Effect_of_Brexit_on_Gibraltar&oldid=870688779', report_date='2018-11-26', translation_lang='zh') }) self._test_queries('hc4/zh/test', count=50, items={ 0: ExctractedCCQuery(query_id='102', title='Shipwrecks and Historical Chinese Trade', description='What information about trade and shipping has been discovered by investigating historical shipwrecks of Chinese vessels?', ht_title='沉船与中国古代贸易', ht_description='通过调查中国历史中的沉船事故,发现了哪些古代贸易和航运信息?', mt_title='Shewrecks and Historical Chinese Trade', mt_description='在中国船舰的高压造船厂中,有哪些关于贸易和航运的信息被破坏了?', narrative_by_relevance={'very_valuable': 'There were two reports found with very valuable information detailed the specific types of cultural relics found on a Song Dynasty shipwreck found in the South China Sea, which indicated porcelain and iron trades with foreign nations during that time period. 
', 'somewhat_valuable': 'These was one report briefly mentioned some cosmetic boxes found in a Chinese historical shipwreck that suggest possible foreign trades. ', 'not_that_valuable': 'N/A', 'non_relevant': 'There were multiple articles found with reports on cultural relics found on various Chinese historical shipwrecks. But with no mention of trades. '}, report='The Chinese had wide connections through trade in Asia and had been sailing to Arabia, East Africa, and Egypt since the Tang Dynasty (AD 618–907). Between 1405 and 1421 the third Ming emperor Yongle sponsored a series of long range tributary missions in the Indian Ocean under the command of admiral Zheng He (Cheng Ho).\n\nA large fleet of new junk ships was prepared for these international diplomatic expeditions. The largest of these junks—that the Chinese termed bao chuan (treasure ships)—may have measured 121 metres (400 feet) stem to stern, and thousands of sailors were involved. The first expedition departed in 1405. At least seven well-documented expeditions were launched, each bigger and more expensive than the last. The fleets visited Arabia, East Africa, India, Malay Archipelago and Thailand (at the time called Siam), exchanging goods along the way. They presented gifts of gold, silver, porcelain and silk; in return, received such novelties as ostriches, zebras, camels, ivory and giraffes. After the emperor\'s death, Zheng He led a final expedition departing from Nanking in 1431 and returning to Beijing in 1433. It is very likely that this last expedition reached as far as Madagascar. The travels were reported by Ma Huan, a Muslim voyager and translator who accompanied Zheng He on three of the seven expeditions, his account published as "Ying-Yai Sheng-Lam" (Overall Survey of the Ocean\'s Shores) (1433).\n\nThese long distance journeys were not followed up, as the Chinese Ming dynasty retreated in the haijin, a policy of isolationism, having limited maritime trade. 
Travels were halted abruptly after the emperor\'s death, as the Chinese lost interest in what they termed barbarian lands turning inward, and successor emperors felt the expeditions were harmful to the Chinese state; Hongxi Emperor ended further expeditions and Xuande Emperor suppressed much of the information about Zheng He\'s voyages.', report_url='https://en.wikipedia.org/w/index.php?title=Age_of_Discovery&oldid=858657320', report_date='2018-09-08', translation_lang='zh'), }) def test_hc4_fa_query(self): self._test_queries('hc4/fa/train', count=8, items={ 0: ExctractedCCQuery(query_id='1008', title='French protest fuel taxes', description='Information on protests in France against rising fuel taxes and possible outcomes of the protests.', ht_title='اعتراض فرانسه به مالیات سوخت', ht_description='اطلاعات مربوط به اعتراضات در فرانسه علیه افزایش مالیات سوخت و نتایج احتمالی اعتراضات', mt_title='اعتراض فرانسه به مالیات سوخت', mt_description='اطلاعات در مورد اعتراضات در فرانسه در برابر افزایش مالیات سوخت و نتایج احتمالی اعتراضات.', narrative_by_relevance={'very_valuable': '', 'somewhat_valuable': '', 'not_that_valuable': '', 'non_relevant': ''}, report='After Macron was inaugurated on 15 May 2017, there were numerous risks from Labour unions about the prospect of a large organized protest. The CGT Union has attempted numerous times to organise a large-scale demonstration against Macron with one currently taking place on September 12, 2017. Macron has actively tried to prevent this by opening Labor code reform negotiations with trade unions. The reception among the unions has been mixed with the head of the FO union supporting the negotiations, the CFDT deciding to stay neutral, not participating in the September 12th protests and the CGT denouncing the negotiations alongside its ally SUD. Jean-Luc Melenchon from La France Insoumise has spoken in support of the September 12th protest encouraging members to attend. 
Melenchon himself organized a protest on July 12, 2017.\n\nPresident Donald Trump\'s state visit to France during Bastile Day was met with protests, protesters gathered around Place de la République to create a "No Trump Zone". Protesters were reportedly protesting about the Trump visit and Macron\'s policies with the ranks of the protesters being made up of socialists, pro-Palestinian groups, migrants’ rights activists, environmentalists and anti-fascists. Despite mass protests, 59% of French people approve of Trump\'s visit.\n\nFollowing Prime Minister Edouard Philippe\'s announcement of the plans for immigration reform, a small protest was led by a group of LGBT activists in Paris holding up a sign reading "Macron starves migrants, queers without borders"\n\nA series of protests by wine producers in the South of France have been ongoing since Francois Hollande\'s presidency. These demonstrations generally involve arson, sabotage and assault. These protests are caused by the importation of wine rather than buying it from French producers and the loss of culture. These protests have led to a 25% decrease in sales for Spanish wine producers. Spanish tankers transporting wine are usually the target of these attacks.\n\nPro-Palestinan protesters began to demonstrate against Macron offering Israel Prime Minister Netanyahu a place at the Paris Holocaust Ceremony. The French Communist Party also opposed Netanyahu\'s visit. The organizers of the protest were unknown but Le Muslim Post, a religious radioshow promoted the demonstration, encouraging listeners to attend. 
', report_url='https://en.wikipedia.org/w/index.php?title=Protests_against_Emmanuel_Macron&oldid=801910503', report_date='2018-09-22', translation_lang='fa') }) self._test_queries('hc4/fa/dev', count=10, items={ 0: ExctractedCCQuery(query_id='1', title='Asteroids Endangering Earth', description='Articles related to asteroids that pose danger of impact to Earth.', ht_title='سیارک ها\xa0کره زمین را به خطر می اندازند', ht_description='مقالات مربوط به سیارک ها که خطر ضربه به کره زمین را دارند', mt_title='سیارات در معرض خطر زمین', mt_description='مقالات مربوط به سیارک \u200c هایی که خطر برخورد با زمین را تهدید می \u200c کنند.', narrative_by_relevance={'very_valuable': 'Mention of asteroids striking Earth with description', 'somewhat_valuable': 'Mention of asteroids striking Earth but with little information', 'not_that_valuable': 'May talk about asteroids without real mention of their danger', 'non_relevant': 'Describes meteorites, planets, stars with no relation to striking Earth'}, report='2011 UL21 briefly had about a 1 in a million chance of impacting in 2029. Its cumulative impact probability dropped to 1 in 71 million by 2 November 2011 when the observation arc reached 15 days. It was removed from the Sentry Risk Table on 4 November 2011 when all impact scenarios for the next 100 years or more were ruled out. During 2029, the closest approach to Earth is 1.6 AU. Palomar Observatory precovery images from 1989 and 1990 have extended the observation arc to 22 years. Its next notable close approach to the Earth will be on June 27, 2024 at a distance of 0.044 AU (6,600,000 km; 4,100,000 mi).\n\nWith an absolute magnitude of 15.8, it is one the brightest and therefor largest potentially hazardous asteroids (PHA) detected since (242450) 2004 QY2. The next largest PHA (based on absolute magnitude) discovered in 2011 is 2011 WO41 with an absolute magnitude of 16.8. 
', report_url='https://en.wikipedia.org/w/index.php?title=(415029)_2011_UL21&oldid=877055001', report_date='2019-01-06', translation_lang='fa') }) self._test_queries('hc4/fa/test', count=50, items={ 0: ExctractedCCQuery(query_id='103', title='African Extremist School Girls Kidnapping', description='What is known about instances of extremist or terrorist groups kidnapping girls from schools in Africa?', ht_title='ربودن دختران دانش\u200cآموز توسط گروه های افراطی آفریقایی', ht_description='راجع به وقوع ربودن دختران دانش\u200cآموز توسط گروه های افراطی یا تروریستی از مدرسه ها در آفریقا چی درک می شود؟', mt_title='دختران مدرسه افراطی آفریقایی Kidnapping', mt_description='در مورد نمونه \u200c هایی از گروه \u200c های افراطی یا گروه \u200c های تروریستی که دختران را از مدارس آفریقا می \u200c ربایند چه می \u200c توان گفت ؟', narrative_by_relevance={'very_valuable': "Details about kidnapping operations by Boko Haram in Nigerian schools (victims often drugged, taken from schools overnight), numbers of victims, affected locations; Details about international responses including that of UNICEF; reports on Nigerian domestic response; information on the terrorist group's motives (pressure Nigerian government, use girls in suicide attacks, forced conversion to Islam) and connections to ISIL and Al Qaeda; ", 'somewhat_valuable': 'Reports of girls fleeing prior to a planned attack, reports about girls that were kidnapped but whose fate is still unknown, reports about girls being freed after government negotiations with terrorist group ', 'not_that_valuable': 'Information about celebrations after girls are rescued and returned home; general background information on Boko Haram when referenced in connection with kidnappings in Africa; side references African school girl kidnappings in reports focused on other attacks', 'non_relevant': "Boko Haram attacks in Nigeria not related to girls' kidnappings, suicide attacks by terrorist groups in countries outside of Africa, girls education in 
Muslim countries, American students accused of a planned school attack in Florida"}, report='On the night of 14–15 April 2014, 276 female students were kidnapped from the Government Secondary School in the town of Chibok in Borno State, Nigeria. Responsibility for the kidnappings was claimed by Boko Haram, an extremist, Islamic, terrorist organization based in northeastern Nigeria. 57 of the schoolgirls managed to escape over the next few months and some have described their capture in appearances at international human rights conferences. A child born to one of the girls and believed by medical personnel to be about 20 months old also was released, according to the Nigerian president\'s office.\n\nSince then hopes were raised on various occasions that the 219 remaining girls might be released. Newspaper reports suggested that Boko Haram was hoping to use the girls as a negotiating pawns in exchange for some of their commanders in jail.\n\nIn May 2016, one of the missing girls, Amina Ali, was found. She claimed that the remaining girls were still there, but that six had died. A further 21 girls were freed in October 2016, while another was rescued the next month. Another was found in January 2017. 82 more girls were freed in May 2017. One of the girls was rescued in January 2018.\n\n\nContents\n1\tBackground\n2\tKidnapping\n3\tAftermath\n3.1\tEvents in 2014\n3.2\tEvents in 2015\n3.3\tEvents in 2016\n3.4\tEvents in 2017\n3.5\tEvents in 2018\n4\tReaction\n5\tSocial media and celebrity involvement\n6\tInternational governmental response\n7\tSee also\n8\tReferences\n9\tExternal links\nBackground\nMain article: Boko Haram insurgency\nThe terrorist group Boko Haram wants to institute an Islamic caliphate in Nigeria and is in particular opposed to western-style modern education, which they say lures people away from following Islamic teaching as a way of life. 
By 2014, tens of thousands of people had been killed in attacks perpetrated by the group, and the Nigerian federal government declared a state of emergency in May 2013 in Borno, Yobe and Adamawa states in its fight against the insurgency. The resulting crackdown led to the capture or killing of hundreds of Boko Haram members, with the remainder retreating to mountainous areas from which they began increasingly to target civilians. However, the campaign failed to stabilise the country. A French military operation in Mali also pushed Boko Haram and AQIM terrorists into Nigeria.\n\n\nBorno State is in northeast Nigeria\nBoko Haram began to target schools in 2010, killing hundreds of students by 2014. A spokesperson for the group said such attacks would continue as long as the Nigerian government continued to interfere with traditional Islamic education. 10,000 children have been unable to attend school as a result of activities by Boko Haram. Boko Haram has also been known to kidnap girls, whom it believes should not be educated, and use them as cooks or sex slaves.\n\nBoko Haram\'s attacks intensified in 2014. In February, the group killed more than 100 Christian men in the villages of Doron Baga and Izghe. That same month, 59 boys were killed in the Federal Government College attack in northeastern Nigeria. In March, the group attacked the Giwa military barracks, freeing captured militants. The Chibok abduction occurred on the same day as a bombing attack in Abuja in which at least 88 people died. Boko Haram was blamed for nearly 4,000 deaths in 2014. Training received from al-Qaeda in the Islamic Maghreb and al Qaeda in the Arabian Peninsula has helped Boko Haram intensify its attacks.\n\nKidnapping\n\nDamage to the school\nOn the night of 14–15 April 2014, a group of militants attacked the Government Girls Secondary School in Chibok, Nigeria. They broke into the school, pretending to be guards. 
According to a diary written by two of the girls (Naomi Adamu and Sarah Samuel) the militants had intended to steal an "engine block" and were initially unsure what to do with the girls. They told the girls to get out and come with them. Some girls were loaded into trucks and the rest had to walk several miles until other trucks came to take them away possibly into the Konduga area of the Sambisa Forest where Boko Haram were known to have fortified camps. Houses in Chibok were also burned down in the incident. The school had been closed for four weeks prior to the attack due to the deteriorating security situation, but students from multiple schools had been called in to take final exams in physics.\n\nThere were 530 students from multiple villages registered for the Senior Secondary Certificate Examination, although it is unclear how many were in attendance at the time of the attack. The children were aged 16 to 18 and were in their final year of school. There was initial confusion over the number of girls kidnapped but on 21 April 2014, parents said 234 girls were missing. A number of the students escaped the kidnappers by jumping off the trucks. According to the police, approximately 276 children were taken in the attack, of whom 53 had escaped as of 2 May. Other reports said that 329 girls were kidnapped, 53 had escaped and 276 were still missing.\n\nAmnesty International said it believes the Nigerian military had four hours\' advance warning of the kidnapping, but failed to send reinforcements to protect the school. Nigeria\'s armed forces have confirmed that the Nigerian military had four-hour advance notice of the attack but said that their over-extended forces were unable to mobilize reinforcements.\n\nJonathan N.C. 
Hill of King\'s College London, has pointed out that Boko Haram kidnapped these girls after coming increasingly under the influence of al-Qaeda in the Islamic Maghreb, and asserts that the group\'s goal is to use girls and young women as sexual objects and as a means of intimidating the civilian population into non-resistance. Hill describes the attacks as similar to kidnapping of girls in Algeria in the 1990s and early 2000s.\n\nAftermath\nEvents in 2014\n\nCEE-HOPE Nigerian organized an event to commemorate one year anniversary of bring back our girls\nNon-Muslim students were forced to convert to Islam. The girls were forced into marriage with members of Boko Haram, with a reputed "bride price" of ₦2,000 each ($6/£4). Many of the students were taken to the neighbouring countries of Chad and Cameroon, with sightings reported of the students crossing borders with the militants, and sightings of the students by villagers living in the Sambisa Forest. The forest was considered a refuge for Boko Haram. Local residents were able to track the movements of the students with the help of contacts across north eastern Nigeria. A diary described how some girls escaped but were returned to Boko Haram by local villagers and whipped.\n\nThe Guardian reported that the British Royal Air Force conducted Operation Turus in response to the Chibok schoolgirls\' kidnapping by Boko Haram in Nigeria in April 2014. A source involved with the Operation told the Observer that “The girls were located in the first few weeks of the RAF mission,” and that “We [RAF] offered to rescue them, but the Nigerian government declined,” this was because it viewed the matter as a “national issue” to be resolved by Nigerian intelligence and security services.\n\nOn 2 May 2014, police said they were still unclear as to the exact number of students kidnapped. They asked parents to provide documents so an official count could be made, as school records had been damaged in the attack. 
On 4 May, the Nigerian President, Goodluck Jonathan, spoke publicly about the kidnapping for the first time, saying the government was doing everything it could to find the missing girls. At the same time, he blamed parents for not supplying enough information about their missing children to the police.\n\nOn 5 May 2014, a video in which Boko Haram leader Abubakar Shekau claimed responsibility for the kidnappings emerged. Shekau claimed that "Allah instructed me to sell them...I will carry out his instructions." and "Slavery is allowed in my religion, and I shall capture people and make them slaves." He said the girls should not have been in school and instead should have been married since girls as young as nine are suitable for marriage.\n\nFollowing the kidnapping incident, Boko Haram again abducted another eight girls, aged between 12–15, from northeastern Nigeria, a number later raised to eleven.\n\nChibok is primarily a Christian village and Shekau acknowledged that many of the girls seized were not Muslims: "The girls that have not accepted Islam, they are now gathered in numbers...and we treat them well the way the Prophet Muhammad treated the infidels he seized."\n\nOn 5 May 2014, at least 300 residents of the nearby town of Gamboru Ngala were killed in an attack by Boko Haram militants after Nigerian security forces had left the town to search for the kidnapped students. On 9 May, former Boko Haram negotiator, Shehu Sani, stated that the group wanted to swap the abducted girls for its jailed members. On 11 May, Kashim Shettima, Governor of Borno State in Nigeria, said that he had sighted the abducted girls and that the girls were not taken across the borders of Cameroon or Chad. 
On 12 May, Boko Haram released a video showing about 130 kidnapped girls, each clad in a hijab and a long Islamic chador, and demanded a prisoner exchange.\n\nA journalist-brokered deal to secure the release of the girls in exchange for 100 Boko Haram prisoners held in Nigerian jails was scrapped at a late stage on 24 May 2014 after President Goodluck Jonathan consulted with U.S., Israeli, French and British foreign ministers in Paris, where the consensus was that no deals should be struck with terrorists, and that a solution involving force was required.\n\nOn 26 May, the Nigerian Chief of Defence Staff announced that the Nigerian security forces had located the kidnapped girls, but ruled out a forceful rescue attempt for fears of collateral damage.\n\nOn 30 May, it was reported that a civilian militia in the Baale region of Northeastern Nigeria found two of the kidnapped girls raped, "half-dead," and tied to a tree. Villagers said the Boko Haram group had left the two girls, and killed four other disobedient girls and buried them. 223 were still missing.\n\nSir Andrew Pocock, British High Commissioner to Nigeria said that a couple of months after the kidnapping a group of up to 80 of the Chibok girls were seen by American \'eye in the sky\' technology but nothing was done. The girls, a camp and evidence of ground transport vehicles were spotted next to a local landmark called the \'Tree of Life\' in the Sambisa forest.\n\nOn 24 June, it was reported that 91 more women and children were abducted in other areas of Borno State. One source estimated in June that there could be as many as 600 girls held by Boko Haram in three camps outside Nigeria.\n\nOn 26 June, it was announced that Levick, a Washington, D.C. 
public relations firm, had received "a contract worth more than $1.2 million" from the government of Nigeria to work on "the international and local media narrative" surrounding the Chibok schoolgirl kidnapping.\n\nOn 1 July, a businessman suspected of carrying out the kidnappings of the school girls, as well as the bombing of a busy market in northeastern Nigeria, was arrested. Military sources said that he was also accused of helping the Islamist militant group kill the traditional leader Idrissa Timta, the Emir of Gwoza.\n\nOn 15 July, Zakaria Mohammed (\'the Butcher\'), a high-ranking member of Boko Haram, was arrested at Darazo-Basrika Road while fleeing from the counter insurgency operations going on around the Balmo Forest.\n\nOn 12 October 2014, it was reported that four girls from the original kidnapped group had escaped and walked three weeks to freedom in Nigeria. They said they had been held in a camp in Cameroon and raped every day.\n\nEvents in 2015\nStephen Davis, a former Anglican clergyman, contacted three Boko Haram commanders who said they might be prepared to release Chibok schoolgirls and went to Nigeria in April 2015. He was given proof of life (a video of them being raped) and was told 18 were seriously ill, some with HIV. Davis got initial agreement that Boko Haram would release these ill girls. However, after three attempts the deal fell through when another group abducted the girls believing they could make money out of them and Davis left Nigeria. Davis commented that it was not difficult to locate the five or six main Boko Haram camps. He could find them on Google Earth.\n\nIn May 2015, it was reported that the Nigerian military had reclaimed most of the areas previously controlled by Boko Haram in Nigeria including many of the camps in the Sambisa forest where it was suspected the Chibok girls had been kept. Although many women had been freed, none of the Chibok girls had been found. 
It was reported that some of the girls had been sold into slavery for N2,000 (about $10) each, others had been forcibly married to Boko Haram fighters and they may have been killed. Kashim Shettima, the Borno state governor said he suspected the Chibok girls were being kept in underground bunkers.\n\nEvents in 2016\nIn January 2016 the Nigerian military were reported to have freed 1,000 women held captive by Boko Haram but none of them were Chibok girls.\n\nIn April 2016 Boko Haram released a video showing 15 girls who appeared to be some of the kidnapped Chibok girls. The video was apparently taken in December 2015 and the girls seemed to be well fed and not distressed.\n\nOn 17 May 2016, Amina Ali Nkeki, one of the girls was found along with her baby and Mohammad Hayyatu, a suspected Boko Haram militant who claimed to be her husband, by the vigilante Civilian Joint Task Force group in the Sambisa Forest. All three were suffering from severe malnutrition. She was then taken to house of the group\'s leader Aboku Gaji who recognised her. The group then reunited the girl with her parents. She met Nigerian President Muhammadu Buhari on 19 May. Government officials announced the same day that the Nigerian army and vigilante groups had killed 35 Boko Haram militants, freed 97 women and children and claimed one of the women was a Chibok schoolgirl. However, there were doubts that this girl, Serah Luka, was really one of the kidnapped Chibok schoolgirls. On 21 May 2016, Amir Muhammad Abdullahi, who claimed to be the Boko Haram second in command and speaker for several senior militants, offered to surrender so long as they would not be harmed and in return they would release hostages including the Chibok girls. 
However he said of Chibok girls; "...frankly, just about a third of them remain, as the rest have been martyred”.\n\nIn August 2016 Boko Haram released a video of what appeared to be about 50 Chibok girls, some of them holding babies, with an armed masked spokesman who demanded the release of jailed fighters in exchange for the girls\' freedom, The masked gunman said some of the Chibok girls had been killed by Nigerian air strikes and 40 had been married. The film was apparently released on the orders of Abubakar Shekau, the leader of one of the factions of Boko Haram.\n\nIn October 2016, 21 of the Chibok schoolgirls had been freed by Boko Haram after negotiations between the group and the Nigerian government brokered by International Committee of the Red Cross and the Swiss government. On 16 October, President Buhari\'s spokesperson stated that the ISIL-allied faction of Boko Haram was willing to negotiate the release of 83 more of the girls. According to him, the splinter group had stated that the rest of the girls were under the control of Shekau-led faction. 2 days later, Pogu Bitrus, the chairman of the Chibok Development Association, claimed that more than 100 of the missing girls apparently did not want to return home because they had either been brainwashed or were fearful of the stigma they will receive.\n\nAnother girl named Maryam Ali Maiyanga was found and rescued by the Nigerian Army on 5 November along with a baby by the Nigerian Army. The spokesman for the Army, Sani Usman, said that they discovered her in Pulka of Borno state while screening escapees from Boko Haram\'s Sambisa forest base. She was confirmed to be one of the kidnapped girls by Bring Back Our Girls.\n\nEvents in 2017\n\nU.S. President Donald J. 
Trump, Ivanka Trump, and Chibok schoolgirls Joy Bishara and Lydia Pogu at the White House on June 27, 2017.\nOne of the kidnapped girls, Rakiya Abubakar, was reported on January 5, 2017 to have been found by the Nigerian Army along with a 6-month-old baby while they were interrogating suspects detained in army raids on the Sambisa forest. Her identity was later confirmed by Bring Back Our Girls group.\n\nOn May 6, 82 of the schoolgirls were released following successful negotiations between the Nigerian government involving the exchange of five Boko Haram leaders. The negotiations were carried by Mustapha Zanna, barrister and owner of an orphanage in Maiduguri. The deal also involved the intervention of the Swiss government and the Red Cross. 3 million Euros (about 3.7 million US$) were paid as ransom money in two duffel bags for the total of 103 girls released in October 2016 and May 2017. A Nigerian government spokesman stated that though originally 83 girls were to be released in May 2017, one of them chose to stay with her husband instead of being freed.\n\nEvents in 2018\nThe Nigerian military stated on 4 January 2018 that it had rescued Salomi Pogu, one of the kidnapped girls. Col. Onyema Nwachukwu stated that she was rescued near Pulka village in Borno. Her name was in the list of the kidnapped Chibok schoolgirls. She was found in the company of another young woman and her child. In February 2018 most of the released girls were studying at the American University of Nigeria not far from the original scene of the kidnapping at Chibok. 
It was estimated that 13 girls were presumed dead and 112 were still missing.', report_url='https://en.wikipedia.org/w/index.php?title=Chibok_schoolgirls_kidnapping&diff=825075173&oldid=823837916', report_date='2018-02-11', translation_lang='fa') })
# Spot-check the HC4 Russian query fixtures for each split (train/dev/test).
# Each _test_queries call presumably asserts the split's total query count and
# compares the record at the listed index field-for-field -- TODO confirm
# against DatasetIntegrationTest._test_queries in test/integration/base.py.
# NOTE(review): "ExctractedCCQuery" is the (misspelled) name actually exported
# by ir_datasets.formats; do not correct the spelling here independently of
# the class definition, or the import will break.
def test_hc4_ru_query(self): self._test_queries('hc4/ru/train', count=7, items={ 0: ExctractedCCQuery(query_id='1005', title='political implications of the Kemerovo fire', description='What are the political implications of the public reaction to the Kemerovo fire?', ht_title='', ht_description='', mt_title='политические последствия Кемеровского пожара', mt_description='Каковы политические последствия реакции общественности на кемеровский пожар?', narrative_by_relevance={'very_valuable': '', 'somewhat_valuable': '', 'not_that_valuable': '', 'non_relevant': ''}, report='Politicians across the globe sent their condolences. 28 March 2018 was declared the national day of mourning in Russia.\n\nA rally was held in Kemerovo demanding the resignation of Kemerovo Mayor Ilya Seredyuk and the longtime regional governor, Aman Tuleyev. 
', report_url='https://en.wikipedia.org/w/index.php?title=2018_Kemerovo_fire&oldid=833479706#Aftermath', report_date='2018-03-31', translation_lang='ru') })
# dev split: item 0 additionally exercises a populated narrative_by_relevance
# mapping plus non-empty human (ht_*) and machine (mt_*) translation fields.
self._test_queries('hc4/ru/dev', count=4, items={ 0: ExctractedCCQuery(query_id='3', title='British royal news impacts', description='What political and economic impacts does news about the British royal family have domestically and abroad?', ht_title='Влияние британских королевских новостей', ht_description='Какое политическое и экономическое влияние новости о британской королевской семье имеют внутри страны и за рубежом?', mt_title='Британские королевские новости влияют', mt_description='Какие политические и экономические последствия имеют новости о британской королевской семье как внутри страны, так и за рубежом?', narrative_by_relevance={'very_valuable': 'Information regarding economic and political impacts of the British royal family in the UK and worldwide', 'somewhat_valuable': 'Information related to the British royal family and their interactions with politics or the economy', 'not_that_valuable': 'information about the royal family', 'non_relevant': 'information not about the royals'}, report="Announcement of engagement\nPrincess Eugenie of York in 2013\n\nPrincess Eugenie of York is the second daughter of Prince Andrew, Duke of York and Sarah, Duchess of York. Jack Brooksbank is a British nightclub manager, a distant relation of the Brooksbank baronets,[notes 1] and a third cousin twice removed of Princess Eugenie through Thomas Coke, 2nd Earl of Leicester.[notes 2] The couple have been dating for seven years; they were first introduced by friends in a ski break in Verbier, Switzerland, where Brooksbank was working at the time.\n\nOn 22 January 2018, Buckingham Palace announced that Princess Eugenie of York would marry Jack Brooksbank in the autumn. 
They were engaged earlier the same month while in Nicaragua with Brooksbank giving the Princess an oval-cut Padparadscha sapphire surrounded by a halo of diamonds set on a gold band with two further diamonds on the shoulders. The ring bears a striking similarity to the engagement ring of Princess Eugenie's mother.\n\nEugenie was eighth in the line of succession to the British throne at the time the engagement was announced. As of May 2018, she is ninth in the line of succession. Although Eugenie is a member of the British royal family, she does not require the Queen's permission to marry.[notes 3] The Duke and Duchess of York expressed their delight at the news and the British Prime Minister Theresa May congratulated the couple on her Twitter account. After the announcement the couple gave an interview to Matt Baker of BBC One.\n\nThe official engagement photographs were taken in the Picture gallery at Buckingham Palace. ", report_url='https://en.wikipedia.org/w/index.php?title=Wedding_of_Princess_Eugenie_and_Jack_Brooksbank&oldid=841040200', report_date='2018-05-13', translation_lang='ru') })
# test split: spot-checks a mid-list record (index 10), not just the first.
self._test_queries('hc4/ru/test', count=50, items={ 10: ExctractedCCQuery(query_id='127', title='Brumadinho dam collapse cause', description='What caused the collapse of the dam at the Córrego do Feijão iron ore mine owned by Vale in Brumadinho, Brazil?', ht_title='Причина обрушения плотины Брумадиньо', ht_description='Что стало причиной обрушения плотины на железорудном руднике Коррего-ду-Фейхао, принадлежащем Vale в Брумадиньо, Бразилия?', mt_title='Причина обрушения плотины Брумадиньо', mt_description='Что вызвало обрушение плотины на руднике железной руды Коррего-ду-Фейяу, принадлежащей Вале в Брумадинью, Бразилия?', narrative_by_relevance={'very_valuable': 'states reason and provides responsible party', 'somewhat_valuable': 'company may have known about failures', 'not_that_valuable': 'people were arrested', 'non_relevant': 'mentions victims or unrelated topics (gas
trade, cars)'}, report="Background\nBento Rodrigues Village right after the similar Mariana dam disaster of 2015\n\nThe Brumadinho dam failure happened three years and two months after the Mariana dam disaster, which killed 19 people and destroyed the village of Bento Rodrigues. The Mariana disaster is considered the worst environmental disaster in Brazil's history and is still under investigation.\n\nExperts say that Brazil's weak regulatory structures and regulatory gaps allowed the dam's failure. Three years after the Mariana dam collapse, the companies involved in that environmental disaster have paid only 3.4% of R$785 million in fines.\n\nAt the time of the Mariana dam disaster in November 2015, the department in charge of inspecting mining operations in the state of Minas Gerais, the National Department of Mineral Production (DNPM), was worried about the retirement of another 40% of public employees over the course of the next two years.\n\nAccording to the national registry of the National Mining Agency, the Córrego do Feijão dam, built in 1976 by the Ferteco Mineração (acquired by Vale in 2001), was classified as a small structure with low risk of high potential damage. In a statement, the State Department of Environment and Sustainable Development reported that the venture was duly licensed. In December 2018, Vale obtained a license to reuse waste from the dam (about 11.7 million cubic meters) and to close down activities. The dam had not received tailings since 2014 and, according to the company, underwent bi-weekly field inspections. 
", report_url='https://en.wikipedia.org/w/index.php?title=Brumadinho_dam_disaster&oldid=tml', report_date='2019-02-10', translation_lang='ru') })
# Spot-check the HC4 Chinese qrels for each split. Items pin the first and an
# interior TrecQrel (query_id, doc_id UUID, graded relevance, iteration).
def test_hc4_zh_qrels(self): self._test_qrels('hc4/zh/train', count=341, items={ 0: TrecQrel(query_id='1001', doc_id='062bf68a-5971-40f9-9cb1-6d7bf55592e2', relevance=0, iteration='0'), 123: TrecQrel(query_id='1009', doc_id='4393e311-34f8-4b4a-810b-97ab75e45916', relevance=0, iteration='0') }) self._test_qrels('hc4/zh/dev', count=466, items={ 0: TrecQrel(query_id='1', doc_id='07fdce5f-e00b-4a40-b9eb-f0f78cfaefc4', relevance=0, iteration='0'), 123: TrecQrel(query_id='3', doc_id='ed7d7890-fffb-4620-a952-f5cc20a2368f', relevance=0, iteration='0') }) self._test_qrels('hc4/zh/test', count=2751, items={ 0: TrecQrel(query_id='102', doc_id='1b7ba71e-3294-4817-be08-300e22ead35c', relevance=0, iteration='0'), 123: TrecQrel(query_id='104', doc_id='f98d9c3c-089c-4124-946e-afbdc53e4fc9', relevance=0, iteration='0') })
# Spot-check the HC4 Persian qrels for each split (same pattern as above;
# includes a relevance=1 and a relevance=3 judgment, not only zeros).
def test_hc4_fa_qrels(self): self._test_qrels('hc4/fa/train', count=112, items={ 0: TrecQrel(query_id='1008', doc_id='0223ebb7-4d7e-47bc-a282-d8eeb8c9c49e', relevance=1, iteration='0'), 67: TrecQrel(query_id='1022', doc_id='a8e5a616-2a0d-43f2-af2b-1b8ec622ce6f', relevance=0, iteration='0') }) self._test_qrels('hc4/fa/dev', count=565, items={ 0: TrecQrel(query_id='1', doc_id='079eba9b-fda2-4c26-ac18-2c35a7a08e88', relevance=0, iteration='0'), 67: TrecQrel(query_id='4', doc_id='b9703bc0-af7e-4332-973a-9617dbd69b60', relevance=3, iteration='0') }) self._test_qrels('hc4/fa/test', count=2522, items={ 0: TrecQrel(query_id='103', doc_id='0d4e7624-69fb-4d55-9103-5ab04abb28c3', relevance=0, iteration='0'), 67: TrecQrel(query_id='107', doc_id='25872c00-4ca5-4584-b9c4-cb88b3f47e2c', relevance=0, iteration='0') })
# Spot-check the HC4 Russian qrels (train split here; dev/test follow).
def test_hc4_ru_qrels(self): self._test_qrels('hc4/ru/train', count=92, items={ 0: TrecQrel(query_id='1005', doc_id='047cae7c-c674-41ea-a4c7-1d0d68c73b47', relevance=1, iteration='0'), 77: TrecQrel(query_id='1026',
doc_id='1f22c2cf-303d-4d09-9d54-26cd6a53c9d3', relevance=0, iteration='0') }) self._test_qrels('hc4/ru/dev', count=265, items={ 0: TrecQrel(query_id='3', doc_id='07d5796e-e55f-469f-b158-88a8aa53131d', relevance=0, iteration='0'), 77: TrecQrel(query_id='6', doc_id='62d31f31-1efa-4da2-8944-ac35a3cd77fa', relevance=1, iteration='0') }) self._test_qrels('hc4/ru/test', count=2970, items={ 0: TrecQrel(query_id='101', doc_id='00d525d6-ae24-43c0-b5f5-125d796552d1', relevance=0, iteration='0'), 77: TrecQrel(query_id='103', doc_id='3bd61245-80a3-4125-a095-f1314b1509f4', relevance=3, iteration='0') }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/highwire.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.highwire import HighwireDoc, HighwireSpan, HighwireQrel from ir_datasets.formats import GenericQuery from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestHighwire(DatasetIntegrationTest): def test_highwire_docs(self): self._test_docs('highwire', count=162259, items={ 0: HighwireDoc('10901322', 'ajepidem', '\r\nJohn Snow and Modern-Day Environmental Epidemiology\r\n', (HighwireSpan(start=0, length=151, text='\r\n\r\n\r\nJohn Snow and Modern-Day Environmental Epidemiology\r\n\r\n\r\n\r\nDale P. Sandler\r\n'), HighwireSpan(start=154, length=207, text='\r\n\r\nFrom the Epidemiology Branch, National Institute of Environmental Health Sciences, P.O. Box 12233\x97Mail Drop A3-05, 111 T. W. Alexander Drive, Research Triangle Park, NC 27709.\r\n'), HighwireSpan(start=364, length=296, text='\r\n\r\n\r\n\r\n\r\nWhat does an anecdote about John Snow have to do with modern-day epidemiology? And why use it to introduce an issue of the Journal highlighting the challenges of studying disease risks associated with low dose environmental exposures? 
'), HighwireSpan(start=663, length=1494, text="\r\n\r\nIn this issue, Lilienfeld describes John Snow giving expert-witness testimony on behalf of industry (1). Besides being interesting on a historical basis, this incident raises several issues that are pertinent today. Lilienfeld's paper and the accompanying commentary by Vandenbroucke (2) deal directly or indirectly with the role and responsibilities of expert witnesses, the extrapolation of data on health effects from high dose exposures to low dose exposures, the importance of epidemiology to the development of public health policy, the current debates on environmental justice (3), and the use of the precautionary principle (4) in standard-setting. Furthermore, if faced with an issue similar to that faced by Snow\x97namely, local residents' being worried about health consequences associated with emanations from factories\x97would modern-day environmental epidemiologists be any better positioned to carry out appropriate studies and reach sound conclusions? "), HighwireSpan(start=2160, length=844, text='\r\n\r\nSnow can be seen at once as victim and perpetrator of sins that are common in epidemiology in general and in environmental epidemiology in particular. Was Snow victimized by the medical establishment, including The Lancet, for expressing views that were not commonly held by the scientists of the day? Were his peers outraged because of the reactionary social position he was taking (as suggested by Vandenbroucke)? On the other hand, was he as guilty as proponents of the miasma theory for trying to apply his theory of disease transmission to all situations without allowing for the possibility of multiple disease pathways? Did he fall into the trap of equating the absence of data with an absence of effect? 
'), HighwireSpan(start=3007, length=860, text='\r\n\r\nWhen Snow contended that emanations from the bone-boiling factories were not causing ill health in the community at large, he invoked arguments that are often raised when unexpected health effects are encountered following supposed low dose exposures. One argument is that such health effects are implausible given what we know about high dose exposures. In this instance, Snow noted that the factory workers were not dying and therefore health effects in the community at large were not plausible. A related argument is that, even if workers are dying or suffering other health effects, because of the distance from the exposure source, the exposure levels in the community are probably too low to plausibly affect health. '), HighwireSpan(start=3870, length=892, text='\r\n\r\nHealth effects of low dose exposures are often seen as implausible, even in the face of accumulated consistent evidence. Such arguments have frequently been invoked in environmental epidemiology. Examples of low dose exposures that have been deemed implausible contributors to disease risk based on what is known about high dose exposures include passive smoking, residential radon exposure, childhood lead exposure, electromagnetic fields, and residence near nuclear facilities. If one begins with a fixed idea of what is plausible, arguments regarding susceptible subgroups, inverse dose rate, hormesis, multiple pathways, multifactor etiologies, and complex exposures (e.g., the different constituents of sidestream and mainstream smoke) are untenable. '), HighwireSpan(start=4765, length=539, text='\r\n\r\nBut how do we know that the factory workers were not dying or suffering other ill effects? Snow cited no studies. All too often the absence of data is argued as proof of no effect. This issue becomes especially difficult when regulatory decisions are being made. In the absence of evidence, can something be considered safe? 
While science is important, it is ultimately social forces, as much as science, that guide regulators in decision-making. '), HighwireSpan(start=5307, length=1484, text='\r\n\r\nSnow\'s statements and the questions that were put to him call to mind some of the fundamental difficulties inherent in environmental epidemiology. Today, there are numerous examples of residents who live near potential environmental hazards claiming health effects that can never be proven beyond a reasonable doubt. Although the "gold standard" is an unbiased risk estimate with precise confidence limits, studies focused on overt health effects are invariably underpowered because of the small numbers of residents in the neighborhoods of interest. Other creative approaches to assessment of subclinical health effects are more costly and difficult to implement, but even these studies are often too small for conclusive results. Yet, what is the right thing to do? If we wait for strong scientific evidence before we act\x97if we require proof that workers are dying or evidence of overt illness in the community\x97have we waited too long? Few clusters are ever resolved with the identification of a causal link between some localized exposure and disease. While many apparent clusters may be artifacts, what is the real cost of the true hazards that cannot be proven? These were the issues facing Parliament when Snow testified on behalf of industry. '), HighwireSpan(start=6794, length=604, text='\r\n\r\nWhat is the role of the epidemiologist in this quagmire? In Snow\'s London, the living conditions of people near the factories were likely to have been dismal. There were no doubt residents who perceived their symptoms as being related to the smells\x97smells that, if nothing else, impacted the quality of life. Policy-makers must balance "doing the right thing" with regard to human suffering and quality of life with the financial costs of doing so. 
Epidemiology can only go so far in providing the answers. '), HighwireSpan(start=7401, length=1174, text='\r\n\r\nIt is this political and social tug-of-war that makes environmental epidemiology especially difficult. On the one hand, there are well\x97funded industries with a financial stake in the outcome of such research. As Vandenbroucke notes (2), these industries often are in a position to exploit the many weaknesses that epidemiologists are trained to identify in their own studies and in the work of others to cast potentially damaging results in a more favorable light. On the other hand, there are environmental groups committed to proving that a particular environmental exposure can be linked to a variety of personal complaints; these groups may be motivated by the possibility of effecting social change through science or by the prospect of receiving needed medical attention or financial compensation. Those who attempt to work in this arena often find themselves and their research attacked from all directions. '), HighwireSpan(start=8578, length=1091, text='\r\n\r\nEnvironmental epidemiology is difficult to conduct today for other reasons as well. Adequate tools with which to measure and quantify exposures are lacking. Studies are often unable to detect meaningful effects because exposures are low, infrequent, or difficult to measure with certainty. How many investigators are willing to tackle this problem? In the case of the bone-boiling factories, would research linking questionnaire data on symptoms to factory releases be believed? Would a study relating distance from the factory to disease be sufficient evidence of effect? What health effects would be plausible based on known biologic mechanisms? How well could those effects be measured, and could they be measured objectively? Is there a biomarker of exposure? If a biomarker exists, does it measure relevant past exposures? 
Is the measure unaffected by current health status\x97particularly the disease under study? '), HighwireSpan(start=9672, length=385, text="\r\n\r\nIn addition to Lilienfeld's historical report and Vandenbroucke's commentary, this issue of the Journal features papers that illustrate various aspects of the difficulties faced in studying health effects of environmental exposures. Several of these include innovative attempts to improve the quality of such research. "), HighwireSpan(start=10060, length=1974, text="\r\n\r\nThe paper by Viel et al. (5) may come closest to what many may think of as environmental epidemiology. The authors have examined the spatial distribution of soft tissue sarcomas and non-Hodgkin's lymphomas around an incinerator with high dioxin emissions. Their results are suggestive but need to be followed by studies incorporating more rigorous exposure assessment\x97perhaps a biologic measure of exposure such as that used in the study of polychlorinated biphenyls and breast cancer reported by Zheng et al. (6). Other studies described in this issue used a variety of approaches to exposure assessment. Rondeau et al. (7) linked estimates of levels of aluminum and silica in drinking water to risks of dementia and Alzheimer's disease. Laden et al. (8) used questionnaire data on use of electric blankets to estimate exposure to electromagnetic fields, and Gustavsson et al. (9) used questionnaire data and expert assessment by industrial hygienists to classify environmental and occupational exposures. Radiation workers are one of the few groups for which historical records of personal exposure typically are available. Dupree-Ellis et al. (10) took advantage of such records to estimate cumulative external radiation exposure. "), HighwireSpan(start=12037, length=2544, text='\r\n\r\nSeveral of the papers evaluate methods for assessing exposure. For example, Oglesby et al. 
(11) average individual-level annoyance scores to estimate community-level exposure to air pollution. The authors propose that this measure better accounts for exposure variability than data from fixed-site monitoring stations. This is an interesting twist in a field where much work is based on linking data from monitoring stations with population-level mortality statistics. The measure seems to be easy to operationalize, and it correlates well with monitoring station data, although its ultimate utility may be limited. The real gold standard\x97a more precise direct measure of individual exposure, rather than another indirect measure\x97is what is needed. Hwang et al. (12) propose an alternative modeling approach whereby air pollution monitoring station data are used to ascribe exposures to individuals with and without school absences due to respiratory disease. Auvinen et al. (13) compare several possible methods for measuring and classifying exposure to electromagnetic fields. This is a topic that has been hurt by the lack of consensus on the best and most appropriate exposure measure, and results tend to vary for studies employing different exposure metrics. The paper by Karagas et al. (14) attempts to link a biologic measure, arsenic in toenails, with an environmental measure of arsenic in water. The toenail measure is likely to reflect total body burden, but it appears to correlate with water only when water levels are high. This presents an interesting regulatory dilemma. The best epidemiologic research may be based on a direct measure of body burden such as levels in toenails, whereas it is water levels that need to be regulated. Studies of toenail arsenic levels may not shed direct light on the link between water levels and disease. '), HighwireSpan(start=14584, length=523, text='\r\n\r\nAs these papers demonstrate, technological advances are making possible a wide range of new study designs and strategies to better assess both exposures and outcomes. 
Although progress has been made, research in environmental epidemiology is far from perfect. As epidemiologists face pressures and criticisms from industry, regulatory bodies, and other scientific disciplines, it is important to not lose sight of the lessons from John Snow. '), HighwireSpan(start=15110, length=51, text='\r\n\r\n\r\n\r\nNOTES'), HighwireSpan(start=15164, length=366, text='\r\n\r\n\r\nReprint requests to Dr. Dale P. Sandler at this address (e-mail: sander{at}niehs.nih.gov). '), HighwireSpan(start=15533, length=54, text='\r\n\r\n\r\nREFERENCES'), HighwireSpan(start=15590, length=5710, text="\r\n\r\n\r\n\r\n\r\n Lilienfeld DE. John Snow: the first hired gun? Am J Epidemiol 2000;152:4\x969.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Vandenbroucke JP. Invited commentary: the testimony of Dr. Snow. Am J Epidemiol 2000;152:10\x9612.[Free Full\xa0Text]\r\n\r\n\r\n Foreman CH Jr. The promise and peril of environmental justice. Washington, DC: Brookings Institute, 1998.\r\n\r\n\r\n Horton R. The new new public health of risk and radical engagement. (Editorial). Lancet 1998;352:251.[ISI][Medline]\r\n\r\n\r\n Viel J-F, Arveux P, Baverel J, et al. Soft-tissue sarcoma and non-Hodgkin's lymphoma clusters around a municipal solid waste incinerator with high dioxin emission levels. Am J Epidemiol 2000;152:13\x9619.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Zheng T, Holford TR, Tessari J, et al. Breast cancer risk associated with congeners of polychlorinated biphenyls. Am J Epidemiol 2000;152:50\x968.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Rondeau V, Commenges D, Jacqmin-Gadda H, et al. Relation between aluminum concentrations in drinking water and Alzheimer's disease: an 8-year follow-up study. Am J Epidemiol 2000;152:59\x9666.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Laden F, Neas LM, Tolbert PE, et al. Electric blanket use and breast cancer in the Nurses' Health Study. 
Am J Epidemiol 2000;152:41\x969.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Gustavsson P, Jakobsson R, Nyberg F, et al. Occupational exposure and lung cancer risk: a population-based case-referent study in Sweden. Am J Epidemiol 2000;152:32\x9640.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Dupree-Ellis E, Watkins J, Ingle JN, et al. External radiation exposure and mortality in a cohort of uranium processing workers. Am J Epidemiol 2000;152:91\x965.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Oglesby L, Künzli N, Monn C, et al. Validity of annoyance scores for estimation of long term air pollution exposure in epidemiologic studies: The Swiss Study on Air Pollution and Lung Diseases in Adults (SAPALDIA). Am J Epidemiol 2000;152:75\x9683.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Hwang J-S, Chen Y-J, Wang J-D, et al. Subject-domain approach to the study of air pollution effects on schoolchildren's illness absence. Am J Epidemiol 2000;152:67\x9674.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Auvinen A, Linet MS, Hatch EE, et al. Extremely low-frequency magnetic fields and childhood acute lymphoblastic leukemia: an exploratory analysis of alternative exposure metrics. Am J Epidemiol 2000;152:20\x9631.[Abstract/Free Full\xa0Text]\r\n\r\n\r\n Karagas MR, Tosteson TD, Blum J, et al. Measurement of low levels of arsenic exposure: a comparison of water and toenail concentrations. Am J Epidemiol 2000;152:84\x9690.[Abstract/Free Full\xa0Text]\r\n\r\n\r\nReceived for publication March 17, 2000. \r\n\r\nAccepted for publication March 29, 2000."), HighwireSpan(start=21303, length=129, text='\r\n\r\n\t\t\r\n \r\n \r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n'), HighwireSpan(start=21435, length=124, text='\r\nRelated articles in Am. J. Epidemiol.:\r\n'), HighwireSpan(start=21562, length=344, text='\r\n\r\n\t\r\n\tJohn Snow: The First Hired Gun?\r\n\tDavid E. Lilienfeld\r\n\t\r\nAm. J. Epidemiol. 
2000 152: 4-9.\r\n\r\n\t\r\n\t\r\n\t[Abstract]\r\n\t\r\n\t[FREE Full Text]\r\n\t\r\n\t\xa0\r\n\t'), HighwireSpan(start=21909, length=362, text='\r\n\t\r\n\tInvited Commentary: The Testimony of Dr. Snow\r\n\tJan P. Vandenbroucke\r\n\t\r\nAm. J. Epidemiol. 2000 152: 10-12.\r\n\r\n\t\r\n\t\r\n\t[Extract]\r\n\t\r\n\t[FREE Full Text]\r\n\t\r\n\t\xa0\r\n\t'), HighwireSpan(start=22274, length=137, text='\r\n\t\r\n\r\n\r\n \r\n \r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n'), HighwireSpan(start=22414, length=16, text='\r\n'))), 9: HighwireDoc('10901331', 'ajepidem', "\r\nSubject-Domain Approach to the Study of Air Pollution Effects on Schoolchildren's Illness Absence\r\n", (HighwireSpan(start=0, length=423, text="\r\n\r\n\r\nSubject-Domain Approach to the Study of Air Pollution Effects on Schoolchildren's Illness Absence\r\n\r\n\r\n\r\nJing-Shiang Hwang1, \r\nYi-Ju Chen2, \r\nJung-Der Wang3,4, \r\nYu-Min Lai3, \r\nChun-Yuh Yang5 and \r\nChang-Chuan Chan3\r\n"), HighwireSpan(start=426, length=565, text='\r\n\r\n1 Institute of Statistical Science, Academia Sinica, Taipei, Taiwan.\r\n2 Institute of Epidemiology, College of Public Health, National Taiwan University, Taipei, Taiwan.\r\n3 Institute of Industrial Hygiene and Occupational Medicine, College of Public Health, National Taiwan University, Taipei, Taiwan.\r\n4 Department of Internal Medicine, National Taiwan University Hospital, Taipei, Taiwan.\r\n5 Department of Public Health, Kaohsiung Medical College, Kaohsiung, Taiwan.\r\n'), HighwireSpan(start=994, length=3118, text="\r\n\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nABSTRACT\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\n\r\nIn this paper, the authors propose a new statistical modeling technique, the subject-domain approach, which is theoretically proven to be equivalent to the time-domain approach in detecting an association between exposure and response with time 
trends. The authors use an empirical data set from a school absence monitoring study conducted during the 1994\x961995 school year in Taiwan to demonstrate this subject-domain approach's application to environmental epidemiologic studies. Because the subject-domain models can control the influential personal confounding factors in the models, they show greater statistical power than the traditional time-domain approaches in determining the relation between air pollution and illness absences. The authors' models found that the schoolchildren's risks of illness absence were significantly related to acute exposures to nitrogen dioxide and nitrogen oxides with a 1-day lag (p < 0.01) at levels below the World Health Organization's guidelines. By contrast, the authors could not detect significant associations between air pollution and schoolchildren's absenteeism using time-domain approaches. Such findings imply that the models built on subject domain may be a general solution to the problem of the ecologic fallacy, which is commonly encountered in environmental and social epidemiologic studies. 
"), HighwireSpan(start=4115, length=8, text='\r\n\r\n\r\n\r\n'), HighwireSpan(start=4126, length=106, text=' air pollution; epidemiologic methods; nitrogen dioxide; statistics; time-dependent covariate; time series'), HighwireSpan(start=4235, length=28, text='\r\n\t\r\n\t\r\n \r\n\t\t'), HighwireSpan(start=4266, length=151, text='Abbreviations: \r\n\t\r\nPM10, particulate matter with a diameter less than 10 µm; SOAP&HIT, Study On Air Pollution and Health In Taiwan'), HighwireSpan(start=4420, length=5037, text="\r\n\r\n\r\n \r\n \xa0\xa0\r\nINTRODUCTION\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\nSeveral epidemiologic studies have used time-domain methods to illustrate the effects of air pollution on hospital admissions and emergency room visits for various respiratory diseases (1\x9610). In these ecologic-type epidemiologic studies, daily counts of hospital admissions or emergency room visits in a geographic area are usually regressed against pollution levels measured at several fixed-site air monitoring stations in the same areas. In applications of the same time-domain methods to investigation of the relation between air pollution and illness absence, neither community-based studies nor cohort-based studies show consistent findings of air pollution effects on absence (11\x9616). These studies' inherent problem of the ecologic fallacy, i.e., the lack of subject-specific information in the study population, tends to bias study results toward the null. Traditional models built on time domain usually cannot include subject-specific attributes in the models, even when such personal information is available. 
In the case of studies using illness absence as the outcome, we must control for each subject's personal factors, such as the individual's susceptibility factors and general environmental conditions, in order to illustrate air pollution effects on the risk of illness absence. One solution to this problem is to transform original time-series data into a subject-domain problem and make within-subject comparisons. An example is the case-crossover design for analysis of data with time trends (17, 18). Such a subject-domain approach can better estimate relative risks, because individual susceptibility factors are controlled by within-subject comparisons. "), HighwireSpan(start=9460, length=1627, text="\r\n\r\nIn this paper, we show that it is also valid to use the subject-domain approach to analyze time-domain problems and make between-subject comparisons. We demonstrate that time-series data can be equivalently analyzed by time-domain and subject-domain modeling approaches, theoretically as well as empirically. We further demonstrate, using data from an empirical study, that the subject-domain model is better than the time-domain model for uncovering true effects of air pollution on illness absence because person-related information can be included in the model. The data set used is from a school absence monitoring study we conducted during the 1994\x961995 school year in Taiwan. The study examined 4,679 schoolchildren's illness absences attributed to respiratory diseases at six schools. It was part of an epidemiologic study on air pollution and health, the Study On Air Pollution and Health In Taiwan (SOAP&HIT) (19). In the SOAP&HIT, air quality data were measured from fixed-site ambient air monitoring stations, and information on personal and housing characteristics was obtained from a questionnaire survey. 
Here we describe a subject-domain approach to the estimation of acute effects of exposure to nitrogen dioxide and nitrogen oxides on the risk of illness absence from school. "), HighwireSpan(start=11090, length=2359, text='\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nMATERIALS AND METHODS\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\nStudy population\r\n\r\nWe collected the attendance records of 5,072 students aged 6\x9612 years from six primary schools in Taiwan during 1994\x961995. The study population included 705 students from Taihsi, 954 from Keelong, 1,386 from Sanchung, 796 from Toufen, 701 from Jenwu, and 530 from Linyuan. The Taihsi school is in a rural area; the Keelong and Sanchung schools are in urban areas; and the Jenwu, Linyuan, and Toufen schools are in industrial areas where petrochemicals are produced. From these 5,072 schoolchildren, we used 4,697 students (92 percent) with absence records covering at least one school year and complete information from the questionnaire survey as our study cohort. '), HighwireSpan(start=13452, length=553, text='\r\n\r\nIllness absence\r\n\r\nTeachers in each class of these six schools helped document the records and causes of absence for each absentee. The causes of illness absence were first screened by school nurses daily and doubly checked by trained physicians biweekly. We included only absenteeism due to respiratory diseases in our data analysis. If an individual had consecutive days of absence, only the first day was considered for that event in the analyses. 
'), HighwireSpan(start=14008, length=1418, text="\r\n\r\nEnvironmental data\r\n\r\nThe hourly concentrations of six major air pollutants--particulate matter with a diameter less than 10 µm (PM10), sulfur dioxide, nitrogen oxides, nitrogen monoxide, nitrogen dioxide, and ozone\x97were continuously measured by air-monitoring stations located in these six primary schools. Weather data, including temperature, wind speed and direction, and precipitation, were also measured continuously in these air-monitoring stations. The environmental data obtained from school-based monitoring stations, which are located in community centers, are generally well representative of a community's ambient air quality in Taiwan (20). Since Taiwanese schoolchildren spend most of their outdoor time at school, where classrooms are always well ventilated naturally and without air conditioning, we calculated daytime averages of environmental data from 8:00 a.m. to 6:00 p.m. in order to represent their outdoor exposures. A questionnaire survey on home characteristics (described below) was used to account for children's potential exposures to air pollutants indoors. "), HighwireSpan(start=15429, length=1100, text="\r\n\r\nQuestionnaire\r\n\r\nWe treated subjects' personal information from the questionnaire survey as confounding factors to be controlled in our subject-domain model. The survey was carried out at the beginning of the study. The information gathered in the questionnaire included an individual student's demographic data, personal and family history of respiratory diseases, and characteristics of the home environment. Major categories of respiratory symptoms and diseases in the questionnaire included morning cough, day or night cough, chronic cough, shortness of breath, nasal symptoms, sinusitis, wheezing or asthma, allergic rhinitis, bronchitis, pneumonia, and family history of respiratory diseases. 
Key indicators of home environment included a crowding index, household smoking, the presence of pets or fowl, coal stove use, gas-cooker use, incense-burning, mosquito repellent-burning, indoor plants, and home dampness. "), HighwireSpan(start=16532, length=2450, text="\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nSTATISTICAL METHODS\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\nDaily health events can generally be represented by yit for students in a school. Here, i = 1,... I index the schoolchil-dren and t = 1,... T index the school days. Let yit be equal to 1 when subject i is absent on day t and zero otherwise. Let xt be the area level of a pollutant measured from representative monitoring stations on lagged h days. The lag ranging from 0 to 4 days was used in this paper. We may assume that yit is a realization from a Bernoulli distribution with a rare rate of pit, which may be affected by at least two main effects of subject i's personal characteristics and the associated environmental conditions xt. "), HighwireSpan(start=18985, length=1217, text='\r\n\r\nTime-domain approach\r\n\r\nSince illness absence is usually a rare event, it is difficult to select a proper model for testing the association between xt and original observed sparse yit directly. However, if xt has an acute effect on yit, with the assumption of independent subjects, xt should also have an acute effect on daily total counts of absence, . Hence, the conventional time-domain approaches treat the aggregated absence counts by times, y+t, as a conditionally independent Poisson variable when the mean total number of absentees, , is not too small. It is easily and widely applied to health effects studies but usually requires a large population size to have a chance to detect a significant association between xt and y+t from fitted Poisson or negative binomial models. 
'), HighwireSpan(start=20205, length=1651, text="\r\n\r\nSubject-domain approach\r\n\r\nOne equivalent method of aggregating data by time is to aggregate the sparse yit data by subject. Similarly, if xt have affected subject i such that yit = 1 for some t's in the study period, the average of these xt's can be treated as having an acute effect on the subject's total absence in the study time period, . For each subject having yi+, we define the average of these xt's as the subject's level of exposure, denoted by . When subjects have no absences during the study period, their air pollution levels are given a constant value, that is, the average of xt for all t with y+t = 0. Conceivably, the original association between the area exposure index, xt, and the time-specific population absence frequency in an area, y+t, in the time-domain approach will be equivalent to the association between individual exposure levels, zi, and each subject's absence frequency observed over the entire study period, yi+, in the subject-domain approach. The theoretical proof of such equivalence is given in the Appendix. "), HighwireSpan(start=21859, length=991, text="\r\n\r\nUnder this framework, we can reasonably assume yi+ to approximate a Poisson distribution with a mean obtained by . Obviously, yi+ is affected not only by the subject's air pollution levels zi and weather levels wi but also by personal/housing characteristics and other confounding factors. Here, the individual level of weather exposures for the ith schoolchild, w>i, is calculated by replacing the pollutant level of xt in the zi statistic by weather measurements. Therefore, having adjusted for the influential personal variables, we expect that models built on subject domain are more powerful in detecting the association between pollutants and health outcomes. 
"), HighwireSpan(start=22853, length=199, text="\r\n\r\nCombining data from all students in the six schools, we propose using standard Poisson regression to model subjects' absence counts as follows:\r\n "), HighwireSpan(start=23055, length=810, text='\r\n\r\nwhere pki+ is the expected total number of absences for subject i at school k and Tk is the number of school days in the study period on which data were collected at the kth school. The individual level of air pollution and weather exposures for schoolchild i at school k are denoted by vectors zki and wki. The vector uki consists of explanatory variables including a dummy variable for area characteristics and covariates for personal characteristics. '), HighwireSpan(start=23868, length=1828, text='\r\n\r\nSelection of questionnaire items\r\n\r\nTo reduce the burden of selecting proper personal variables from a large number of questionnaire items, we used classical association-testing approaches of Pearson\'s 2 test and the generalized logit model to identify a few key factors (21, 22). We classified the response variable of absence counts into three categories of none (0 absences), low (1\x963 absences), and high (>4 absences), which had a natural ordering. In the same way, we classified each personal variable into appropriate categories. For example, we classified "school grade" into three categories of low (grade 1\x962), medium (grade 3\x964), and high (grade 5\x966) and the symptom "day or night cough" into two categories of yes and no. We first applied Pearson\'s 2 test to exclude any items that were not significantly associated with absence incidence. We then entered the items that had been screened significant into the generalized logit models to finalize the selection of personal variables. We used generalized logit models as the second step of variable selection, because the responses had a natural ordering. 
We reserved items that were statistically significant in more than two schools as the personal variables for the subject-domain Poisson regression model. '), HighwireSpan(start=25699, length=2804, text="\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nRESULTS\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\nIndividual level of exposure\r\n\r\nAlthough the data were analyzed with different time lags, we present only the results for a 1-day lag for illustration and simplicity. As described above, individual levels of air pollution exposure were calculated according to individual students' absence records from fixed-site ambient monitoring data. For example, suppose one student has records of five absences on the dates May 1, May 2, June 4, September 27, and October 3 during the study period. Accordingly, daytime average pollutant levels measured on April 30, June 3, September 26, and October 2 are averaged to represent this student's 1-day-lagged individual level of air pollution exposure. Exposure levels for students without any absences during the school year are calculated by averaging ambient air monitoring data on days with no absences during the study period. In total, we derive individual air pollution levels, zi, of six air pollutants and two meteorologic parameters, wi, for 4,697 students with a 1-day lag from the SOAP&HIT data set. "), HighwireSpan(start=28506, length=1506, text="\r\n\r\nThe 1-day-lagged individual air pollution levels classified by absence counts are shown in table 1. Among 4,697 students, 17.5 percent had at least one absence and 82.5 percent had no absences. For the 3,875 students without absences, the individual air pollution levels were 17.2 parts per billion (ppb) for sulfur dioxide, 39.3 ppb for nitrogen oxides, 25.7 ppb for nitrogen dioxide, 74.3 µg/m3 for PM10, and 45.9 ppb for ozone. 
For the 822 students with at least one absence, the individual air pollution levels were 17.8\x9619.9 ppb for sulfur dioxide, 41.2\x9647.1 ppb for nitrogen oxides, 27.5\x9631.8 ppb for nitrogen dioxide, 80.2\x9680.4 µg/m3 for PM10, and 46.1\x9646.2 ppb for ozone. Apparently, absentees' levels of pollution exposure were all greater than nonabsentees' pollution levels. Individual levels of sulfur dioxide, nitrogen oxides, and nitrogen dioxide were also positively correlated with absence frequency. Such results indicate that these three pollutants may have acute effects on illness absence. By contrast, PM10, ozone, and rainfall may have no acute effects on absence, because their individual levels did not show trends with different degrees of absence. "), HighwireSpan(start=30015, length=795, text='\r\n\r\n\r\n\r\n\r\nView this table:\r\n[in this window]\r\n[in a new window]\xa0\r\n\r\nTABLE 1. One-day-lagged individual exposures to air pollutants among 4,697 schoolchildren, by number of absences during the 1994\x961995 school year, Taiwan'), HighwireSpan(start=30813, length=1939, text="\r\n\r\n\r\n\r\n\xa0\r\n\r\n\r\nModel comparisons\r\n\r\nTo illustrate the improvement in statistical power for detecting pollutant effects on the risk of school absence, we make model comparisons among Poisson models built on time domain and subject domain, without and with adjustment for personal factors. All of these three models are also adjusted for weather exposures. The effects of six air pollutants on the risks of illness absence estimated by these three different modeling approaches are summarized in table 2. The equivalence of time-domain and subject-domain modeling is affirmed from the estimated relative risks and 95 percent confidence intervals. These two models cannot detect any significant air pollution effects on absence. 
By contrast, the subject-domain models with adjustment for personal factors (SP) illustrate significant effects on schoolchildren's illness absence by two air pollutants, nitrogen oxides and nitrogen dioxide. The SP models predict that relative risks of illness absence are 1.11 and 1.23 for every 10-ppb increase in acute exposure to nitrogen oxides and nitrogen dioxide, respectively. The SP models also find that relative risks are increased with narrower confidence intervals for the other four air pollutants, although their values are not statistically significant. Such empirical results demonstrate that controlling for personal factors in the SP models contributes significant gains to the models. Detailed effects of personal factors on illness absence are described below. "), HighwireSpan(start=32755, length=817, text="\r\n\r\n\r\n\r\n\r\nView this table:\r\n[in this window]\r\n[in a new window]\xa0\r\n\r\nTABLE 2. Effects of air pollutants on schoolchildren's illness absences, as estimated by a 10-unit increase in pollution levels with a 1-day lag, Taiwan, 1994\x961995*"), HighwireSpan(start=33575, length=1545, text="\r\n\r\n\r\n\r\n\xa0\r\n\r\n\r\nKey subject attributes included in the subject-domain model\r\n\r\nResults from the generalized logit model suggested that school grade was a common factor affecting individual students' absence records at all six schools. Children in low grades had higher absence rates than those in high grades. The presence of family or personal respiratory symptoms/diseases was also an important factor affecting individual students' absence records at all six schools. Having a family history of respiratory diseases increased schoolchildren's absence rates in Taihsi, Keelong, and Jenwu. Illness absence also increased in Keelong, Sanchung, Linyuan, and Toufen when schoolchildren had the respiratory symptom of nasal symptoms, shortness of breath, or cough or the respiratory disease of pneumonia or asthma. 
By contrast, no single housing factor significantly affected students' absence rates in more than two schools. Accordingly, school grade, the child respiratory symptom of day or night cough, the child respiratory disease of wheezing or asthma, the child respiratory disease of pneumonia, and family history of respiratory diseases were five key individual confounding factors which were controlled in our subject-domain model (SP). "), HighwireSpan(start=35123, length=680, text="\r\n\r\nEffects of various factors on illness absence\r\n\r\nThe expected effects of air pollution and other predictors on illness absence, as estimated by the subject-domain model with a 1-day lag, are presented in table 3. Acute exposures to nitrogen oxides and nitrogen dioxide had significant effects on individual students' total absence counts (p < 0.01), and acute exposures to sulfur dioxide had marginal effects (p = 0.08). By contrast, acute exposures to either PM10 or ozone had no significant effects on illness absence. "), HighwireSpan(start=35806, length=813, text="\r\n\r\n\r\n\r\n\r\nView this table:\r\n[in this window]\r\n[in a new window]\xa0\r\n\r\nTABLE 3. Effects of air pollution on schoolchildren's illness absences (relative risk), as estimated by subject-domain models including personal factors, Taiwan, 1994\x961995"), HighwireSpan(start=36622, length=1780, text="\r\n\r\n\r\n\r\n\xa0\r\n\r\n\r\nIn addition to air pollution effects on absence, the subject-domain model also detected other factors associated with schoolchildren's illness absence: temperature, community, grade, the personal symptom of day or night cough, and family history of respiratory diseases. Illness absence increased as the ambient temperature decreased. Apparently, there was also a community effect on absence after the air pollution effects of nitrogen dioxide and nitrogen oxides, weather, and other factors were adjusted for in the subject-domain model. 
Since the data had been adjusted for relatively worse weather conditions, the relative risks of absence for the two northern urban schools in Keelong and Sanchung were slightly lower than those for the rural Taihsi school. Compared with the rural school in Taihsi, illness absences were significantly higher in the schools in Jenwu and Linyuan but significantly lower in the school in Toufen. As table 4 shows, mean daily absence rates were 1.0 percent per thousand in Taihsi, 1.5 percent per thousand in Jenwu, and 1.9 percent per thousand in Linyuan. Overall, schoolchildren in the Linyuan and Jenwu southern petrochemical areas took 1.5\x961.9 times more sick leave than children in the rural Taihsi area. Differences in general environmental conditions, nutritional situations, and social and cultural status among communities are some possible explanations for the community effect on absence. "), HighwireSpan(start=38405, length=783, text='\r\n\r\n\r\n\r\n\r\nView this table:\r\n[in this window]\r\n[in a new window]\xa0\r\n\r\nTABLE 4. Median daily air pollution and weather levels and mean daily absence rates of six schools in Taiwan during the 1994\x961995 school year'), HighwireSpan(start=39191, length=1764, text="\r\n\r\n\r\n\r\n\xa0\r\n\r\n\r\nAs table 4 shows, the yearlong measurements of ambient air quality indicate that air pollution is significantly worse in Linyuan and Jenwu than in Taihsi, while weather conditions are better in Linyuan and Jenwu. Among six air pollutants, the levels of PM10 and ozone in the schools of Jenwu and Linyuan are above current air quality guidelines. Although the concentrations of nitrogen oxides and nitrogen dioxide are still below the air quality guidelines, the air pollution levels are relatively higher in Jenwu, Linyuan, and Toufen in comparison with Taihsi. Relatively higher sulfur dioxide concentrations are also found in the three schools in Linyuan, Jenwu, and Toufen in comparison with the school in Taihsi. 
Therefore, schoolchildren's chronic exposure to relatively higher air pollution levels may result in higher rates of illness absence in Linyuan and Jenwu. However, the community effects on the low absence rate in Toufen, 0.6 percent per thousand, may be attributable to the residents' ethnicity rather than environmental conditions in Toufen. Although Toufen is located in a petrochemical area, its air pollution level is moderate in comparison with Linyuan and Jenwu. By contrast, the Toufen school is in a township populated mostly by persons of Hakka ethnicity, and schoolchildren in Toufen may take fewer sick days than expected because of the Hakka's renowned diligence in attending school. "), HighwireSpan(start=40958, length=2833, text="\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nDISCUSSION\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\nThe subject-domain approach proposed in this paper has the advantage of including subject-related information in the model, which can largely reduce the confounding effects seen in the traditional time-domain model. Therefore, the subject-domain model can significantly increase statistical power for detecting associations between exposures and effects in environmental epidemiologic studies. To illustrate the equivalence of statistical analysis between time-domain and subject-domain approaches with the same environmental data, we used only pollutant, weather, and area characteristic factors in the time-domain model in this paper. The residuals from the fitted time-domain models showed no patterns of day-of-week or other long term trends, except for minor autocorrelation in our data. Our subject-domain approach ignored the time-dependent structure within each subject's absence records. 
We believe that it has little effect on the inference, because a subject's autocorrelated absence records are usually a reflection of personal factors, which are treated as potential confounders and were included in our models. "), HighwireSpan(start=43794, length=1355, text="\r\n\r\nFor most air pollution and mortality studies, we usually find substantial seasonal fluctuations and long term trends in mortality data that cannot be fully explained by either air pollution or meteorology. Such trends and seasonality can be modeled by the nonparametric function fitting method in the time-domain framework. For a study on air pollution and hospital admission, we can also easily solve the day-of-week problem by simply adding terms for dummy variables in the time-domain approach. Therefore, our subject-domain model may suffer a loss of some statistical power in such applications because of its lack of control for seasonality and day-of-week patterns. However, the limitations might be removed by building time-dependent exposure patterns in the subject-domain models. For example, we can redefine a subject's exposure level by using the observed time pattern to weight the environmental data before weighting across the subject's absence records. Another possible solution would be to identify significant trends and patterns in health outcome series first, and remove them before constructing the subject-domain models. "), HighwireSpan(start=45152, length=918, text='\r\n\r\nApparently, using the subject-domain approach to analyze the time x subject matrix of our data set not only detects the associations between response (illness absence) and predictors (air pollution) but also identifies several personal factors influencing illness, which the time-series approach is unable to recognize. Such findings imply that the subject-domain model may be a general solution to the problem of the ecologic fallacy, which is commonly encountered in environmental and social epidemiologic studies. 
One immediate application of this finding would be to use the subject-domain approach to reanalyze data from previous studies on air pollution and health, especially when informative personal data are available in the databases. '), HighwireSpan(start=46073, length=830, text="\r\n\r\nAlthough Navidi's (18) bidirectional case-crossover design is also a subject-domain approach, he considers only cases in the logistic model. By contrast, our subject-domain approach used all subjects, i.e., cases and noncases, in the Poisson model. With less than 17 percent of our study population ever absent, we expect that the statistical gain from our approach is more significant than that from the crossover design, through the inclusion of all noncases' information in our study. Furthermore, the computations in our standard Poisson models are much more straightforward than the estimation procedures in Navidi's models. "), HighwireSpan(start=46906, length=849, text="\r\n\r\nWe argue that the subject-domain model has no greater limitation than the conventional time-series approach in the interpretation of associations between air pollution and absenteeism. In the time-domain approach, we use air pollution levels measured at specific fixed monitoring sites as acute exposure proxies for all schoolchildren on a particular day. In the subject-domain model, we use the average of absence-related air pollution levels measured at fixed monitoring sites to represent an individual's acute exposures. In fact, personal exposures to air pollutants are not actually measured directly for individual schoolchildren in either our subject-domain model or the conventional time-domain approach. "), HighwireSpan(start=47758, length=1213, text='\r\n\r\nWe detected the same air pollution effects of nitrogen dioxide and nitrogen oxides on illness absence in the SOAP&HIT data set when we used daytime as well as daily average concentrations with 0- to 4-day lags as levels of exposure. 
Such findings are consistent with the results of other epidemiologic studies, which also report significant associations between nitrogen oxide-linked pollution and health effects at levels below World Health Organization guidelines, which are 80 ppb for 24-hour average levels and 150 ppb for 1-hour average levels (23\x9626). Accordingly, we recommend that more studies be conducted to investigate the biologic plausibility of an effect of nitrogen oxide tox-icity on the respiratory system. '), HighwireSpan(start=48974, length=3459, text="\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nAPPENDIX\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\nProof of Equivalence between the Time-Domain and Subject-Domain Approaches\r\n\r\nFollowing the notation defined in the text, the daily illness absence counts can also be decomposed as , for some m far less than I. Here, nt(j) is the number of subjects who are absent on day t and have a total count of illness absences j in a school during the study period. Empirically, we have for 2 j m. The set of whole population indices can be decomposed into a union of disjoint subsets, B(j), 0 j m, which consists of subject indices whose total counts of illness absence are exactly j in the study period. The subject's expected mean individual air pollution levels, whose total absence count is j, could be estimated by the average for j > 0. For those with no absences, i.e., j = 0, the expected mean individual air pollution level is estimated by , where k is the number of days having y+t = 0. Therefore, a positive association of xt and y+t implies Z(j) > Z(j - 1) for j 1 and vice versa. "), HighwireSpan(start=52436, length=1283, text='\r\n\r\nIn fact, Z(j) can be further rewritten as , a weighted average of xt for j 1. Note that Z(0) is an equally weighted average of these xt with y+t = 0. 
If we let nt(0) = #B(0) when y+t = 0 and nt(0) = 0 when y+t > 0, we can rewrite Z(0) in the same form, . Therefore, we will show that Z(j) > Z(j - 1) for j 1 when xt and y+t are positively associated and vice versa. That is, we need to show that the weights when xt is large and when xt is relatively small. '), HighwireSpan(start=53722, length=627, text='\r\n\r\nIn the simple case of j = 1, we have either > 0 and nt(0) = 0 for large xt or nt(1) = 0 and = for smaller xt values; the truth of the equivalence statement is therefore obvious. For the case of j 2, we give detailed arguments below. '), HighwireSpan(start=54352, length=1342, text='\r\n\r\nIn practical application, nt(j - 1) tends to be greater than nt(j). Meanwhile, when xt is large and so is the associated y+t, with the assumption of a positive association, then both nt(j - 1) and nt(j) will be greater than zero. We may add one more assumption that there is a q(j) between 0 and 1, such that nt(j) is approximately equal to q(j)nt(j - 1). Let C1 be the set of time indices having both nt(j - 1) and nt(j) greater than zero. Those time indices with nt(j - 1) > 0 and nt(j) = 0 are grouped to C2. This may happen when y+t is not large enough, i.e., when a relatively small xt is observed. Hence, we complete the argument with the following two statements.\r\n '), HighwireSpan(start=55697, length=183, text='\r\n\r\nand\r\n '), HighwireSpan(start=55883, length=160, text='\r\n '), HighwireSpan(start=56046, length=680, text='\r\n\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nACKNOWLEDGMENTS\r\n\r\n\xa0\r\n\r\nThe research described in this article was conducted through the sponsorship of contracts awarded by the National Research Council, the Executive Yuan, Taiwan (NSC-84-2621-p-002-018, NSC-84-2621-p-002-024, NSC-85-2621-p-002-008, and NSC-85-2621-p-002-024). 
'), HighwireSpan(start=56729, length=155, text='\r\n\r\nThe authors acknowledge the technical support of the Taiwan Environmental Protection Agency in the measurement of air pollution. '), HighwireSpan(start=56887, length=843, text='\r\n\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nNOTES\r\n\r\n\xa0\r\n\r\n\r\nReprint requests to Dr. Chang-Chuan Chan, College of Public Health, National Taiwan University, Room 1447, No. 1, 1st sec., Jen-ai Road, Taipei, Taiwan (e-mail: ccchan{at}ha.mc.ntu.edu.tw). '), HighwireSpan(start=57733, length=11230, text='\r\n\r\n\r\n\r\n\r\n \r\n \xa0\xa0\r\nREFERENCES\r\n\r\n\r\nTOP\r\nABSTRACT\r\nINTRODUCTION\r\nMATERIALS AND METHODS\r\nSTATISTICAL METHODS\r\nRESULTS\r\nDISCUSSION\r\nAPPENDIX\r\nREFERENCES\r\n\r\n\xa0\r\n\r\n\r\n\r\n\r\n Walters S, Phupinyokul M, Ayres J. Hospital admission rates for asthma and respiratory disease in the West Midlands: their relationship to air pollution levels. Thorax 1995;50:948\x9654.[Abstract]\r\n\r\n\r\n Ponka A. Asthma and low level air pollution in Helsinki. Arch Environ Health 1991;46:262\x9670.[ISI][Medline]\r\n\r\n\r\n Ponka A, Virtanen M. Asthma and ambient air pollution in Helsinki. J Epidemiol Community Health 1996;50(suppl 1):S59\x9662.\r\n\r\n\r\n Castellsague J, Sunyer J, Saez M, et al. Short-term association between air pollution and emergency room visits for asthma in Barcelona. Thorax 1995;50:1051\x966.[Abstract]\r\n\r\n\r\n Anderson HR, Limb ES, Bland JM, et al. Health effects of an air pollution episode in London, December 1991. Thorax 1995;50:1188\x9693.[Abstract]\r\n\r\n\r\n Schwartz J, Slater D, Larson TV, et al. Particulate air pollution and hospital emergency room visits for asthma in Seattle. Am Rev Respir Dis 1993;147:826\x9631.[ISI][Medline]\r\n\r\n\r\n Bates DV, Baker-Anderson M, Sizto R. Asthma attack periodicity: a study of hospital emergency visits in Vancouver. Environ Res 1990;51:51\x9670.[ISI][Medline]\r\n\r\n\r\n Bates DV, Sizto R. 
Air pollution and hospital admissions in Southern Ontario: the acid summer haze effect. Environ Res 1987;43:317\x9631.[ISI][Medline]\r\n\r\n\r\n Thurston G, Ito K, Hayes CG, et al. Respiratory hospital admissions and summertime haze air pollution in Toronto, Ontario: consideration of the role of acid aerosols. Environ Res 1994;65:271\x9690.[ISI][Medline]\r\n\r\n\r\n Sunyer J, Anto JM, Murillo C, et al. Effects of urban air pollution on emergency room admissions for chronic obstructive pulmonary disease. Am J Epidemiol 1991;134:277\x9686.[Abstract]\r\n\r\n\r\n Romieu I, Lugo MC, Velasco SR, et al. Air pollution and school absenteeism among children in Mexico City. Am J Epidemiol 1992;136:1524\x9631.[Abstract]\r\n\r\n\r\n Moll Van Charante AW, Mulder PG. Effects of smog on absenteeism in forestry workers. Arch Environ Health 1996;51:34\x9641.[ISI][Medline]\r\n\r\n\r\n Ferris BG. Effect of air pollution on school absences and differences in lung function in first and second graders in Berlin, New Hampshire, January 1966 to June 1967. Am Rev Respir Dis 1970;102:591\x96606.[ISI][Medline]\r\n\r\n\r\n Verma MP, Schilling FJ, Becker WH. Epidemiological study of illness absences in relation to air pollution. Arch Environ Health 1969;18:536\x9643.[ISI][Medline]\r\n\r\n\r\n Wayne WS, Wehrle PF. Oxidant air pollution and school absenteeism. Arch Environ Health 1969;19:315\x9622.[ISI][Medline]\r\n\r\n\r\n Ponka A. Absenteeism and respiratory disease among children and adults in Helsinki in relation to low-level air pollution and temperature. Environ Res 1990;52:34\x9646.[ISI][Medline]\r\n\r\n\r\n Maclure M. The case-crossover design: a method for studying transient effects on the risk of acute events. Am J Epidemiol 1991;133:144\x9653.[Abstract]\r\n\r\n\r\n Navidi W. Bidirectional case-crossover designs for exposures with time trends. Biometrics 1998;54:596\x96605.[ISI][Medline]\r\n\r\n\r\n Chen PC, Lai YM, Wang JD, et al. 
Adverse effect of air pollution on respiratory health of primary school children in Taiwan. Environ Health Perspect 1998;106:331\x965.[ISI][Medline]\r\n\r\n\r\n Chan CC, Hwang JS. Site representativeness of urban air monitoring stations. J Air Waste Manag Assoc 1996;46:755\x9660.[ISI]\r\n\r\n\r\n Agresti A. An introduction to categorical data analysis. New York, NY: John Wiley and Sons, Inc, 1996.\r\n\r\n\r\n Agresti A. Categorical data analysis. New York, NY: John Wiley and Sons, Inc, 1994.\r\n\r\n\r\n Schwartz J, Zeger S. Passive smoking, air pollution, and acute respiratory symptoms in a diary study of student nurses. Am Rev Respir Dis 1990;141:62\x967.[ISI][Medline]\r\n\r\n\r\n Lebowitz MD, Collins L, Holberg CJ. Time series analyses of respiratory responses to indoor and outdoor environmental phenomena. Environ Res 1987;43:332\x9641.[ISI][Medline]\r\n\r\n\r\n Rutishauser M, Ackermann U, Braun C, et al. Significant association between outdoor NO2 and respiratory symptoms in preschool children. Lung 1990;168(suppl):347\x9652.[ISI][Medline]\r\n\r\n\r\n Braun-Fahrlander C, Ackermann-Liebrich U, Schwartz J, et al. Air pollution and respiratory symptoms in preschool children. Am Rev Respir Dis 1992;145:42\x967.[ISI][Medline]\r\n\r\n\r\nReceived for publication December 28, 1998. \r\n\r\nAccepted for publication November 29, 1999.'), HighwireSpan(start=68966, length=208, text='\r\n\r\n\t\t\r\n \r\n \r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n \r\n \r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n'), HighwireSpan(start=69177, length=16, text='\r\n'))), 162258: HighwireDoc('16280387', 'toxsci', '\r\nCORRECTION\r\n', (HighwireSpan(start=0, length=148, text='\r\n\r\n\r\nCORRECTION\r\n\r\n\r\n\r\nJames F. Dillman, III and \r\nChristopher S. 
Phillips\r\n'), HighwireSpan(start=151, length=168, text='\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nComparison of Non-Human Primate and Human Whole Blood Tissue Gene Expression Profiles '), HighwireSpan(start=322, length=41, text='\r\n\r\ndoi:10.1093/toxsci/kfi243 '), HighwireSpan(start=366, length=68, text='\r\n\r\nToxicological Sciences 87, 306\x96314, 2005 '), HighwireSpan(start=437, length=719, text='\r\n\r\nIn our paper "Comparison of non-human primate and human whole blood tissue gene expression profiles" which was published ahead of print on June 23, 2005 (doi:10.1093/toxscikfi243) and is printed in the September 2005 issue of the journal (Toxicol. Sci. 84: 306\x96314) we reference the work of Wang et al. 2004. Their paper, entitled "Identification and utilization of inter-species conserved (ISC) probesets on Affymetrix human Genechip platforms for the optimization of the assessment of expression patterns in non human primate (NHP) samples" was published in BMC Bioinformatics 5, 165, 2004. '), HighwireSpan(start=1159, length=382, text="\r\n\r\nAlthough we cite Wang et al. 2004 in our introduction and discussion, we failed to give appropriate attribution to text in our discussion taken from that paper. We sincerely regret our failure to properly cite these authors' work and wish to correct this by attributing the following text in our discussion to them: "), HighwireSpan(start=1544, length=223, text='\r\n\r\nA single gene does not necessarily generate a single transcript. Splicing variants are very common in the human, and humans and NHPs may use different splicing strategies in some genes. '), HighwireSpan(start=1770, length=893, text='\r\n\r\nChismar and colleagues used the U95Av2 human genechip and compared the expression patterns of humans with rhesus. 
They concluded that the percentage of detected genes (genes called \x91present\x92) in the rhesus brain is lower than that of human brain, and that this is especially true for genes with lower signal intensity. Caceres and colleagues used the HG-U95Av2 human genechip to identify upregulated genes in the human cortex compared with those of the NHPs. Since sequence divergence could lead to an underestimation of expression levels in NHPs, they excluded 4572 probes that exhibited different hybridization behavior between two sets of samples in order to reduce false positives. However, this analysis is solely based on probe intensities. '), HighwireSpan(start=2666, length=12162, text="\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\t\r\n \r\n \r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n \r\n \r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t \r\n\t\t\r\n\t\t \r\n\t\t\t\r\n\t\r\n\t\t\r\n\t\t\t\r\n\t\t\t\r\n\t\r\n\r\n\r\n\t\r\n\r\n\r\n\t\r\n\t\t\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\t\t\r\n\t\t\t\r\n\t\t\t\r\nThis Article\r\n\t\r\n\t\t\t\t\r\n\t\t\r\n\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\r\n\t\t\t\t\t\r\n\t\t\t\t\tExtract\r\n\t\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\r\n\t\t\t\r\n\t\t\r\n\t\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\t\r\n\t\r\n\t\r\n\t\t\r\n\t\r\n\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\t\t\r\n\t\t\r\n\t\r\n\r\n\t\r\n\r\n\t\r\n\t \r\n\r\n\t\r\n\t \r\n\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\t\t\r\n\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\t\t\r\n\t\tFull Text (PDF)\r\n\t\t\t\t\r\n\t\t\t\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t\t\r\n\r\n\t\r\n\t\t\r\n\r\n\t\r\n\t\t\r\n\t\t\r\n\r\n\t\t\t\r\n\r\n\r\n\t\r\n\r\n\t\t\t\r\n\r\n\t\t\t\r\n\t\t\r\n\t\r\n\r\n\t\t\t\r\n\t 
\r\n\t\r\n\r\n\t\t\t\r\n\r\n\t\t\t\r\n\r\n\t\t\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\r\n\r\n\t \r\n\r\n\t\r\n\r\n\t\r\n\t\r\n\t\t \r\n \r\n\r\n\t\r\n\t\r\n\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\t\r\n\t\r\n\t\t\r\n\t\t\t\r\n\t\t\r\n\t\t\t\t\r\n\t\t\t\tAlert me when this article is cited\r\n\t\t\t\r\n\t\r\n\r\n\t\t\r\n\t\t\r\n\r\n\t\t\r\n\t\t\r\n\r\n\t\t\r\n\t\t\r\n\r\n\t\t\t\r\n\t\t\r\n\t\t\t\t\r\n\t\t\t\tAlert me if a correction is posted\r\n\t\t\t\r\n\t\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\t\t\r\n\t\t\r\n\t\r\n\r\n\t\t\t\r\n\t\t\t\r\nServices\r\n\t\r\n\t\t\t\r\n\t\t\r\n\t\t\t\t\r\n\t\t\t\tEmail this article to a friend\r\n\t\t\t\r\n\t\t\t\r\n\t\t\t\r\n\r\n\r\n\t\r\n\r\n\t\r\n\r\n\t\t\r\n\t\t\r\n\r\n\r\n\r\n\t\t\t\r\n\t\t\t\r\n\t\t\t\r\n\t\t\r\n\t\r\n\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\tSimilar articles in this journal\r\n\t\t\r\n\t\t\r\n\r\n\t\t\r\n\t\t\r\n\r\n\t\t\t\r\n\t\t\r\n\t\t\t\t\r\n\t\t\t\tSimilar articles in PubMed\r\n\t\t\t\r\n\t\t\r\n\t\r\n\r\n\r\n\t\t\r\n\r\n\r\n\r\n\t\t\t\r\n\t\t\r\n\t\t\t\t\r\n\t\t\t\tAlert me to new issues of the journal\r\n\t\t\t\r\n\t\t\r\n\r\n\t\t\r\n\t\t\r\n\t\t\r\n\r\n\t\t\t\r\n\t\t\r\n\t\t\t\t\r\n\t\t\t\tAdd to My Personal Archive\r\n\t\t\t\r\n\t\t\r\n\t\t\r\n\t\r\n\r\n\t\t\r\n\t\t\r\n\t\t\r\n\r\n\t\t\t\r\n\t\t\r\n\t\t\t\r\n\t\t\tDownload to citation manager\r\n\t\t\t\r\n\t\t\r\n\t\t\r\n\t\r\n\r\n\r\n\t\r\n\t\t\r\n\t\r\n\r\n\t\r\n\r\n\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\t\r\n\r\n\t\t\r\n\r\n\t\r\n\t\r\n\t\r\n\t\t\r\n\r\n\t\t\t\r\n\t\t\r\n\t\t\t\tDisclaimer\r\n\t\t\t\r\n\t\t\r\n\r\n\t\r\n\t\t\r\n\r\n\t\t\r\n\t\t\r\n\t\t\t\r\n\t\t\tRequest Permissions\r\n\t\t\r\n\t\t\r\n\t\r\n\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t\t\r\n\t\t\t\r\n\t\t\t\t\r\n\t\t\t\t\r\nGoogle Scholar\r\n\t\r\n\t\t\t\t\t\t\r\n\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tArticles by Dillman, J. 
F.\r\n\t\t\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\r\n\t\t\t\t\r\n\t\t\t\t\r\n\r\n\t\t\t\t\t\t\r\n\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tArticles by Phillips, C. S.\r\n\t\t\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\r\n\t\t\r\n\t\t\r\n\t\r\n\t\r\n\t\r\n\t\t\r\n\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t\t\r\nPubMed\r\n\t\r\n\t\t\t\r\n\t\t\r\n\t\t\t\tPubMed Citation\r\n\t\t\t\r\n\t\t\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\t\r\n\t\t\t\r\n\t\t\t\t\r\n\r\n\t\t\t\t\t\t\r\n\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tArticles by Dillman, J. F., III\r\n\t\t\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\r\n\t\t\t\t\r\n\r\n\t\t\t\t\t\t\r\n\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tArticles by Phillips, C. S.\r\n\t\t\t\t\t\t\r\n\t\t\t\t\r\n\t\t\t\r\n\t\t\r\n\r\n\t\t\r\n\t\r\n\r\n\t\r\n\t\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\t\t\r\n\r\n\t\r\n\r\n\t\t\r\n\t\t\t\r\n\t\t\r\n\t \r\n\t\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\r\n\r\n\r\n\t\r\n \r\n\r\n\t\t\t\t\t\r\n\t\t\t\t\t\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\r\n\r\n\r\nOnline ISSN 1096-0929 - Print ISSN 1096-6080\r\n\r\n\r\nCopyright © 2005 Society of Toxicology\r\n\r\n\r\n\r\n\r\nOxford Journals\r\nOxford University Press\r\n\r\n\r\n\r\nSite Map\r\n\r\nPrivacy Policy\r\nFrequently Asked Questions\r\n\r\n\r\n\r\n\r\n\r\nOther Oxford University Press sites:\r\n\r\nOxford University Press\r\nAmerican National Biography\r\nBooksellers' Information Service\r\nChildren's Fiction and Poetry\r\nChildren's Reference\r\nCorporate & Special Sales\r\nDictionaries\r\nDictionary of National Biography\r\nDigital Reference\r\nEnglish Language Teaching\r\nHigher Education Textbooks\r\nHumanities\r\nInternational Education Unit\r\nLaw\r\nMedicine\r\nMusic\r\nOnline Products\r\nOxford English Dictionary\r\nReference\r\nRights and Permissions\r\nScience\r\nSchool Books\r\nSocial Sciences\r\nVery Short Introductions\r\nWorld's 
Classics\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n"))), })

    def test_highwire_queries(self):
        # Spot-check the TREC Genomics 2006 (topics 160-187) and 2007
        # (topics 200-235) query sets: total count plus first / middle /
        # last entries of each.
        self._test_queries('highwire/trec-genomics-2006', count=28, items={
            0: GenericQuery('160', 'What is the role of PrnP in mad cow disease?'),
            1: GenericQuery('161', 'What is the role of IDE in Alzheimer’s disease'), # testing windows encoding here with curly apos
            9: GenericQuery('169', 'How does APC (adenomatous polyposis coli) protein affect actin assembly'),
            27: GenericQuery('187', 'How do mutations in familial hemiplegic migraine type 1 (FHM1) gene affect calcium ion influx in hippocampal neurons?'),
        })
        self._test_queries('highwire/trec-genomics-2007', count=36, items={
            0: GenericQuery('200', 'What serum proteins change expression in association with high disease activity in lupus?'),
            9: GenericQuery('209', 'What biological substances have been used to measure toxicity in response to etidronate?'),
            35: GenericQuery('235', 'Which genes involved in NFkappaB signaling regulate iNOS?'),
        })

    def test_highwire_qrels(self):
        # NOTE(review): HighwireQrel looks like (query_id, doc_id, start,
        # length, relevance) -- the two middle ints mirror the
        # HighwireSpan(start=, length=) fixtures above -- but confirm the
        # field names against ir_datasets.datasets.highwire before relying
        # on them.
        self._test_qrels('highwire/trec-genomics-2006', count=27999, items={
            0: HighwireQrel('160', '11152658', 42431, 2100, 0),
            9: HighwireQrel('160', '12519913', 61139, 1786, 0),
            27998: HighwireQrel('187', '16103228', 8974, 3974, 0),
        })
        self._test_qrels('highwire/trec-genomics-2007', count=35996, items={
            0: HighwireQrel('200', '7493960', 39964, 2283, 0),
            9: HighwireQrel('200', '8996242', 5373, 1092, 0),
            35995: HighwireQrel('235', '16339966', 117520, 519, 0),
        })


if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/integration/istella22.py
================================================
import re
import unittest
import ir_datasets
from ir_datasets.formats import GenericQuery, TrecQrel
from ir_datasets.datasets.istella22 import Istella22Doc
from .base import DatasetIntegrationTest


class TestIstella22(DatasetIntegrationTest):
    def test_docs(self):
        self._test_docs('istella22', count=8421456, items={
            0:
Istella22Doc('1990010000000002', 'Play Online Roulette 2016 - #1 Best Online Roulette Casinos!', 'http://www.onlineroulette.org/', re.compile('^The benefits of online roulette often outweigh live roulette in a number of different ways\\. There’s .{6411}Play Now Casino Rating Payout 4\\.8 /5 98\\.29% 4\\.7 /5 97\\.19% 4\\.0 /5 97\\.29% 3\\.9 /5 95\\.05% Bonus 1 2 3 4 $', flags=48), re.compile('^\ufeff Play Online Roulette 2016 \\- \\#1 Best Online Roulette Casinos! Responsive, accurate customer support.{1029}i Qatar Deutschland Nederland French España Italia Português Canadien Home Real Money About Sitemap $', flags=48), 'en', 92), 9: Istella22Doc('1990010000000016', 'HowOpenSource', 'http://www.howopensource.com/', re.compile("^This website uses cookies to improve your experience\\. We'll assume you're ok with this, but you can .{4532}t have Firefox Add\\-ons How To Add User To Ubuntu Simple backup to DropBox Select Page Page 1 of 3 1 $", flags=48), re.compile('^HowOpenSource How To Install Linux Error Python Today’s Wisdom Chrome Firefox WordPress Plugins Some.{13039}3\\.04, Ubuntu 12\\.10 / 12\\. 04 / 11\\.10 / 11\\.04 using\\.\\.\\. 
2 3 » Home About Privacy Cookie Policy Contact $', flags=48), 'en', 97), 8421455: Istella22Doc('1990159902674778', 'Comune di Nuragus (CA) - Italia: Informazioni', 'http://www.comuni-italiani.it/092/115/', re.compile("^Per segnalare aggiunte o correzioni da effettuare sulla scheda del comune di Nuragus, inviaci un'ema.{7}a: questo non è l'indirizzo email del comune\\) Comune di Nuragus Informativa Privacy \\- Note sui Dati $", flags=48), re.compile('^Comune di Nuragus \\(CA\\) \\- Italia: Informazioni Comune di Nuragus \\(Provincia di Cagliari, Regione Sard.{1619}Prometheo Comuni Provincia di Cagliari: Comune di Nurallao Comune di Muravera Lista Nuragus, Italy $', flags=48), 'it', 95), }) def test_queries(self): self._test_queries('istella22/test', count=2198, items={ 0: GenericQuery('263', 'calcio mercato'), 9: GenericQuery('1008', 'milano finanza'), 2197: GenericQuery('90118', 'abbigliamento per parrucchieri'), }) self._test_queries('istella22/test/fold1', count=440, items={ 0: GenericQuery('480', 'giallo zafferano'), 9: GenericQuery('4797', 'apple store'), 439: GenericQuery('89903', 'agevolazione pagamenti tasse ai disabili'), }) self._test_queries('istella22/test/fold2', count=440, items={ 0: GenericQuery('594', 'mediaset video'), 9: GenericQuery('5599', 'informazione scorretta'), 439: GenericQuery('89898', 'aggiornare navigon'), }) self._test_queries('istella22/test/fold3', count=440, items={ 0: GenericQuery('263', 'calcio mercato'), 9: GenericQuery('6516', 'attrezzature di saldatura'), 439: GenericQuery('90118', 'abbigliamento per parrucchieri'), }) self._test_queries('istella22/test/fold4', count=439, items={ 0: GenericQuery('788', 'gianna nannini'), 9: GenericQuery('5166', 'ultime notizie'), 438: GenericQuery('88988', 'avvelenamento da acqua'), }) self._test_queries('istella22/test/fold5', count=439, items={ 0: GenericQuery('326', 'cotto e mangiato'), 9: GenericQuery('3079', 'quote snai'), 438: GenericQuery('89691', 'allarmi fai da te'), }) def 
test_qrels(self): self._test_qrels('istella22/test', count=10693, items={ 0: TrecQrel('263', '1990028700044315', 3, '0'), 9: TrecQrel('326', '1990066502519695', 3, '0'), 10692: TrecQrel('90118', '1990158300000425', 3, '0'), }) self._test_qrels('istella22/test/fold1', count=2164, items={ 0: TrecQrel('480', '1990024100001149', 4, '0'), 9: TrecQrel('2519', '1990060100096961', 1, '0'), 2163: TrecQrel('89903', '1990096101187019', 1, '0'), }) self._test_qrels('istella22/test/fold2', count=2140, items={ 0: TrecQrel('594', '1990081700000321', 3, '0'), 9: TrecQrel('680', '1990046900019247', 3, '0'), 2139: TrecQrel('89898', '1990130400118331', 3, '0'), }) self._test_qrels('istella22/test/fold3', count=2197, items={ 0: TrecQrel('263', '1990028700044315', 3, '0'), 9: TrecQrel('1498', '1990035702542094', 3, '0'), 2196: TrecQrel('90118', '1990158300000425', 3, '0'), }) self._test_qrels('istella22/test/fold4', count=2098, items={ 0: TrecQrel('788', '1990058802543970', 1, '0'), 9: TrecQrel('1421', '1990110302541381', 1, '0'), 2097: TrecQrel('88988', '1990017202541615', 1, '0'), }) self._test_qrels('istella22/test/fold5', count=2094, items={ 0: TrecQrel('326', '1990022900152987', 3, '0'), 9: TrecQrel('381', '1990123900000072', 3, '0'), 2093: TrecQrel('89691', '1990153102541850', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/kilt.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.kilt import KiltDoc, KiltDocAnchor from ir_datasets.datasets.codec import CodecQuery from ir_datasets.formats import TrecQrel from .base import DatasetIntegrationTest class TestKilt(DatasetIntegrationTest): def test_docs(self): self._test_docs('kilt', count=5903530, items={ 0: KiltDoc('290', 'A', re.compile('^A\nA \\(named , plural "As", "A\'s", "a"s, "a\'s" or "aes"\\) is the first letter and the first vowel of th.{9615}ably derives from old Italic A\n\\- 
: Gothic letter aza/asks\nExternal links\\.\n\\- History of the Alphabet\n$', flags=48), ('A\n', 'A (named , plural "As", "A\'s", "a"s, "a\'s" or "aes") is the first letter and the first vowel of the modern English alphabet and the ISO basic Latin alphabet. It is similar to the Ancient Greek letter alpha, from which it derives. The uppercase version consists of the two slanting sides of a triangle, crossed in the middle by a horizontal bar. The lowercase version can be written in two forms: the double-storey a and single-storey ɑ. The latter is commonly used in handwriting and fonts based on it, especially fonts intended to be read by children, and is also found in italic type.\n', 'In the English grammar, "a", and its variant "an", is an indefinite article.\n', 'Section::::History.\n', 'The earliest certain ancestor of "A" is aleph (also written \'aleph), the first letter of the Phoenician alphabet, which consisted entirely of consonants (for that reason, it is also called an abjad to distinguish it from a true alphabet). In turn, the ancestor of aleph may have been a pictogram of an ox head in proto-Sinaitic script influenced by Egyptian hieroglyphs, styled as a triangular head with two horns extended.\n', 'By 1600 BC, the Phoenician alphabet letter had a linear form that served as the base for some later forms. Its name is thought to have corresponded closely to the Paleo-Hebrew or Arabic aleph.\n', 'When the ancient Greeks adopted the alphabet, they had no use for a letter to represent the glottal stop—the consonant sound that the letter denoted in Phoenician and other Semitic languages, and that was the first phoneme of the Phoenician pronunciation of the letter—so they used their version of the sign to represent the vowel , and called it by the similar name of alpha. 
In the earliest Greek inscriptions after the Greek Dark Ages, dating to the 8th century BC, the letter rests upon its side, but in the Greek alphabet of later times it generally resembles the modern capital letter, although many local varieties can be distinguished by the shortening of one leg, or by the angle at which the cross line is set.\n', 'The Etruscans brought the Greek alphabet to their civilization in the Italian Peninsula and left the letter unchanged. The Romans later adopted the Etruscan alphabet to write the Latin language, and the resulting letter was preserved in the Latin alphabet that would come to be used to write many languages, including English.\n', 'Section::::History.:Typographic variants.\n', 'During Roman times, there were many variant forms of the letter "A". First was the monumental or lapidary style, which was used when inscribing on stone or other "permanent" media. There was also a cursive style used for everyday or utilitarian writing, which was done on more perishable surfaces. Due to the "perishable" nature of these surfaces, there are not as many examples of this style as there are of the monumental, but there are still many surviving examples of different types of cursive, such as majuscule cursive, minuscule cursive, and semicursive minuscule. Variants also existed that were intermediate between the monumental and cursive styles. The known variants include the early semi-uncial, the uncial, and the later semi-uncial.\n', 'At the end of the Roman Empire (5th century AD), several variants of the cursive minuscule developed through Western Europe. Among these were the semicursive minuscule of Italy, the Merovingian script in France, the Visigothic script in Spain, and the Insular or Anglo-Irish semi-uncial or Anglo-Saxon majuscule of Great Britain. By the 9th century, the Caroline script, which was very similar to the present-day form, was the principal form used in book-making, before the advent of the printing press. 
This form was derived through a combining of prior forms.\n', '15th-century Italy saw the formation of the two main variants that are known today. These variants, the "Italic" and "Roman" forms, were derived from the Caroline Script version. The Italic form, also called "script a," is used in most current handwriting and consists of a circle and vertical stroke. This slowly developed from the fifth-century form resembling the Greek letter tau in the hands of medieval Irish and English writers. The Roman form is used in most printed material; it consists of a small loop with an arc over it ("a"). Both derive from the majuscule (capital) form. In Greek handwriting, it was common to join the left leg and horizontal stroke into a single loop, as demonstrated by the uncial version shown. Many fonts then made the right leg vertical. In some of these, the serif that began the right leg stroke developed into an arc, resulting in the printed form, while in others it was dropped, resulting in the modern handwritten form.\n', 'Italic type is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type). 
There are some other cases aside from italic type where "script a" ("ɑ"), also called Latin alpha, is used in contrast with Latin "a" (such as in the International Phonetic Alphabet).\n', 'Section::::Use in writing systems.\n', 'Section::::Use in writing systems.:English.\n', 'In modern English orthography, the letter represents at least seven different vowel sounds:\n', 'BULLET::::- the near-open front unrounded vowel as in "pad";\n', 'BULLET::::- the open back unrounded vowel as in "father", which is closer to its original Latin and Greek sound;\n', 'BULLET::::- the diphthong as in "ace" and "major" (usually when is followed by one, or occasionally two, consonants and then another vowel letter) – this results from Middle English lengthening followed by the Great Vowel Shift;\n', 'BULLET::::- the modified form of the above sound that occurs before, as in "square" and "Mary";\n', 'BULLET::::- the rounded vowel of "water";\n', 'BULLET::::- the shorter rounded vowel (not present in General American) in "was" and "what";\n', 'BULLET::::- a schwa, in many unstressed syllables, as in "about", "comma", "solar".\n', 'The double sequence does not occur in native English words, but is found in some words derived from foreign languages such as "Aaron" and "aardvark". However, occurs in many common digraphs, all with their own sound or sounds, particularly , , , , and .\n', 'Section::::Use in writing systems.:Other languages.\n', 'In most languages that use the Latin alphabet, denotes an open unrounded vowel, such as , , or . 
An exception is Saanich, in which (and the glyph Á) stands for a close-mid front unrounded vowel .\n', 'Section::::Use in writing systems.:Other systems.\n', 'In phonetic and phonemic notation:\n', 'BULLET::::- in the International Phonetic Alphabet, is used for the open front unrounded vowel, is used for the open central unrounded vowel, and is used for the open back unrounded vowel.\n', 'BULLET::::- in X-SAMPA, is used for the open front unrounded vowel and is used for the open back unrounded vowel.\n', 'Section::::Other uses.\n', 'In algebra, the letter "a" along with other letters at the beginning of the alphabet is used to represent known quantities, whereas the letters at the end of the alphabet ("x", "y", "z") are used to denote unknown quantities.\n', 'In geometry, capital A, B, C etc. are used to denote segments, lines, rays, etc. A capital A is also typically used as one of the letters to represent an angle in a triangle, the lowercase a representing the side opposite angle A.\n', '"A" is often used to denote something or someone of a better or more prestigious quality or status: A-, A or A+, the best grade that can be assigned by teachers for students\' schoolwork; "A grade" for clean restaurants; A-list celebrities, etc. 
Such associations can have a motivating effect, as exposure to the letter A has been found to improve performance, when compared with other letters.\n', '"A" is used as a prefix on some words, such as asymmetry, to mean "not" or "without" (from Greek).\n', 'In English grammar, "a", and its variant "an", is an indefinite article.\n', 'Finally, the letter A is used to denote size, as in a narrow size shoe, or a small cup size in a brassiere.\n', 'Section::::Related characters.\n', 'Section::::Related characters.:Descendants and related characters in the Latin alphabet.\n', 'BULLET::::- Æ æ : Latin "AE" ligature\n', 'BULLET::::- A with diacritics: Å å Ǻ ǻ Ḁ ḁ ẚ Ă ă Ặ ặ Ắ ắ Ằ ằ Ẳ ẳ Ẵ ẵ Ȃ ȃ Â â Ậ ậ Ấ ấ Ầ ầ Ẫ ẫ Ẩ ẩ Ả ả Ǎ ǎ Ⱥ ⱥ Ȧ ȧ Ǡ ǡ Ạ ạ Ä ä Ǟ ǟ À à Ȁ ȁ Á á Ā ā Ā̀ ā̀ Ã ã Ą ą Ą́ ą́ Ą̃ ą̃ A̲ a̲ ᶏ\n', 'BULLET::::- Phonetic alphabet symbols related to A (the International Phonetic Alphabet only uses lowercase, but uppercase forms are used in some other writing systems):\n', 'BULLET::::- Ɑ ɑ : Latin letter alpha / script A, which represents an open back unrounded vowel in the IPA\n', 'BULLET::::- ᶐ : Latin small letter alpha with retroflex hook\n', 'BULLET::::- Ɐ ɐ : Turned A, which represents a near-open central vowel in the IPA\n', 'BULLET::::- Λ ʌ : Turned V (also called a wedge, a caret, or a hat), which represents an open-mid back unrounded vowel in the IPA\n', 'BULLET::::- Ɒ ɒ : Turned alpha / script A, which represents an open back rounded vowel in the IPA\n', 'BULLET::::- ᶛ : Modifier letter small turned alpha\n', 'BULLET::::- ᴀ : Small capital A, an obsolete or non-standard symbol in the International Phonetic Alphabet used to represent various sounds (mainly open vowels)\n', 'BULLET::::- ᴬ ᵃ ᵄ : Modifier letters are used in the Uralic Phonetic Alphabet (UPA)\n', 'BULLET::::- ₐ : Subscript small a is used in Indo-European studies\n', 'BULLET::::- ꬱ : Small letter a reversed-schwa is used in the Teuthonista phonetic transcription system\n', 'BULLET::::- Ꞻ ꞻ : Glottal 
A, used in the transliteration of Ugaritic\n', 'Section::::Related characters.:Derived signs, symbols and abbreviations.\n', 'BULLET::::- ª : an ordinal indicator\n', 'BULLET::::- Å : Ångström sign\n', 'BULLET::::- ∀ : a turned capital letter A, used in predicate logic to specify universal quantification ("for all")\n', 'BULLET::::- @ : At sign\n', 'BULLET::::- ₳ : Argentine austral\n', 'Section::::Related characters.:Ancestors and siblings in other alphabets.\n', 'BULLET::::- 𐤀 : Semitic letter Aleph, from which the following symbols originally derive\n', 'BULLET::::- Α α : Greek letter Alpha, from which the following letters derive\n', 'BULLET::::- А а : Cyrillic letter A\n', 'BULLET::::- : Coptic letter Alpha\n', 'BULLET::::- 𐌀 : Old Italic A, which is the ancestor of modern Latin A\n', 'BULLET::::- : Runic letter ansuz, which probably derives from old Italic A\n', 'BULLET::::- : Gothic letter aza/asks\n', 'Section::::External links.\n', 'BULLET::::- History of the Alphabet\n'), (KiltDocAnchor(text='named', href='English%20alphabet%23Letter%20names', paragraph_id=1, start=3, end=8), KiltDocAnchor(text='letter', href='Letter%20%28alphabet%29', paragraph_id=1, start=66, end=72), KiltDocAnchor(text='vowel', href='vowel', paragraph_id=1, start=87, end=92), KiltDocAnchor(text='modern English alphabet', href='English%20alphabet', paragraph_id=1, start=100, end=123), KiltDocAnchor(text='ISO basic Latin alphabet', href='ISO%20basic%20Latin%20alphabet', paragraph_id=1, start=132, end=156), KiltDocAnchor(text='Ancient Greek letter', href='Greek_alphabet%23History', paragraph_id=1, start=179, end=199), KiltDocAnchor(text='alpha', href='alpha', paragraph_id=1, start=200, end=205), KiltDocAnchor(text='italic type', href='italic%20type', paragraph_id=1, start=574, end=585), KiltDocAnchor(text='indefinite article', href='Article%20%28grammar%29%23Indefinite%20article', paragraph_id=2, start=57, end=75), KiltDocAnchor(text='aleph', href='aleph', paragraph_id=4, start=40, 
end=45), KiltDocAnchor(text='Phoenician alphabet', href='Phoenician%20alphabet', paragraph_id=4, start=93, end=112), KiltDocAnchor(text='consonant', href='consonant', paragraph_id=4, start=142, end=151), KiltDocAnchor(text='abjad', href='abjad', paragraph_id=4, start=192, end=197), KiltDocAnchor(text='alphabet', href='alphabet', paragraph_id=4, start=228, end=236), KiltDocAnchor(text='pictogram', href='pictogram', paragraph_id=4, start=286, end=295), KiltDocAnchor(text='proto-Sinaitic script', href='proto-Sinaitic%20script', paragraph_id=4, start=313, end=334), KiltDocAnchor(text='Egyptian hieroglyphs', href='Egyptian%20hieroglyphs', paragraph_id=4, start=349, end=369), KiltDocAnchor(text='Paleo-Hebrew', href='Paleo-Hebrew%20alphabet', paragraph_id=5, start=163, end=175), KiltDocAnchor(text='Arabic', href='Arabic%20script', paragraph_id=5, start=179, end=185), KiltDocAnchor(text='aleph', href='aleph', paragraph_id=5, start=186, end=191), KiltDocAnchor(text='ancient Greeks', href='Ancient%20Greece', paragraph_id=6, start=9, end=23), KiltDocAnchor(text='glottal stop', href='glottal%20stop', paragraph_id=6, start=92, end=104), KiltDocAnchor(text='Phoenician', href='Phoenician%20language', paragraph_id=6, start=152, end=162), KiltDocAnchor(text='Semitic languages', href='Semitic%20languages', paragraph_id=6, start=173, end=190), KiltDocAnchor(text='phoneme', href='phoneme', paragraph_id=6, start=215, end=222), KiltDocAnchor(text='alpha', href='alpha', paragraph_id=6, start=370, end=375), KiltDocAnchor(text='Greek Dark Ages', href='Greek%20Dark%20Ages', paragraph_id=6, start=422, end=437), KiltDocAnchor(text='Greek alphabet', href='Greek%20alphabet', paragraph_id=6, start=512, end=526), KiltDocAnchor(text='Etruscans', href='Etruscan%20civilization', paragraph_id=7, start=4, end=13), KiltDocAnchor(text='Italian Peninsula', href='Italian%20Peninsula', paragraph_id=7, start=70, end=87), KiltDocAnchor(text='Etruscan alphabet', href='Old%20Italic%20script', paragraph_id=7, 
start=148, end=165), KiltDocAnchor(text='Latin language', href='Latin', paragraph_id=7, start=179, end=193), KiltDocAnchor(text='Latin alphabet', href='Latin%20script', paragraph_id=7, start=241, end=255), KiltDocAnchor(text='cursive', href='cursive', paragraph_id=9, start=198, end=205), KiltDocAnchor(text='majuscule', href='letter%20case', paragraph_id=9, start=508, end=517), KiltDocAnchor(text='minuscule', href='letter%20case', paragraph_id=9, start=527, end=536), KiltDocAnchor(text='semi-uncial', href='Uncial%20script%23Half-uncial', paragraph_id=9, start=698, end=709), KiltDocAnchor(text='Roman Empire', href='Roman%20Empire', paragraph_id=10, start=18, end=30), KiltDocAnchor(text='Italy', href='Italy', paragraph_id=10, start=171, end=176), KiltDocAnchor(text='Merovingian script', href='Merovingian%20script', paragraph_id=10, start=182, end=200), KiltDocAnchor(text='Visigothic script', href='Visigothic%20script', paragraph_id=10, start=216, end=233), KiltDocAnchor(text='Insular', href='Insular%20script', paragraph_id=10, start=252, end=259), KiltDocAnchor(text='Caroline script', href='Carolingian%20minuscule', paragraph_id=10, start=354, end=369), KiltDocAnchor(text='handwriting', href='handwriting', paragraph_id=11, start=244, end=255), KiltDocAnchor(text='tau', href='tau', paragraph_id=11, start=380, end=383), KiltDocAnchor(text='serif', href='serif', paragraph_id=11, start=798, end=803), KiltDocAnchor(text='Italic type', href='Italic%20type', paragraph_id=12, start=0, end=11), KiltDocAnchor(text='Latin alpha', href='Latin%20alpha', paragraph_id=12, start=219, end=230), KiltDocAnchor(text='International Phonetic Alphabet', href='International%20Phonetic%20Alphabet', paragraph_id=12, start=283, end=314), KiltDocAnchor(text='English orthography', href='English%20orthography', paragraph_id=15, start=10, end=29), KiltDocAnchor(text='near-open front unrounded vowel', href='near-open%20front%20unrounded%20vowel', paragraph_id=16, start=16, end=47), 
KiltDocAnchor(text='open back unrounded vowel', href='open%20back%20unrounded%20vowel', paragraph_id=17, start=16, end=41), KiltDocAnchor(text='diphthong', href='diphthong', paragraph_id=18, start=16, end=25), KiltDocAnchor(text='Middle English lengthening', href='Middle%20English%20lengthening', paragraph_id=18, start=167, end=193), KiltDocAnchor(text='Great Vowel Shift', href='Great%20Vowel%20Shift', paragraph_id=18, start=210, end=227), KiltDocAnchor(text='before', href='English-language%20vowel%20changes%20before%20historic%20/r/', paragraph_id=19, start=61, end=67), KiltDocAnchor(text='General American', href='General%20American', paragraph_id=21, start=54, end=70), KiltDocAnchor(text='schwa', href='schwa', paragraph_id=22, start=14, end=19), KiltDocAnchor(text='aardvark', href='aardvark', paragraph_id=23, start=139, end=147), KiltDocAnchor(text='many common digraphs', href='List%20of%20Latin-script%20digraphs', paragraph_id=23, start=169, end=189), KiltDocAnchor(text='Saanich', href='Saanich%20dialect', paragraph_id=25, start=113, end=120), KiltDocAnchor(text='Á', href='%C3%81', paragraph_id=25, start=146, end=147), KiltDocAnchor(text='close-mid front unrounded vowel', href='close-mid%20front%20unrounded%20vowel', paragraph_id=25, start=162, end=193), KiltDocAnchor(text='International Phonetic Alphabet', href='International%20Phonetic%20Alphabet', paragraph_id=28, start=19, end=50), KiltDocAnchor(text='open front unrounded vowel', href='open%20front%20unrounded%20vowel', paragraph_id=28, start=68, end=94), KiltDocAnchor(text='open central unrounded vowel', href='open%20central%20unrounded%20vowel', paragraph_id=28, start=112, end=140), KiltDocAnchor(text='open back unrounded vowel', href='open%20back%20unrounded%20vowel', paragraph_id=28, start=162, end=187), KiltDocAnchor(text='X-SAMPA', href='X-SAMPA', paragraph_id=29, start=15, end=22), KiltDocAnchor(text='open front unrounded vowel', href='open%20front%20unrounded%20vowel', paragraph_id=29, start=40, 
end=66), KiltDocAnchor(text='open back unrounded vowel', href='open%20back%20unrounded%20vowel', paragraph_id=29, start=87, end=112), KiltDocAnchor(text='algebra', href='algebra', paragraph_id=31, start=3, end=10), KiltDocAnchor(text='geometry', href='geometry', paragraph_id=32, start=3, end=11), KiltDocAnchor(text='segment', href='Line%20segment', paragraph_id=32, start=53, end=60), KiltDocAnchor(text='line', href='line%20%28geometry%29', paragraph_id=32, start=63, end=67), KiltDocAnchor(text='rays', href='Line%20%28geometry%29%23Ray', paragraph_id=32, start=70, end=74), KiltDocAnchor(text='triangle', href='triangle', paragraph_id=32, start=165, end=173), KiltDocAnchor(text='A-list', href='A-list', paragraph_id=33, start=220, end=226), KiltDocAnchor(text='motivating', href='motivation', paragraph_id=33, start=274, end=284), KiltDocAnchor(text='asymmetry', href='asymmetry', paragraph_id=34, start=47, end=56), KiltDocAnchor(text='indefinite article', href='Article%20%28grammar%29%23Indefinite%20article', paragraph_id=35, start=53, end=71), KiltDocAnchor(text='brassiere', href='brassiere', paragraph_id=36, start=97, end=106), KiltDocAnchor(text='Latin "AE"', href='%C3%86', paragraph_id=39, start=18, end=28), KiltDocAnchor(text='diacritic', href='diacritic', paragraph_id=40, start=19, end=28), KiltDocAnchor(text='Å å', href='%C3%85', paragraph_id=40, start=31, end=34), KiltDocAnchor(text='Ǻ ǻ', href='%C7%BA', paragraph_id=40, start=35, end=38), KiltDocAnchor(text='Ḁ ḁ', href='Ring%20%28diacritic%29', paragraph_id=40, start=39, end=42), KiltDocAnchor(text='ẚ', href='%E1%BA%9A', paragraph_id=40, start=43, end=44), KiltDocAnchor(text='Ă ă', href='%C4%82', paragraph_id=40, start=45, end=48), KiltDocAnchor(text='Ặ ặ', href='%E1%BA%B6', paragraph_id=40, start=49, end=52), KiltDocAnchor(text='Ắ ắ', href='%E1%BA%AE', paragraph_id=40, start=53, end=56), KiltDocAnchor(text='Ằ ằ', href='%E1%BA%B0', paragraph_id=40, start=57, end=60), KiltDocAnchor(text='Ẳ ẳ', href='%E1%BA%B2', 
paragraph_id=40, start=61, end=64), KiltDocAnchor(text='Ẵ ẵ', href='%E1%BA%B4', paragraph_id=40, start=65, end=68), KiltDocAnchor(text='Ȃ ȃ', href='%C8%82', paragraph_id=40, start=69, end=72), KiltDocAnchor(text='Â â', href='%C3%82', paragraph_id=40, start=73, end=76), KiltDocAnchor(text='Ậ ậ', href='%E1%BA%AC', paragraph_id=40, start=77, end=80), KiltDocAnchor(text='Ấ ấ', href='%E1%BA%A4', paragraph_id=40, start=81, end=84), KiltDocAnchor(text='Ầ ầ', href='%E1%BA%A6', paragraph_id=40, start=85, end=88), KiltDocAnchor(text='Ẫ ẫ', href='%E1%BA%AA', paragraph_id=40, start=89, end=92), KiltDocAnchor(text='Ẩ ẩ', href='%E1%BA%A8', paragraph_id=40, start=93, end=96), KiltDocAnchor(text='Ả ả', href='%E1%BA%A2', paragraph_id=40, start=97, end=100), KiltDocAnchor(text='Ǎ ǎ', href='Caron', paragraph_id=40, start=101, end=104), KiltDocAnchor(text='Ⱥ ⱥ', href='Bar%20%28diacritic%29', paragraph_id=40, start=105, end=108), KiltDocAnchor(text='Ȧ ȧ', href='Dot%20%28diacritic%29', paragraph_id=40, start=109, end=112), KiltDocAnchor(text='Ǡ ǡ', href='%C7%A0', paragraph_id=40, start=113, end=116), KiltDocAnchor(text='Ạ ạ', href='Dot%20%28diacritic%29', paragraph_id=40, start=117, end=120), KiltDocAnchor(text='Ä ä', href='%C3%84', paragraph_id=40, start=121, end=124), KiltDocAnchor(text='Ǟ ǟ', href='%C7%9E', paragraph_id=40, start=125, end=128), KiltDocAnchor(text='À à', href='%C3%80', paragraph_id=40, start=129, end=132), KiltDocAnchor(text='Ȁ ȁ', href='%C8%80', paragraph_id=40, start=133, end=136), KiltDocAnchor(text='Á á', href='%C3%81', paragraph_id=40, start=137, end=140), KiltDocAnchor(text='Ā ā', href='%C4%80', paragraph_id=40, start=141, end=144), KiltDocAnchor(text='Ã ã', href='%C3%83', paragraph_id=40, start=151, end=154), KiltDocAnchor(text='Ą ą', href='%C4%84', paragraph_id=40, start=155, end=158), KiltDocAnchor(text='ᶏ', href='%E1%B6%8F', paragraph_id=40, start=177, end=178), KiltDocAnchor(text='Phonetic alphabet', href='Phonetic%20transcription%23Alphabetic', 
paragraph_id=41, start=12, end=29), KiltDocAnchor(text='International Phonetic Alphabet', href='International%20Phonetic%20Alphabet', paragraph_id=41, start=56, end=87), KiltDocAnchor(text='Latin letter alpha / script A', href='Latin%20alpha', paragraph_id=42, start=18, end=47), KiltDocAnchor(text='open back unrounded vowel', href='open%20back%20unrounded%20vowel', paragraph_id=42, start=69, end=94), KiltDocAnchor(text='Turned A', href='Turned%20A', paragraph_id=44, start=18, end=26), KiltDocAnchor(text='near-open central vowel', href='near-open%20central%20vowel', paragraph_id=44, start=47, end=70), KiltDocAnchor(text='Turned V', href='Turned%20V', paragraph_id=45, start=18, end=26), KiltDocAnchor(text='open-mid back unrounded vowel', href='open-mid%20back%20unrounded%20vowel', paragraph_id=45, start=89, end=118), KiltDocAnchor(text='open back rounded vowel', href='open%20back%20rounded%20vowel', paragraph_id=46, start=63, end=86), KiltDocAnchor(text='obsolete or non-standard symbol in the International Phonetic Alphabet', href='Obsolete%20and%20nonstandard%20symbols%20in%20the%20International%20Phonetic%20Alphabet', paragraph_id=48, start=36, end=106), KiltDocAnchor(text='Uralic Phonetic Alphabet', href='Uralic%20Phonetic%20Alphabet', paragraph_id=49, start=53, end=77), KiltDocAnchor(text='Indo-European studies', href='Indo-European%20studies', paragraph_id=50, start=45, end=66), KiltDocAnchor(text='Teuthonista', href='Teuthonista', paragraph_id=51, start=61, end=72), KiltDocAnchor(text='Ugaritic', href='Ugaritic', paragraph_id=52, start=60, end=68), KiltDocAnchor(text='ordinal indicator', href='ordinal%20indicator', paragraph_id=54, start=19, end=36), KiltDocAnchor(text='Ångström', href='%C3%85ngstr%C3%B6m', paragraph_id=55, start=16, end=24), KiltDocAnchor(text='predicate logic', href='predicate%20logic', paragraph_id=56, start=51, end=66), KiltDocAnchor(text='universal quantification', href='universal%20quantification', paragraph_id=56, start=78, end=102), 
KiltDocAnchor(text='At sign', href='At%20sign', paragraph_id=57, start=16, end=23), KiltDocAnchor(text='Argentine austral', href='Argentine%20austral', paragraph_id=58, start=16, end=33), KiltDocAnchor(text='Semitic', href='Phoenician%20alphabet', paragraph_id=60, start=16, end=23), KiltDocAnchor(text='Aleph', href='Aleph', paragraph_id=60, start=31, end=36), KiltDocAnchor(text='Greek', href='Greek%20alphabet', paragraph_id=61, start=18, end=23), KiltDocAnchor(text='Alpha', href='Alpha', paragraph_id=61, start=31, end=36), KiltDocAnchor(text='Cyrillic', href='Cyrillic', paragraph_id=62, start=18, end=26), KiltDocAnchor(text='A', href='A%20%28Cyrillic%29', paragraph_id=62, start=34, end=35), KiltDocAnchor(text='Coptic', href='Coptic%20alphabet', paragraph_id=63, start=14, end=20), KiltDocAnchor(text='Old Italic', href='Old%20Italic%20script', paragraph_id=64, start=16, end=26), KiltDocAnchor(text='Runic', href='Runes', paragraph_id=65, start=14, end=19), KiltDocAnchor(text='ansuz', href='Ansuz%20%28rune%29', paragraph_id=65, start=27, end=32), KiltDocAnchor(text='Gothic', href='Gothic%20alphabet', paragraph_id=66, start=14, end=20)), ('ISO basic Latin letters',), 'Q9659', '907008348', '2019-07-19T20:25:53Z', '906725792', '290', 'https://en.wikipedia.org/w/index.php?title=A&oldid=907008348'), 9: KiltDoc('25', 'Autism', re.compile('^Autism\nAutism is a developmental disorder characterized by difficulties with social interaction and .{46982} job market\\. Some studies also find decreased earning among parents who care for autistic children\\.\n$', flags=48), ('Autism\n', "Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior. Parents usually notice signs during the first three years of their child's life. 
These signs often develop gradually, though some children with autism reach their developmental milestones at a normal pace before worsening.\n", 'Autism is associated with a combination of genetic and environmental factors. Risk factors during pregnancy include certain infections, such as rubella, toxins including valproic acid, alcohol, cocaine, pesticides and air pollution, fetal growth restriction, and autoimmune diseases. Controversies surround other proposed environmental causes; for example, the vaccine hypothesis, which has been disproven. Autism affects information processing in the brain by altering connections and organization of nerve cells and their synapses. How this occurs is not well understood. In the DSM-5, autism and less severe forms of the condition, including Asperger syndrome and pervasive developmental disorder not otherwise specified (PDD-NOS), have been combined into the diagnosis of autism spectrum disorder (ASD).\n', 'Early behavioral interventions or speech therapy can help children with autism gain self-care, social, and communication skills. Although there is no known cure, there have been cases of children who recovered. Not many children with autism live independently after reaching adulthood, though some are successful. An autistic culture has developed, with some individuals seeking a cure and others believing autism should be accepted as a difference and not treated as a disorder.\n', 'Globally, autism is estimated to affect 24.8 million people . In the 2000s, the number of people affected was estimated at 1–2 per 1,000 people worldwide. In the developed countries, about 1.5% of children are diagnosed with ASD , from 0.7% in 2000 in the United States. It occurs four-to-five times more often in males than females. The number of people diagnosed has increased dramatically since the 1960s, which may be partly due to changes in diagnostic practice. 
The question of whether actual rates have increased is unresolved.\n', 'Section::::Characteristics.\n', 'Autism is a highly variable, neurodevelopmental disorder whose symptoms first appears during infancy or childhood, and generally follows a steady course without remission. People with autism may be severely impaired in some respects but normal, or even superior, in others. Overt symptoms gradually begin after the age of six months, become established by age two or three years and tend to continue through adulthood, although often in more muted form. It is distinguished not by a single symptom but by a characteristic triad of symptoms: impairments in social interaction; impairments in communication; and restricted interests and repetitive behavior. Other aspects, such as atypical eating, are also common but are not essential for diagnosis. Individual symptoms of autism occur in the general population and appear not to associate highly, without a sharp line separating pathologically severe from common traits.\n', 'Section::::Characteristics.:Social development.\n', 'Social deficits distinguish autism and the related autism spectrum disorders (ASD; see Classification) from other developmental disorders. People with autism have social impairments and often lack the intuition about others that many people take for granted. Noted autistic Temple Grandin described her inability to understand the social communication of neurotypicals, or people with normal neural development, as leaving her feeling "like an anthropologist on Mars".\n', "Unusual social development becomes apparent early in childhood. Autistic infants show less attention to social stimuli, smile and look at others less often, and respond less to their own name. Autistic toddlers differ more strikingly from social norms; for example, they have less eye contact and turn-taking, and do not have the ability to use simple movements to express themselves, such as pointing at things. 
Three- to five-year-old children with autism are less likely to exhibit social understanding, approach others spontaneously, imitate and respond to emotions, communicate nonverbally, and take turns with others. However, they do form attachments to their primary caregivers. Most children with autism display moderately less attachment security than neurotypical children, although this difference disappears in children with higher mental development or less severe ASD. Older children and adults with ASD perform worse on tests of face and emotion recognition although this may be partly due to a lower ability to define a person's own emotions.\n", 'Children with high-functioning autism suffer from more intense and frequent loneliness compared to non-autistic peers, despite the common belief that children with autism prefer to be alone. Making and maintaining friendships often proves to be difficult for those with autism. For them, the quality of friendships, not the number of friends, predicts how lonely they feel. Functional friendships, such as those resulting in invitations to parties, may affect the quality of life more deeply.\n', 'There are many anecdotal reports, but few systematic studies, of aggression and violence in individuals with ASD. The limited data suggest that, in children with intellectual disability, autism is associated with aggression, destruction of property, and meltdowns.\n', 'Section::::Characteristics.:Communication.\n', "About a third to a half of individuals with autism do not develop enough natural speech to meet their daily communication needs. Differences in communication may be present from the first year of life, and may include delayed onset of babbling, unusual gestures, diminished responsiveness, and vocal patterns that are not synchronized with the caregiver. 
In the second and third years, children with autism have less frequent and less diverse babbling, consonants, words, and word combinations; their gestures are less often integrated with words. Children with autism are less likely to make requests or share experiences, and are more likely to simply repeat others' words (echolalia) or reverse pronouns. Joint attention seems to be necessary for functional speech, and deficits in joint attention seem to distinguish infants with ASD. For example, they may look at a pointing hand instead of the pointed-at object, and they consistently fail to point at objects in order to comment on or share an experience. Children with autism may have difficulty with imaginative play and with developing symbols into language.\n", 'In a pair of studies, high-functioning children with autism aged 8–15 performed equally well as, and as adults better than, individually matched controls at basic language tasks involving vocabulary and spelling. Both autistic groups performed worse than controls at complex language tasks such as figurative language, comprehension and inference. 
As people are often sized up initially from their basic language skills, these studies suggest that people speaking to autistic individuals are more likely to overestimate what their audience comprehends.\n', 'Section::::Characteristics.:Repetitive behavior.\n', 'Autistic individuals can display many forms of repetitive or restricted behavior, which the Repetitive Behavior Scale-Revised (RBS-R) categorizes as follows.\n', 'BULLET::::- Stereotyped behaviors: Repetitive movements, such as hand flapping, head rolling, or body rocking.\n', 'BULLET::::- Compulsive behaviors: Time-consuming behaviors intended to reduce anxiety that an individual feels compelled to perform repeatedly or according to rigid rules, such as placing objects in a specific order, checking things, or hand washing.\n', 'BULLET::::- Sameness: Resistance to change; for example, insisting that the furniture not be moved or refusing to be interrupted.\n', 'BULLET::::- Ritualistic behavior: Unvarying pattern of daily activities, such as an unchanging menu or a dressing ritual. 
This is closely associated with sameness and an independent validation has suggested combining the two factors.\n', 'BULLET::::- Restricted interests: Interests or fixations that are abnormal in theme or intensity of focus, such as preoccupation with a single television program, toy, or game.\n', 'BULLET::::- Self-injury: Behaviors such as eye-poking, skin-picking, hand-biting and head-banging.\n', 'No single repetitive or self-injurious behavior seems to be specific to autism, but autism appears to have an elevated pattern of occurrence and severity of these behaviors.\n', 'Section::::Characteristics.:Other symptoms.\n', 'Autistic individuals may have symptoms that are independent of the diagnosis, but that can affect the individual or the family.\n', 'An estimated 0.5% to 10% of individuals with ASD show unusual abilities, ranging from splinter skills such as the memorization of trivia to the extraordinarily rare talents of prodigious autistic savants. Many individuals with ASD show superior skills in perception and attention, relative to the general population. Sensory abnormalities are found in over 90% of those with autism, and are considered core features by some, although there is no good evidence that sensory symptoms differentiate autism from other developmental disorders. Differences are greater for under-responsivity (for example, walking into things) than for over-responsivity (for example, distress from loud noises) or for sensation seeking (for example, rhythmic movements). An estimated 60–80% of autistic people have motor signs that include poor muscle tone, poor motor planning, and toe walking; deficits in motor coordination are pervasive across ASD and are greater in autism proper. Unusual eating behavior occurs in about three-quarters of children with ASD, to the extent that it was formerly a diagnostic indicator. 
Selectivity is the most common problem, although eating rituals and food refusal also occur.\n', 'Parents of children with ASD have higher levels of stress. Siblings of children with ASD report greater admiration of and less conflict with the affected sibling than siblings of unaffected children and were similar to siblings of children with Down syndrome in these aspects of the sibling relationship. However, they reported lower levels of closeness and intimacy than siblings of children with Down syndrome; siblings of individuals with ASD have greater risk of negative well-being and poorer sibling relationships as adults. There is tentative evidence that autism occurs more frequently in people with gender dysphoria.\n', 'Gastrointestinal problems are one of the most commonly associated medical disorders in people with autism. These are linked to greater social impairment, irritability, behavior and sleep problems, language impairments and mood changes.\n', 'Section::::Causes.\n', "It has long been presumed that there is a common cause at the genetic, cognitive, and neural levels for autism's characteristic triad of symptoms. However, there is increasing suspicion that autism is instead a complex disorder whose core aspects have distinct causes that often co-occur.\n", 'Autism has a strong genetic basis, although the genetics of autism are complex and it is unclear whether ASD is explained more by rare mutations with major effects, or by rare multigene interactions of common genetic variants. Complexity arises due to interactions among multiple genes, the environment, and epigenetic factors which do not change DNA sequencing but are heritable and influence gene expression. Many genes have been associated with autism through sequencing the genomes of affected individuals and their parents. 
Studies of twins suggest that heritability is 0.7 for autism and as high as 0.9 for ASD, and siblings of those with autism are about 25 times more likely to be autistic than the general population. However, most of the mutations that increase autism risk have not been identified. Typically, autism cannot be traced to a Mendelian (single-gene) mutation or to a single chromosome abnormality, and none of the genetic syndromes associated with ASDs have been shown to selectively cause ASD. Numerous candidate genes have been located, with only small effects attributable to any particular gene. Most loci individually explain less than 1% of cases of autism. The large number of autistic individuals with unaffected family members may result from spontaneous structural variation\xa0— such as deletions, duplications or inversions in genetic material during meiosis. Hence, a substantial fraction of autism cases may be traceable to genetic causes that are highly heritable but not inherited: that is, the mutation that causes the autism is not present in the parental genome. Autism may be underdiagnosed in women and girls due to an assumption that it is primarily a male condition.\n', 'Maternal nutrition and inflammation during preconception and pregnancy influences fetal neurodevelopment. Intrauterine growth restriction is associated with ASD, in both term and preterm infants. Maternal inflammatory and autoimmune diseases may damage fetal tissues, aggravating a genetic problem or damaging the nervous system.\n', 'Exposure to air pollution during pregnancy, especially heavy metals and particulates, may increase the risk of autism. Environmental factors that have been claimed without evidence to contribute to or exacerbate autism include certain foods, infectious diseases, solvents, PCBs, phthalates and phenols used in plastic products, pesticides, brominated flame retardants, alcohol, smoking, illicit drugs, vaccines, and prenatal stress. 
Some, such as the MMR vaccine, have been completely disproven.\n', 'Parents may first become aware of autistic symptoms in their child around the time of a routine vaccination. This has led to unsupported theories blaming vaccine "overload", a vaccine preservative, or the MMR vaccine for causing autism. The latter theory was supported by a litigation-funded study that has since been shown to have been "an elaborate fraud". Although these theories lack convincing scientific evidence and are biologically implausible, parental concern about a potential vaccine link with autism has led to lower rates of childhood immunizations, outbreaks of previously controlled childhood diseases in some countries, and the preventable deaths of several children.\n', 'Section::::Mechanism.\n', "Autism's symptoms result from maturation-related changes in various systems of the brain. How autism occurs is not well understood. Its mechanism can be divided into two areas: the pathophysiology of brain structures and processes associated with autism, and the neuropsychological linkages between brain structures and behaviors. The behaviors appear to have multiple pathophysiologies.\n", 'There is evidence that gut–brain axis abnormalities may be involved. A 2015 review proposed that immune dysregulation, gastrointestinal inflammation, malfunction of the autonomic nervous system, gut flora alterations, and food metabolites may cause brain neuroinflammation and dysfunction. A 2016 review concludes that enteric nervous system abnormalities might play a role in neurological disorders such as autism. Neural connections and the immune system are a pathway that may allow diseases originated in the intestine to spread to the brain.\n', 'Several lines of evidence point to synaptic dysfunction as a cause of autism. Some rare mutations may lead to autism by disrupting some synaptic pathways, such as those involved with cell adhesion. 
Gene replacement studies in mice suggest that autistic symptoms are closely related to later developmental steps that depend on activity in synapses and on activity-dependent changes. All known teratogens (agents that cause birth defects) related to the risk of autism appear to act during the first eight weeks from conception, and though this does not exclude the possibility that autism can be initiated or affected later, there is strong evidence that autism arises very early in development.\n', 'Section::::Diagnosis.\n', 'Diagnosis is based on behavior, not cause or mechanism. Under the DSM-5, autism is characterized by persistent deficits in social communication and interaction across multiple contexts, as well as restricted, repetitive patterns of behavior, interests, or activities. These deficits are present in early childhood, typically before age three, and lead to clinically significant functional impairment. Sample symptoms include lack of social or emotional reciprocity, stereotyped and repetitive use of language or idiosyncratic language, and persistent preoccupation with unusual objects. The disturbance must not be better accounted for by Rett syndrome, intellectual disability or global developmental delay. ICD-10 uses essentially the same definition.\n', 'Several diagnostic instruments are available. Two are commonly used in autism research: the Autism Diagnostic Interview-Revised (ADI-R) is a semistructured parent interview, and the Autism Diagnostic Observation Schedule (ADOS) uses observation and interaction with the child. The Childhood Autism Rating Scale (CARS) is used widely in clinical environments to assess severity of autism based on observation of children. The Diagnostic interview for social and communication disorders (DISCO) may also be used.\n', 'A pediatrician commonly performs a preliminary investigation by taking developmental history and physically examining the child. 
If warranted, diagnosis and evaluations are conducted with help from ASD specialists, observing and assessing cognitive, communication, family, and other factors using standardized tools, and taking into account any associated medical conditions. A pediatric neuropsychologist is often asked to assess behavior and cognitive skills, both to aid diagnosis and to help recommend educational interventions. A differential diagnosis for ASD at this stage might also consider intellectual disability, hearing impairment, and a specific language impairment such as Landau–Kleffner syndrome. The presence of autism can make it harder to diagnose coexisting psychiatric disorders such as depression.\n', "Clinical genetics evaluations are often done once ASD is diagnosed, particularly when other symptoms already suggest a genetic cause. Although genetic technology allows clinical geneticists to link an estimated 40% of cases to genetic causes, consensus guidelines in the US and UK are limited to high-resolution chromosome and fragile X testing. A genotype-first model of diagnosis has been proposed, which would routinely assess the genome's copy number variations. As new genetic tests are developed several ethical, legal, and social issues will emerge. Commercial availability of tests may precede adequate understanding of how to use test results, given the complexity of autism's genetics. Metabolic and neuroimaging tests are sometimes helpful, but are not routine.\n", 'ASD can sometimes be diagnosed by age 14 months, although diagnosis becomes increasingly stable over the first three years of life: for example, a one-year-old who meets diagnostic criteria for ASD is less likely than a three-year-old to continue to do so a few years later. In the UK the National Autism Plan for Children recommends at most 30 weeks from first concern to completed diagnosis and assessment, though few cases are handled that quickly in practice. 
Although the symptoms of autism and ASD begin early in childhood, they are sometimes missed; years later, adults may seek diagnoses to help them or their friends and family understand themselves, to help their employers make adjustments, or in some locations to claim disability living allowances or other benefits. Girls are often diagnosed later than boys.\n', 'Underdiagnosis and overdiagnosis are problems in marginal cases, and much of the recent increase in the number of reported ASD cases is likely due to changes in diagnostic practices. The increasing popularity of drug treatment options and the expansion of benefits has given providers incentives to diagnose ASD, resulting in some overdiagnosis of children with uncertain symptoms. Conversely, the cost of screening and diagnosis and the challenge of obtaining payment can inhibit or delay diagnosis. It is particularly hard to diagnose autism among the visually impaired, partly because some of its diagnostic criteria depend on vision, and partly because autistic symptoms overlap with those of common blindness syndromes or blindisms.\n', 'Section::::Diagnosis.:Classification.\n', 'Autism is one of the five pervasive developmental disorders (PDD), which are characterized by widespread abnormalities of social interactions and communication, and severely restricted interests and highly repetitive behavior. These symptoms do not imply sickness, fragility, or emotional disturbance.\n', 'Of the five PDD forms, Asperger syndrome is closest to autism in signs and likely causes; Rett syndrome and childhood disintegrative disorder share several signs with autism, but may have unrelated causes; PDD not otherwise specified (PDD-NOS; also called "atypical autism") is diagnosed when the criteria are not met for a more specific disorder. Unlike with autism, people with Asperger syndrome have no substantial delay in language development. 
The terminology of autism can be bewildering, with autism, Asperger syndrome and PDD-NOS often called the "autism spectrum disorders" (ASD) or sometimes the "autistic disorders", whereas autism itself is often called "autistic disorder", "childhood autism", or "infantile autism". In this article, "autism" refers to the classic autistic disorder; in clinical practice, though, "autism", "ASD", and "PDD" are often used interchangeably. ASD, in turn, is a subset of the broader autism phenotype, which describes individuals who may not have ASD but do have autistic-like traits, such as avoiding eye contact.\n', 'The manifestations of autism cover a wide spectrum, ranging from individuals with severe impairments—who may be silent, developmentally disabled, and locked into hand flapping and rocking—to high functioning individuals who may have active but distinctly odd social approaches, narrowly focused interests, and verbose, pedantic communication. Because the behavior spectrum is continuous, boundaries between diagnostic categories are necessarily somewhat arbitrary. Sometimes the syndrome is divided into low-, medium- or high-functioning autism (LFA, MFA, and HFA), based on IQ thresholds, or on how much support the individual requires in daily life; these subdivisions are not standardized and are controversial. Autism can also be divided into syndromal and non-syndromal autism; the syndromal autism is associated with severe or profound intellectual disability or a congenital syndrome with physical symptoms, such as tuberous sclerosis. Although individuals with Asperger syndrome tend to perform better cognitively than those with autism, the extent of the overlap between Asperger syndrome, HFA, and non-syndromal autism is unclear.\n', 'Some studies have reported diagnoses of autism in children due to a loss of language or social skills, as opposed to a failure to make progress, typically from 15 to 30 months of age. 
The validity of this distinction remains controversial; it is possible that regressive autism is a specific subtype, or that there is a continuum of behaviors between autism with and without regression.\n', 'Research into causes has been hampered by the inability to identify biologically meaningful subgroups within the autistic population and by the traditional boundaries between the disciplines of psychiatry, psychology, neurology and pediatrics. Newer technologies such as fMRI and diffusion tensor imaging can help identify biologically relevant phenotypes (observable traits) that can be viewed on brain scans, to help further neurogenetic studies of autism; one example is lowered activity in the fusiform face area of the brain, which is associated with impaired perception of people versus objects. It has been proposed to classify autism using genetics as well as behavior.\n', 'Section::::Screening.\n', 'About half of parents of children with ASD notice their child\'s unusual behaviors by age 18 months, and about four-fifths notice by age 24 months. According to an article, failure to meet any of the following milestones "is an absolute indication to proceed with further evaluations. Delay in referral for such testing may delay early diagnosis and treatment and affect the long-term outcome".\n', 'BULLET::::- No babbling by 12 months.\n', 'BULLET::::- No gesturing (pointing, waving, etc.) by 12 months.\n', 'BULLET::::- No single words by 16 months.\n', 'BULLET::::- No two-word (spontaneous, not just echolalic) phrases by 24 months.\n', 'BULLET::::- Loss of any language or social skills, at any age.\n', "The United States Preventive Services Task Force in 2016 found it was unclear if screening was beneficial or harmful among children in whom there is no concerns. The Japanese practice is to screen all children for ASD at 18 and 24 months, using autism-specific formal screening tests. 
In contrast, in the UK, children whose families or doctors recognize possible signs of autism are screened. It is not known which approach is more effective. Screening tools include the Modified Checklist for Autism in Toddlers (M-CHAT), the Early Screening of Autistic Traits Questionnaire, and the First Year Inventory; initial data on M-CHAT and its predecessor, the Checklist for Autism in Toddlers (CHAT), on children aged 18–30 months suggests that it is best used in a clinical setting and that it has low sensitivity (many false-negatives) but good specificity (few false-positives). It may be more accurate to precede these tests with a broadband screener that does not distinguish ASD from other developmental disorders. Screening tools designed for one culture's norms for behaviors like eye contact may be inappropriate for a different culture. Although genetic screening for autism is generally still impractical, it can be considered in some cases, such as children with neurological symptoms and dysmorphic features.\n", 'Section::::Prevention.\n', 'While infection with rubella during pregnancy causes fewer than 1% of cases of autism, vaccination against rubella can prevent many of those cases.\n', 'Section::::Management.\n', "The main goals when treating children with autism are to lessen associated deficits and family distress, and to increase quality of life and functional independence. In general, higher IQs are correlated with greater responsiveness to treatment and improved treatment outcomes. No single treatment is best and treatment is typically tailored to the child's needs. Families and the educational system are the main resources for treatment. Services should be carried out by behavior analysts, special education teachers, speech pathologists, and licensed psychologists. Studies of interventions have methodological problems that prevent definitive conclusions about efficacy. 
However, the development of evidence-based interventions has advanced in recent years. Although many psychosocial interventions have some positive evidence, suggesting that some form of treatment is preferable to no treatment, the methodological quality of systematic reviews of these studies has generally been poor, their clinical results are mostly tentative, and there is little evidence for the relative effectiveness of treatment options. Intensive, sustained special education programs and behavior therapy early in life can help children acquire self-care, communication, and job skills, and often improve functioning and decrease symptom severity and maladaptive behaviors; claims that intervention by around age three years is crucial are not substantiated. While medications have not been found to help with core symptoms, they may be used for associated symptoms, such as irritability, inattention, or repetitive behavior patterns.\n", 'Section::::Management.:Education.\n', 'Educational interventions often used include applied behavior analysis (ABA), developmental models, structured teaching, speech and language therapy, social skills therapy, and occupational therapy. Among these approaches, interventions either treat autistic features comprehensively, or focalize treatment on a specific area of deficit. The quality of research for early intensive behavioral intervention (EIBI)—a treatment procedure carried out with very young children that incorporates over thirty hours per week of the structured type of ABA—is currently low, and more vigorous research designs with larger sample sizes are needed. Two theoretical frameworks outlined for early childhood intervention include structured and naturalistic ABA interventions, and developmental social pragmatic models (DSP). One interventional strategy utilizes a parent training model, which teaches parents how to implement various ABA and DSP techniques, allowing for parents to disseminate interventions themselves. 
Various DSP programs have been developed to explicitly deliver intervention systems through at-home parent implementation. Despite the recent development of parent training models, these interventions have demonstrated effectiveness in numerous studies, being evaluated as a probable efficacious mode of treatment.\n', 'Early, intensive ABA therapy has demonstrated effectiveness in enhancing communication, as well as adaptive and global functioning in preschool children; it is well-established for improving the intellectual performance of that age group. Similarly, a teacher-implemented intervention that utilizes a more naturalistic form of ABA combined with a developmental social pragmatic approach has been found to be beneficial in improving social-communication skills in young children, although there is less evidence in its treatment of global symptoms. Neuropsychological reports are often poorly communicated to educators, resulting in a gap between what a report recommends and what education is provided. It is not known whether treatment programs for children lead to significant improvements after the children grow up, and the limited research on the effectiveness of adult residential programs shows mixed results. The appropriateness of including children with varying severity of autism spectrum disorders in the general education population is a subject of current debate among educators and researchers.\n', 'Section::::Management.:Medication.\n', "Medications may be used to treat ASD symptoms that interfere with integrating a child into home or school when behavioral treatment fails. They may also be used for associated health problems, such as ADHD or anxiety. More than half of US children diagnosed with ASD are prescribed psychoactive drugs or anticonvulsants, with the most common drug classes being antidepressants, stimulants, and antipsychotics. 
The atypical antipsychotic drugs risperidone and aripiprazole are FDA-approved for treating associated aggressive and self-injurious behaviors. However, their side effects must be weighed against their potential benefits, and people with autism may respond atypically. Side effects, for example, may include weight gain, tiredness, drooling, and aggression. SSRI antidepressants, such as fluoxetine and fluvoxamine, have been shown to be effective in reducing repetitive and ritualistic behaviors, while the stimulant medication methylphenidate is beneficial for some children with co-morbid inattentiveness or hyperactivity. There is scant reliable research about the effectiveness or safety of drug treatments for adolescents and adults with ASD. No known medication relieves autism's core symptoms of social and communication impairments. Experiments in mice have reversed or reduced some symptoms related to autism by replacing or modulating gene function, suggesting the possibility of targeting therapies to specific rare mutations known to cause autism.\n", 'Section::::Management.:Alternative medicine.\n', "Although many alternative therapies and interventions are available, few are supported by scientific studies. Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance. Some alternative treatments may place the child at risk. The preference that children with autism have for unconventional foods can lead to reduction in bone cortical thickness with this being greater in those on casein-free diets, as a consequence of the low intake of calcium and vitamin D; however, suboptimal bone development in ASD has also been associated with lack of exercise and gastrointestinal disorders. In 2005, botched chelation therapy killed a five-year-old child with autism. 
Chelation is not recommended for people with ASD since the associated risks outweigh any potential benefits. Another alternative medicine practice with no evidence is CEASE therapy, a mixture of homeopathy, supplements, and 'vaccine detoxing'.\n", 'Although popularly used as an alternative treatment for people with autism, as of 2018 there is no good evidence to recommend a gluten- and casein-free diet as a standard treatment. A 2018 review concluded that it may be a therapeutic option for specific groups of children with autism, such as those with known food intolerances or allergies, or with food intolerance markers. The authors analyzed the prospective trials conducted to date that studied the efficacy of the gluten- and casein-free diet in children with ASD (4 in total). All of them compared gluten- and casein-free diet versus normal diet with a control group (2 double blind randomized controlled trials, 1 double blind crossover trial, 1 single blind trial). In two of the studies, whose duration was 12 and 24 months, a significant improvement in ASD symptoms (efficacy rate 50%) was identified. In the other two studies, whose duration was 3 months, no significant effect was observed. The authors concluded that a longer duration of the diet may be necessary to achieve the improvement of the ASD symptoms. Other problems documented in the trials carried out include transgressions of the diet, small sample size, the heterogeneity of the participants and the possibility of a placebo effect.\n', 'In the subset of people who have gluten sensitivity there is limited evidence that suggests that a gluten-free diet may improve some autistic behaviors.\n', 'There is tentative evidence that music therapy may improve social interactions, verbal communication, and non-verbal communication skills. There has been early research looking at hyperbaric treatments in children with autism.\n', 'Section::::Prognosis.\n', 'There is no known cure. 
Children recover occasionally, so that they lose their diagnosis of ASD; this occurs sometimes after intensive treatment and sometimes not. It is not known how often recovery happens; reported rates in unselected samples have ranged from 3% to 25%. Most children with autism acquire language by age five or younger, though a few have developed communication skills in later years. Most children with autism lack social support, meaningful relationships, future employment opportunities or self-determination. Although core difficulties tend to persist, symptoms often become less severe with age.\n', 'Few high-quality studies address long-term prognosis. Some adults show modest improvement in communication skills, but a few decline; no study has focused on autism after midlife. Acquiring language before age six, having an IQ above 50, and having a marketable skill all predict better outcomes; independent living is unlikely with severe autism.\n', 'Many individuals with autism face significant obstacles in transitioning to adulthood. Compared to the general population individuals with autism are more likely to be unemployed and to have never had a job. People in their 20s with autism have an employment rate of 58%.\n', 'Section::::Epidemiology.\n', "Most recent reviews tend to estimate a prevalence of 1–2 per 1,000 for autism and close to 6 per 1,000 for ASD, and 11 per 1,000 children in the United States for ASD as of 2008; because of inadequate data, these numbers may underestimate ASD's true rate. Globally, autism affects an estimated 24.8 million people , while Asperger syndrome affects a further 37.2 million. In 2012, the NHS estimated that the overall prevalence of autism among adults aged 18 years and over in the UK was 1.1%. Rates of PDD-NOS's has been estimated at 3.7 per 1,000, Asperger syndrome at roughly 0.6 per 1,000, and childhood disintegrative disorder at 0.02 per 1,000. 
CDC estimates about 1 out of 59 (1.7%) for 2014, an increase from 1 out of every 68 children (1.5%) for 2010.\n", "The number of reported cases of autism increased dramatically in the 1990s and early 2000s. This increase is largely attributable to changes in diagnostic practices, referral patterns, availability of services, age at diagnosis, and public awareness, though unidentified environmental risk factors cannot be ruled out. The available evidence does not rule out the possibility that autism's true prevalence has increased; a real increase would suggest directing more attention and funding toward changing environmental factors instead of continuing to focus on genetics.\n", 'Boys are at higher risk for ASD than girls. The sex ratio averages 4.3:1 and is greatly modified by cognitive impairment: it may be close to 2:1 with intellectual disability and more than 5.5:1 without. Several theories about the higher prevalence in males have been investigated, but the cause of the difference is unconfirmed; one theory is that females are underdiagnosed.\n', 'Although the evidence does not implicate any single pregnancy-related risk factor as a cause of autism, the risk of autism is associated with advanced age in either parent, and with diabetes, bleeding, and use of psychiatric drugs in the mother during pregnancy. The risk is greater with older fathers than with older mothers; two potential explanations are the known increase in mutation burden in older sperm, and the hypothesis that men marry later if they carry genetic liability and show some signs of autism. Most professionals believe that race, ethnicity, and socioeconomic background do not affect the occurrence of autism.\n', 'Several other conditions are common in children with autism. They include:\n', 'BULLET::::- Genetic disorders. 
About 10–15% of autism cases have an identifiable Mendelian (single-gene) condition, chromosome abnormality, or other genetic syndrome, and ASD is associated with several genetic disorders.\n', "BULLET::::- Intellectual disability. The percentage of autistic individuals who also meet criteria for intellectual disability has been reported as anywhere from 25% to 70%, a wide variation illustrating the difficulty of assessing intelligence of individuals on the autism spectrum. In comparison, for PDD-NOS the association with intellectual disability is much weaker, and by definition, the diagnosis of Asperger's excludes intellectual disability.\n", "BULLET::::- Anxiety disorders are common among children with ASD; there are no firm data, but studies have reported prevalences ranging from 11% to 84%. Many anxiety disorders have symptoms that are better explained by ASD itself, or are hard to distinguish from ASD's symptoms.\n", 'BULLET::::- Epilepsy, with variations in risk of epilepsy due to age, cognitive level, and type of language disorder.\n', 'BULLET::::- Several metabolic defects, such as phenylketonuria, are associated with autistic symptoms.\n', 'BULLET::::- Minor physical anomalies are significantly increased in the autistic population.\n', 'BULLET::::- Preempted diagnoses. Although the DSM-IV rules out concurrent diagnosis of many other conditions along with autism, the full criteria for Attention deficit hyperactivity disorder (ADHD), Tourette syndrome, and other of these conditions are often present and these comorbid diagnoses are increasingly accepted.\n', 'BULLET::::- Sleep problems affect about two-thirds of individuals with ASD at some point in childhood. These most commonly include symptoms of insomnia such as difficulty in falling asleep, frequent nocturnal awakenings, and early morning awakenings. 
Sleep problems are associated with difficult behaviors and family stress, and are often a focus of clinical attention over and above the primary ASD diagnosis.\n', 'Section::::History.\n', 'A few examples of autistic symptoms and treatments were described long before autism was named. The "Table Talk" of Martin Luther, compiled by his notetaker, Mathesius, contains the story of a 12-year-old boy who may have been severely autistic. Luther reportedly thought the boy was a soulless mass of flesh possessed by the devil, and suggested that he be suffocated, although a later critic has cast doubt on the veracity of this report. The earliest well-documented case of autism is that of Hugh Blair of Borgue, as detailed in a 1747 court case in which his brother successfully petitioned to annul Blair\'s marriage to gain Blair\'s inheritance. The Wild Boy of Aveyron, a feral child caught in 1798, showed several signs of autism; the medical student Jean Itard treated him with a behavioral program designed to help him form social attachments and to induce speech via imitation.\n', 'The New Latin word "autismus" (English translation "autism") was coined by the Swiss psychiatrist Eugen Bleuler in 1910 as he was defining symptoms of schizophrenia. He derived it from the Greek word "autós" (αὐτός, meaning "self"), and used it to mean morbid self-admiration, referring to "autistic withdrawal of the patient to his fantasies, against which any influence from outside becomes an intolerable disturbance". A Soviet child psychiatrist, Grunya Sukhareva, described a similar syndrome that was published in Russian in 1925, and in German in 1926.\n', 'Section::::History.:Clinical development and diagnoses.\n', 'The word "autism" first took its modern sense in 1938 when Hans Asperger of the Vienna University Hospital adopted Bleuler\'s terminology "autistic psychopaths" in a lecture in German about child psychology. 
Asperger was investigating an ASD now known as Asperger syndrome, though for various reasons it was not widely recognized as a separate diagnosis until 1981. Leo Kanner of the Johns Hopkins Hospital first used "autism" in its modern sense in English when he introduced the label "early infantile autism" in a 1943 report of 11 children with striking behavioral similarities. Almost all the characteristics described in Kanner\'s first paper on the subject, notably "autistic aloneness" and "insistence on sameness", are still regarded as typical of the autistic spectrum of disorders. It is not known whether Kanner derived the term independently of Asperger.\n', 'Donald Triplett was the first person diagnosed with autism. He was diagnosed by Kanner after being first examined in 1938, and was labeled as "case 1". Triplett was noted for his savant abilities, particularly being able to name musical notes played on a piano and to mentally multiply numbers. His father, Oliver, described him as socially withdrawn but interested in number patterns, music notes, letters of the alphabet, and U.S. president pictures. By the age of 2, he had the ability to recite the 23rd Psalm and memorized 25 questions and answers from the Presbyterian catechism. He was also interested in creating musical chords.\n', 'Kanner\'s reuse of "autism" led to decades of confused terminology like "infantile schizophrenia", and child psychiatry\'s focus on maternal deprivation led to misconceptions of autism as an infant\'s response to "refrigerator mothers". Starting in the late 1960s autism was established as a separate syndrome.\n', 'Section::::History.:Terminology and distinction from schizophrenia.\n', "As late as the mid-1970s there was little evidence of a genetic role in autism; while in 2007 it was believed to be one of the most heritable psychiatric conditions. 
Although the rise of parent organizations and the destigmatization of childhood ASD have affected how ASD is viewed, parents continue to feel social stigma in situations where their child's autistic behavior is perceived negatively, and many primary care physicians and medical specialists express some beliefs consistent with outdated autism research.\n", 'It took until 1980 for the DSM-III to differentiate autism from childhood schizophrenia. In 1987, the DSM-III-R provided a checklist for diagnosing autism. In May 2013, the DSM-5 was released, updating the classification for pervasive developmental disorders. The grouping of disorders, including PDD-NOS, autism, Asperger syndrome, Rett syndrome, and CDD, has been removed and replaced with the general term of Autism Spectrum Disorders. The two categories that exist are impaired social communication and/or interaction, and restricted and/or repetitive behaviors.\n', 'The Internet has helped autistic individuals bypass nonverbal cues and emotional sharing that they find difficult to deal with, and has given them a way to form online communities and work remotely. Societal and cultural aspects of autism have developed: some in the community seek a cure, while others believe that autism is simply another way of being.\n', 'Section::::Society and culture.\n', 'An autistic culture has emerged, accompanied by the autistic rights and neurodiversity movements. Events include World Autism Awareness Day, Autism Sunday, Autistic Pride Day, Autreat, and others. Organizations dedicated to promoting awareness of autism include Autism Speaks, Autism National Committee, and Autism Society of America. Social-science scholars study those with autism in hopes to learn more about "autism as a culture, transcultural comparisons... and research on social movements." 
While most autistic individuals do not have savant skills, many have been successful in their fields.\n', 'Section::::Society and culture.:Autism rights movement.\n', 'The autism rights movement is a social movement within the context of disability rights that emphasizes the concept of neurodiversity, viewing the autism spectrum as a result of natural variations in the human brain rather than a disorder to be cured. The autism rights movement advocates for including greater acceptance of autistic behaviors; therapies that focus on coping skills rather than imitating the behaviors those without autism; and the recognition of the autistic community as a minority group. Autism rights or neurodiversity advocates believe that the autism spectrum is genetic and should be accepted as a natural expression of the human genome. This perspective is distinct from two other likewise distinct views: the medical perspective, that autism is caused by a genetic defect and should be addressed by targeting the autism gene(s), and fringe theories that autism is caused by environmental factors such as vaccines. A common criticism against autistic activists is that the majority of them are "high-functioning" or have Asperger syndrome and do not represent the views of "low-functioning" autistic people.\n', 'Section::::Society and culture.:Employment.\n', 'About half of autistics are unemployed, and one third of those with graduate degrees may be unemployed. Among autistics who find work, most are employed in sheltered settings working for wages below the national minimum. While employers state hiring concerns about productivity and supervision, experienced employers of autistics give positive reports of above average memory and detail orientation as well as a high regard for rules and procedure in autistic employees. A majority of the economic burden of autism if caused by decreased earnings in the job market. 
Some studies also find decreased earning among parents who care for autistic children.\n'), (KiltDocAnchor(text='developmental disorder', href='developmental%20disorder', paragraph_id=1, start=12, end=34), KiltDocAnchor(text='developmental milestones', href='developmental%20milestones', paragraph_id=1, start=314, end=338), KiltDocAnchor(text='worsening', href='Regressive%20autism', paragraph_id=1, start=363, end=372), KiltDocAnchor(text='genetic', href='Heritability%20of%20autism', paragraph_id=2, start=43, end=50), KiltDocAnchor(text='environmental factors', href='environmental%20factors', paragraph_id=2, start=55, end=76), KiltDocAnchor(text='rubella', href='rubella', paragraph_id=2, start=144, end=151), KiltDocAnchor(text='valproic acid', href='valproic%20acid', paragraph_id=2, start=170, end=183), KiltDocAnchor(text='cocaine', href='cocaine', paragraph_id=2, start=194, end=201), KiltDocAnchor(text='pesticides', href='pesticides', paragraph_id=2, start=203, end=213), KiltDocAnchor(text='air pollution', href='air%20pollution', paragraph_id=2, start=218, end=231), KiltDocAnchor(text='fetal growth restriction', href='fetal%20growth%20restriction', paragraph_id=2, start=233, end=257), KiltDocAnchor(text='autoimmune disease', href='autoimmune%20disease', paragraph_id=2, start=263, end=281), KiltDocAnchor(text='Controversies', href='Controversies%20in%20autism', paragraph_id=2, start=284, end=297), KiltDocAnchor(text='causes', href='Causes%20of%20autism', paragraph_id=2, start=336, end=342), KiltDocAnchor(text='vaccine hypothesis', href='MMR%20vaccine%20controversy', paragraph_id=2, start=361, end=379), KiltDocAnchor(text='nerve cell', href='nerve%20cell', paragraph_id=2, start=502, end=512), KiltDocAnchor(text='synapse', href='synapse', paragraph_id=2, start=524, end=531), KiltDocAnchor(text='DSM-5', href='DSM-5', paragraph_id=2, start=581, end=586), KiltDocAnchor(text='Asperger syndrome', href='Asperger%20syndrome', paragraph_id=2, start=645, end=662), 
KiltDocAnchor(text='pervasive developmental disorder not otherwise specified', href='pervasive%20developmental%20disorder%20not%20otherwise%20specified', paragraph_id=2, start=667, end=723), KiltDocAnchor(text='autism spectrum disorder', href='autism%20spectrum%20disorder', paragraph_id=2, start=776, end=800), KiltDocAnchor(text='behavioral interventions', href='Applied%20behavior%20analysis', paragraph_id=3, start=6, end=30), KiltDocAnchor(text='speech therapy', href='speech%20therapy', paragraph_id=3, start=34, end=48), KiltDocAnchor(text='self-care', href='self-care', paragraph_id=3, start=84, end=93), KiltDocAnchor(text='autistic culture', href='Societal%20and%20cultural%20aspects%20of%20autism', paragraph_id=3, start=317, end=333), KiltDocAnchor(text='accepted as a difference and not treated as a disorder', href='Autism%20rights%20movement', paragraph_id=3, start=424, end=478), KiltDocAnchor(text='neurodevelopmental disorder', href='neurodevelopmental%20disorder', paragraph_id=6, start=29, end=56), KiltDocAnchor(text='remission', href='Remission%20%28medicine%29', paragraph_id=6, start=161, end=170), KiltDocAnchor(text='autism spectrum disorder', href='autism%20spectrum%20disorder', paragraph_id=8, start=51, end=75), KiltDocAnchor(text='Temple Grandin', href='Temple%20Grandin', paragraph_id=8, start=274, end=288), KiltDocAnchor(text='social communication', href='social%20communication', paragraph_id=8, start=331, end=351), KiltDocAnchor(text='neurotypical', href='neurotypical', paragraph_id=8, start=355, end=367), KiltDocAnchor(text='neural development', href='neural%20development', paragraph_id=8, start=392, end=410), KiltDocAnchor(text='toddler', href='toddler', paragraph_id=9, start=202, end=209), KiltDocAnchor(text='social norms', href='social%20norms', paragraph_id=9, start=239, end=251), KiltDocAnchor(text='eye contact', href='eye%20contact', paragraph_id=9, start=281, end=292), KiltDocAnchor(text='turn-taking', href='turn-taking', paragraph_id=9, 
start=297, end=308), KiltDocAnchor(text='nonverbal', href='nonverbal%20autism', paragraph_id=9, start=583, end=592), KiltDocAnchor(text='attachments', href='Attachment%20%28psychology%29', paragraph_id=9, start=646, end=657), KiltDocAnchor(text='attachment security', href='Attachment%20in%20children%23Secure%20attachment', paragraph_id=9, start=737, end=756), KiltDocAnchor(text='perform worse on tests of face and emotion recognition', href='Face%20perception%23Face%20perception%20in%20individuals%20with%20autism', paragraph_id=9, start=919, end=973), KiltDocAnchor(text='babbling', href='babbling', paragraph_id=13, start=235, end=243), KiltDocAnchor(text='echolalia', href='echolalia', paragraph_id=13, start=676, end=685), KiltDocAnchor(text='reverse pronouns', href='Pronoun%20reversal', paragraph_id=13, start=690, end=706), KiltDocAnchor(text='Joint attention', href='Joint%20attention', paragraph_id=13, start=708, end=723), KiltDocAnchor(text='Stereotyped behaviors', href='Stereotypy', paragraph_id=17, start=12, end=33), KiltDocAnchor(text='Compulsive behavior', href='Compulsive%20behavior', paragraph_id=18, start=12, end=31), KiltDocAnchor(text='Ritualistic behavior', href='Ritual%23Psychology', paragraph_id=20, start=12, end=32), KiltDocAnchor(text='Self-injury', href='Self-injury', paragraph_id=22, start=12, end=23), KiltDocAnchor(text='skin-picking', href='Dermatillomania', paragraph_id=22, start=55, end=67), KiltDocAnchor(text='splinter skill', href='splinter%20skill', paragraph_id=26, start=86, end=100), KiltDocAnchor(text='autistic savants', href='Savant%20syndrome', paragraph_id=26, start=187, end=203), KiltDocAnchor(text='Sensory', href='Sensory%20system', paragraph_id=26, start=317, end=324), KiltDocAnchor(text='poor muscle tone', href='Hypotonia', paragraph_id=26, start=818, end=834), KiltDocAnchor(text='poor motor planning', href='Apraxia', paragraph_id=26, start=836, end=855), KiltDocAnchor(text='toe walking', href='toe%20walking', paragraph_id=26, 
start=861, end=872), KiltDocAnchor(text='stress', href='stress%20%28psychological%29', paragraph_id=27, start=51, end=57), KiltDocAnchor(text='Down syndrome', href='Down%20syndrome', paragraph_id=27, start=245, end=258), KiltDocAnchor(text='Down syndrome', href='Down%20syndrome', paragraph_id=27, start=398, end=411), KiltDocAnchor(text='gender dysphoria', href='gender%20dysphoria', paragraph_id=27, start=609, end=625), KiltDocAnchor(text='Gastrointestinal problems', href='Gastrointestinal%20diseases', paragraph_id=28, start=0, end=25), KiltDocAnchor(text='associated medical disorders', href='comorbidity', paragraph_id=28, start=55, end=83), KiltDocAnchor(text='genetics of autism', href='Heritability%20of%20autism', paragraph_id=31, start=48, end=66), KiltDocAnchor(text='mutation', href='mutation', paragraph_id=31, start=135, end=143), KiltDocAnchor(text='epigenetic', href='epigenetic', paragraph_id=31, start=308, end=318), KiltDocAnchor(text='DNA', href='DNA', paragraph_id=31, start=347, end=350), KiltDocAnchor(text='gene expression', href='gene%20expression', paragraph_id=31, start=394, end=409), KiltDocAnchor(text='heritability', href='heritability', paragraph_id=31, start=559, end=571), KiltDocAnchor(text='Mendelian', href='Mendelian', paragraph_id=31, start=850, end=859), KiltDocAnchor(text='chromosome abnormality', href='chromosome%20abnormality', paragraph_id=31, start=898, end=920), KiltDocAnchor(text='structural variation', href='structural%20variation', paragraph_id=31, start=1288, end=1308), KiltDocAnchor(text='deletions', href='Deletion%20%28genetics%29', paragraph_id=31, start=1319, end=1328), KiltDocAnchor(text='duplications', href='Gene%20duplication', paragraph_id=31, start=1330, end=1342), KiltDocAnchor(text='inversions', href='Chromosomal%20inversion', paragraph_id=31, start=1346, end=1356), KiltDocAnchor(text='meiosis', href='meiosis', paragraph_id=31, start=1384, end=1391), KiltDocAnchor(text='Intrauterine growth restriction', 
href='Intrauterine%20growth%20restriction', paragraph_id=32, start=106, end=137), KiltDocAnchor(text='autoimmune disease', href='autoimmune%20disease', paragraph_id=32, start=222, end=240), KiltDocAnchor(text='air pollution', href='air%20pollution', paragraph_id=33, start=12, end=25), KiltDocAnchor(text='heavy metals', href='heavy%20metal%20%28chemistry%29', paragraph_id=33, start=55, end=67), KiltDocAnchor(text='Environmental factor', href='Environmental%20factor', paragraph_id=33, start=119, end=139), KiltDocAnchor(text='infectious disease', href='infectious%20disease', paragraph_id=33, start=242, end=260), KiltDocAnchor(text='solvent', href='solvent', paragraph_id=33, start=263, end=270), KiltDocAnchor(text='PCBs', href='Polychlorinated%20biphenyl', paragraph_id=33, start=273, end=277), KiltDocAnchor(text='phthalates', href='phthalates', paragraph_id=33, start=279, end=289), KiltDocAnchor(text='phenols', href='phenols', paragraph_id=33, start=294, end=301), KiltDocAnchor(text='pesticide', href='pesticide', paragraph_id=33, start=328, end=337), KiltDocAnchor(text='brominated flame retardant', href='brominated%20flame%20retardant', paragraph_id=33, start=340, end=366), KiltDocAnchor(text='alcohol', href='Ethanol', paragraph_id=33, start=369, end=376), KiltDocAnchor(text='illicit drug', href='illicit%20drug', paragraph_id=33, start=387, end=399), KiltDocAnchor(text='vaccine', href='vaccine', paragraph_id=33, start=402, end=409), KiltDocAnchor(text='prenatal stress', href='prenatal%20stress', paragraph_id=33, start=416, end=431), KiltDocAnchor(text='vaccine "overload"', href='Vaccine%20controversy%23Vaccine%20overload', paragraph_id=34, start=154, end=172), KiltDocAnchor(text='vaccine preservative', href='Thiomersal%20controversy', paragraph_id=34, start=176, end=196), KiltDocAnchor(text='MMR vaccine', href='MMR%20vaccine%20controversy', paragraph_id=34, start=205, end=216), KiltDocAnchor(text='childhood immunizations', href='childhood%20immunizations', 
paragraph_id=34, start=539, end=562), KiltDocAnchor(text='outbreaks of previously controlled childhood diseases', href='MMR%20vaccine%20controversy%23Disease%20outbreaks', paragraph_id=34, start=564, end=617), KiltDocAnchor(text='pathophysiology', href='pathophysiology', paragraph_id=36, start=181, end=196), KiltDocAnchor(text='neuropsychological', href='neuropsychological', paragraph_id=36, start=263, end=281), KiltDocAnchor(text='gut–brain axis', href='gut%E2%80%93brain%20axis', paragraph_id=37, start=23, end=37), KiltDocAnchor(text='gastrointestinal', href='gastrointestinal%20tract', paragraph_id=37, start=119, end=135), KiltDocAnchor(text='autonomic nervous system', href='autonomic%20nervous%20system', paragraph_id=37, start=169, end=193), KiltDocAnchor(text='gut flora', href='gut%20flora', paragraph_id=37, start=195, end=204), KiltDocAnchor(text='metabolite', href='metabolite', paragraph_id=37, start=227, end=237), KiltDocAnchor(text='enteric nervous system', href='enteric%20nervous%20system', paragraph_id=37, start=319, end=341), KiltDocAnchor(text='synaptic', href='Synapse', paragraph_id=38, start=35, end=43), KiltDocAnchor(text='cell adhesion', href='cell%20adhesion', paragraph_id=38, start=183, end=196), KiltDocAnchor(text='teratogen', href='teratogen', paragraph_id=38, start=392, end=401), KiltDocAnchor(text='birth defect', href='birth%20defect', paragraph_id=38, start=422, end=434), KiltDocAnchor(text='conception', href='Human%20fertilization', paragraph_id=38, start=515, end=525), KiltDocAnchor(text='Diagnosis', href='Medical%20diagnosis', paragraph_id=40, start=0, end=9), KiltDocAnchor(text='DSM-5', href='DSM-5', paragraph_id=40, start=66, end=71), KiltDocAnchor(text='idiosyncratic language', href='Idiosyncrasy%23Psychiatry%20and%20psychology', paragraph_id=40, start=512, end=534), KiltDocAnchor(text='Rett syndrome', href='Rett%20syndrome', paragraph_id=40, start=639, end=652), KiltDocAnchor(text='intellectual disability', 
href='intellectual%20disability', paragraph_id=40, start=654, end=677), KiltDocAnchor(text='ICD-10', href='ICD-10', paragraph_id=40, start=709, end=715), KiltDocAnchor(text='Autism Diagnostic Interview-Revised', href='Autism%20Diagnostic%20Interview-Revised', paragraph_id=41, start=92, end=127), KiltDocAnchor(text='Autism Diagnostic Observation Schedule', href='Autism%20Diagnostic%20Observation%20Schedule', paragraph_id=41, start=182, end=220), KiltDocAnchor(text='Childhood Autism Rating Scale', href='Childhood%20Autism%20Rating%20Scale', paragraph_id=41, start=281, end=310), KiltDocAnchor(text='pediatrician', href='pediatrician', paragraph_id=42, start=2, end=14), KiltDocAnchor(text='medical conditions', href='medical%20conditions', paragraph_id=42, start=356, end=374), KiltDocAnchor(text='neuropsychologist', href='neuropsychologist', paragraph_id=42, start=388, end=405), KiltDocAnchor(text='differential diagnosis', href='differential%20diagnosis', paragraph_id=42, start=535, end=557), KiltDocAnchor(text='intellectual disability', href='intellectual%20disability', paragraph_id=42, start=600, end=623), KiltDocAnchor(text='hearing impairment', href='hearing%20impairment', paragraph_id=42, start=625, end=643), KiltDocAnchor(text='specific language impairment', href='specific%20language%20impairment', paragraph_id=42, start=651, end=679), KiltDocAnchor(text='Landau–Kleffner syndrome', href='Landau%E2%80%93Kleffner%20syndrome', paragraph_id=42, start=688, end=712), KiltDocAnchor(text='depression', href='Major%20depressive%20disorder', paragraph_id=42, start=809, end=819), KiltDocAnchor(text='Clinical genetics', href='Clinical%20genetics', paragraph_id=43, start=0, end=17), KiltDocAnchor(text='fragile X', href='fragile%20X', paragraph_id=43, start=327, end=336), KiltDocAnchor(text='genotype-first', href='Genotype-first%20approach', paragraph_id=43, start=348, end=362), KiltDocAnchor(text='Metabolic', href='Metabolic', paragraph_id=43, start=696, end=705), 
KiltDocAnchor(text='neuroimaging', href='neuroimaging', paragraph_id=43, start=710, end=722), KiltDocAnchor(text='visually impaired', href='visually%20impaired', paragraph_id=45, start=554, end=571), KiltDocAnchor(text='blindism', href='blindism', paragraph_id=45, start=727, end=735), KiltDocAnchor(text='pervasive developmental disorder', href='pervasive%20developmental%20disorder', paragraph_id=47, start=26, end=58), KiltDocAnchor(text='Asperger syndrome', href='Asperger%20syndrome', paragraph_id=48, start=23, end=40), KiltDocAnchor(text='Rett syndrome', href='Rett%20syndrome', paragraph_id=48, start=90, end=103), KiltDocAnchor(text='childhood disintegrative disorder', href='childhood%20disintegrative%20disorder', paragraph_id=48, start=108, end=141), KiltDocAnchor(text='PDD not otherwise specified', href='PDD%20not%20otherwise%20specified', paragraph_id=48, start=206, end=233), KiltDocAnchor(text='language development', href='language%20development', paragraph_id=48, start=427, end=447), KiltDocAnchor(text='phenotype', href='phenotype', paragraph_id=48, start=934, end=943), KiltDocAnchor(text='traits', href='Trait%20%28biology%29', paragraph_id=48, start=1020, end=1026), KiltDocAnchor(text='spectrum', href='Spectrum%20disorder', paragraph_id=49, start=42, end=50), KiltDocAnchor(text='developmentally disabled', href='developmentally%20disabled', paragraph_id=49, start=120, end=144), KiltDocAnchor(text='pedantic', href='pedantic', paragraph_id=49, start=319, end=327), KiltDocAnchor(text='high-functioning autism', href='high-functioning%20autism', paragraph_id=49, start=521, end=544), KiltDocAnchor(text='IQ', href='IQ', paragraph_id=49, start=575, end=577), KiltDocAnchor(text='syndromal', href='Syndrome', paragraph_id=49, start=747, end=756), KiltDocAnchor(text='intellectual disability', href='intellectual%20disability', paragraph_id=49, start=842, end=865), KiltDocAnchor(text='tuberous sclerosis', href='tuberous%20sclerosis', paragraph_id=49, start=923, end=941), 
KiltDocAnchor(text='overlap between Asperger syndrome, HFA, and non-syndromal autism', href='Diagnosis%20of%20Asperger%20syndrome%23Differences%20from%20high-functioning%20autism', paragraph_id=49, start=1064, end=1128), KiltDocAnchor(text='regressive autism', href='regressive%20autism', paragraph_id=50, start=260, end=277), KiltDocAnchor(text='psychiatry', href='psychiatry', paragraph_id=51, start=194, end=204), KiltDocAnchor(text='psychology', href='psychology', paragraph_id=51, start=206, end=216), KiltDocAnchor(text='neurology', href='neurology', paragraph_id=51, start=218, end=227), KiltDocAnchor(text='pediatrics', href='pediatrics', paragraph_id=51, start=232, end=242), KiltDocAnchor(text='fMRI', href='fMRI', paragraph_id=51, start=271, end=275), KiltDocAnchor(text='diffusion tensor imaging', href='diffusion%20tensor%20imaging', paragraph_id=51, start=280, end=304), KiltDocAnchor(text='phenotype', href='phenotype', paragraph_id=51, start=345, end=354), KiltDocAnchor(text='brain scan', href='brain%20scan', paragraph_id=51, start=398, end=408), KiltDocAnchor(text='neurogenetic', href='neurogenetic', paragraph_id=51, start=427, end=439), KiltDocAnchor(text='fusiform face area', href='fusiform%20face%20area', paragraph_id=51, start=498, end=516), KiltDocAnchor(text='babbling', href='babbling', paragraph_id=54, start=15, end=23), KiltDocAnchor(text='gesturing', href='Gesture', paragraph_id=55, start=15, end=24), KiltDocAnchor(text='echolalic', href='echolalia', paragraph_id=57, start=47, end=56), KiltDocAnchor(text='United States Preventive Services Task Force', href='United%20States%20Preventive%20Services%20Task%20Force', paragraph_id=59, start=4, end=48), KiltDocAnchor(text='screen', href='Screening%20%28medicine%29', paragraph_id=59, start=190, end=196), KiltDocAnchor(text='Modified Checklist for Autism in Toddlers', href='Modified%20Checklist%20for%20Autism%20in%20Toddlers', paragraph_id=59, start=471, end=512), KiltDocAnchor(text='M-CHAT', 
href='Modified%20Checklist%20for%20Autism%20in%20Toddlers', paragraph_id=59, start=623, end=629), KiltDocAnchor(text='Checklist for Autism in Toddlers', href='Checklist%20for%20Autism%20in%20Toddlers', paragraph_id=59, start=655, end=687), KiltDocAnchor(text='sensitivity', href='Sensitivity%20%28tests%29', paragraph_id=59, start=798, end=809), KiltDocAnchor(text='specificity', href='Specificity%20%28tests%29', paragraph_id=59, start=842, end=853), KiltDocAnchor(text='genetic screening', href='genetic%20screening', paragraph_id=59, start=1151, end=1168), KiltDocAnchor(text='dysmorphic feature', href='dysmorphic%20feature', paragraph_id=59, start=1296, end=1314), KiltDocAnchor(text='rubella', href='rubella', paragraph_id=61, start=21, end=28), KiltDocAnchor(text='pregnancy', href='pregnancy', paragraph_id=61, start=36, end=45), KiltDocAnchor(text='vaccination against rubella', href='rubella%20vaccine', paragraph_id=61, start=87, end=114), KiltDocAnchor(text='behavior analysts', href='Professional%20practice%20of%20behavior%20analysis', paragraph_id=63, start=472, end=489), KiltDocAnchor(text='special education', href='special%20education', paragraph_id=63, start=491, end=508), KiltDocAnchor(text='teachers', href='teachers', paragraph_id=63, start=509, end=517), KiltDocAnchor(text='speech pathologists', href='speech-language%20pathology', paragraph_id=63, start=519, end=538), KiltDocAnchor(text='psychologists', href='psychologists', paragraph_id=63, start=553, end=566), KiltDocAnchor(text='efficacy', href='efficacy', paragraph_id=63, start=664, end=672), KiltDocAnchor(text='psychosocial', href='psychosocial', paragraph_id=63, start=775, end=787), KiltDocAnchor(text='systematic review', href='systematic%20review', paragraph_id=63, start=931, end=948), KiltDocAnchor(text='special education', href='special%20education', paragraph_id=63, start=1140, end=1157), KiltDocAnchor(text='applied behavior analysis', href='applied%20behavior%20analysis', paragraph_id=65, start=45, 
end=70), KiltDocAnchor(text='speech and language therapy', href='speech%20and%20language%20therapy', paragraph_id=65, start=121, end=148), KiltDocAnchor(text='social skills', href='social%20skills', paragraph_id=65, start=150, end=163), KiltDocAnchor(text='occupational therapy', href='occupational%20therapy', paragraph_id=65, start=177, end=197), KiltDocAnchor(text='structured type of ABA', href='Discrete%20trial%20training', paragraph_id=65, start=524, end=546), KiltDocAnchor(text='structured', href='Discrete%20trial%20training', paragraph_id=65, start=714, end=724), KiltDocAnchor(text='naturalistic', href='Pivotal%20response%20treatment', paragraph_id=65, start=729, end=741), KiltDocAnchor(text='Early, intensive ABA therapy', href='Discrete%20trial%20training', paragraph_id=66, start=0, end=28), KiltDocAnchor(text='naturalistic form of ABA', href='Pivotal%20response%20treatment', paragraph_id=66, start=306, end=330), KiltDocAnchor(text='ADHD', href='ADHD', paragraph_id=68, start=201, end=205), KiltDocAnchor(text='anxiety', href='anxiety%20disorder', paragraph_id=68, start=209, end=216), KiltDocAnchor(text='psychoactive drug', href='psychoactive%20drug', paragraph_id=68, start=282, end=299), KiltDocAnchor(text='anticonvulsant', href='anticonvulsant', paragraph_id=68, start=304, end=318), KiltDocAnchor(text='antidepressant', href='antidepressant', paragraph_id=68, start=361, end=375), KiltDocAnchor(text='stimulant', href='stimulant', paragraph_id=68, start=378, end=387), KiltDocAnchor(text='antipsychotic', href='antipsychotic', paragraph_id=68, start=394, end=407), KiltDocAnchor(text='atypical antipsychotic', href='atypical%20antipsychotic', paragraph_id=68, start=414, end=436), KiltDocAnchor(text='risperidone', href='risperidone', paragraph_id=68, start=443, end=454), KiltDocAnchor(text='aripiprazole', href='aripiprazole', paragraph_id=68, start=459, end=471), KiltDocAnchor(text='FDA', href='Food%20and%20drug%20administration', paragraph_id=68, start=476, 
end=479), KiltDocAnchor(text='SSRI antidepressants', href='SSRI%20antidepressants', paragraph_id=68, start=768, end=788), KiltDocAnchor(text='fluoxetine', href='fluoxetine', paragraph_id=68, start=798, end=808), KiltDocAnchor(text='fluvoxamine', href='fluvoxamine', paragraph_id=68, start=813, end=824), KiltDocAnchor(text='methylphenidate', href='methylphenidate', paragraph_id=68, start=939, end=954), KiltDocAnchor(text='alternative therapies and interventions', href='Alternative%20therapies%20for%20developmental%20and%20learning%20disabilities', paragraph_id=70, start=14, end=53), KiltDocAnchor(text='quality-of-life', href='Quality%20of%20life', paragraph_id=70, start=164, end=179), KiltDocAnchor(text='casein-free diet', href='casein-free%20diet', paragraph_id=70, start=503, end=519), KiltDocAnchor(text='calcium', href='calcium', paragraph_id=70, start=560, end=567), KiltDocAnchor(text='vitamin D', href='vitamin%20D', paragraph_id=70, start=572, end=581), KiltDocAnchor(text='gastrointestinal disorders', href='gastrointestinal%20disease', paragraph_id=70, start=678, end=704), KiltDocAnchor(text='chelation therapy', href='chelation%20therapy', paragraph_id=70, start=723, end=740), KiltDocAnchor(text='CEASE therapy', href='CEASE%20therapy', paragraph_id=70, start=950, end=963), KiltDocAnchor(text='homeopathy', href='homeopathy', paragraph_id=70, start=978, end=988), KiltDocAnchor(text='alternative treatment', href='complementary%20and%20alternative%20medicine', paragraph_id=71, start=30, end=51), KiltDocAnchor(text='gluten- and casein-free diet', href='gluten-free%2C%20casein-free%20diet', paragraph_id=71, start=128, end=156), KiltDocAnchor(text='food intolerance', href='food%20intolerance', paragraph_id=71, start=312, end=328), KiltDocAnchor(text='allergies', href='food%20allergy', paragraph_id=71, start=333, end=342), KiltDocAnchor(text='gluten sensitivity', href='non-celiac%20gluten%20sensitivity', paragraph_id=72, start=33, end=51), KiltDocAnchor(text='gluten-free 
diet', href='gluten-free%20diet', paragraph_id=72, start=99, end=115), KiltDocAnchor(text='music therapy', href='music%20therapy', paragraph_id=73, start=33, end=46), KiltDocAnchor(text='hyperbaric treatment', href='Hyperbaric%20medicine', paragraph_id=73, start=180, end=200), KiltDocAnchor(text='social support', href='social%20support', paragraph_id=75, start=436, end=450), KiltDocAnchor(text='self-determination', href='Self-determination%20theory', paragraph_id=75, start=513, end=531), KiltDocAnchor(text='prognosis', href='prognosis', paragraph_id=76, start=43, end=52), KiltDocAnchor(text='IQ', href='IQ', paragraph_id=76, start=225, end=227), KiltDocAnchor(text='independent living', href='independent%20living', paragraph_id=76, start=297, end=315), KiltDocAnchor(text='review', href='review', paragraph_id=79, start=12, end=18), KiltDocAnchor(text='NHS', href='National%20Health%20Service', paragraph_id=79, start=385, end=388), KiltDocAnchor(text='PDD-NOS', href='PDD-NOS', paragraph_id=79, start=502, end=509), KiltDocAnchor(text='childhood disintegrative disorder', href='childhood%20disintegrative%20disorder', paragraph_id=79, start=597, end=630), KiltDocAnchor(text='Genetic disorder', href='Genetic%20disorder', paragraph_id=84, start=12, end=28), KiltDocAnchor(text='Mendelian', href='Mendelian', paragraph_id=84, start=81, end=90), KiltDocAnchor(text='chromosome abnormality', href='chromosome%20abnormality', paragraph_id=84, start=116, end=138), KiltDocAnchor(text='Intellectual disability', href='Intellectual%20disability', paragraph_id=85, start=12, end=35), KiltDocAnchor(text='assessing', href='Intelligence%20assessment', paragraph_id=85, start=222, end=231), KiltDocAnchor(text='intelligence', href='Controversies%20in%20autism%23Intelligence', paragraph_id=85, start=232, end=244), KiltDocAnchor(text='Anxiety disorder', href='Anxiety%20disorder', paragraph_id=86, start=12, end=28), KiltDocAnchor(text='Epilepsy', href='Epilepsy', paragraph_id=87, start=12, end=20), 
KiltDocAnchor(text='language disorder', href='language%20disorder', paragraph_id=87, start=99, end=116), KiltDocAnchor(text='metabolic defect', href='metabolic%20defect', paragraph_id=88, start=20, end=36), KiltDocAnchor(text='phenylketonuria', href='phenylketonuria', paragraph_id=88, start=47, end=62), KiltDocAnchor(text='Minor physical anomalies', href='Minor%20physical%20anomalies', paragraph_id=89, start=12, end=36), KiltDocAnchor(text='Attention deficit hyperactivity disorder (ADHD)', href='Attention%20deficit%20hyperactivity%20disorder', paragraph_id=90, start=150, end=197), KiltDocAnchor(text='Tourette syndrome', href='Tourette%20syndrome', paragraph_id=90, start=199, end=216), KiltDocAnchor(text='comorbid diagnoses', href='Conditions%20comorbid%20to%20autism%20spectrum%20disorders', paragraph_id=90, start=276, end=294), KiltDocAnchor(text='insomnia', href='insomnia', paragraph_id=91, start=143, end=151), KiltDocAnchor(text='nocturnal awakenings', href='middle-of-the-night%20insomnia', paragraph_id=91, start=199, end=219), KiltDocAnchor(text='Table Talk', href='Table%20Talk%20%28Luther%29', paragraph_id=93, start=101, end=111), KiltDocAnchor(text='Martin Luther', href='Martin%20Luther', paragraph_id=93, start=116, end=129), KiltDocAnchor(text='Wild Boy of Aveyron', href='Wild%20Boy%20of%20Aveyron', paragraph_id=93, start=655, end=674), KiltDocAnchor(text='feral child', href='feral%20child', paragraph_id=93, start=678, end=689), KiltDocAnchor(text='Jean Itard', href='Jean%20Marc%20Gaspard%20Itard', paragraph_id=93, start=758, end=768), KiltDocAnchor(text='New Latin', href='New%20Latin', paragraph_id=94, start=4, end=13), KiltDocAnchor(text='Swiss', href='Swiss%20people', paragraph_id=94, start=79, end=84), KiltDocAnchor(text='Eugen Bleuler', href='Eugen%20Bleuler', paragraph_id=94, start=98, end=111), KiltDocAnchor(text='schizophrenia', href='schizophrenia', paragraph_id=94, start=151, end=164), KiltDocAnchor(text='Grunya Sukhareva', 
href='Grunya%20Sukhareva', paragraph_id=94, start=451, end=467), KiltDocAnchor(text='Hans Asperger', href='Hans%20Asperger', paragraph_id=96, start=59, end=72), KiltDocAnchor(text='Vienna University Hospital', href='Vienna%20General%20Hospital', paragraph_id=96, start=80, end=106), KiltDocAnchor(text='child psychology', href='child%20psychology', paragraph_id=96, start=189, end=205), KiltDocAnchor(text='Asperger syndrome', href='Asperger%20syndrome', paragraph_id=96, start=254, end=271), KiltDocAnchor(text='Leo Kanner', href='Leo%20Kanner', paragraph_id=96, start=365, end=375), KiltDocAnchor(text='Johns Hopkins Hospital', href='Johns%20Hopkins%20Hospital', paragraph_id=96, start=383, end=405), KiltDocAnchor(text='Donald Triplett', href='Donald%20Triplett', paragraph_id=97, start=0, end=15), KiltDocAnchor(text='refrigerator mother', href='refrigerator%20mother', paragraph_id=98, start=211, end=230), KiltDocAnchor(text='social stigma', href='social%20stigma', paragraph_id=100, start=308, end=321), KiltDocAnchor(text='primary care physician', href='primary%20care%20physician', paragraph_id=100, start=408, end=430), KiltDocAnchor(text='medical specialist', href='medical%20specialist', paragraph_id=100, start=436, end=454), KiltDocAnchor(text='DSM-III', href='DSM-III', paragraph_id=101, start=27, end=34), KiltDocAnchor(text='DSM-III-R', href='DSM-III-R', paragraph_id=101, start=102, end=111), KiltDocAnchor(text='DSM-5', href='DSM-5', paragraph_id=101, start=173, end=178), KiltDocAnchor(text='PDD-NOS', href='PDD-NOS', paragraph_id=101, start=297, end=304), KiltDocAnchor(text='Asperger syndrome', href='Asperger%20syndrome', paragraph_id=101, start=314, end=331), KiltDocAnchor(text='Rett syndrome', href='Rett%20syndrome', paragraph_id=101, start=333, end=346), KiltDocAnchor(text='CDD', href='Childhood%20disintegrative%20disorder', paragraph_id=101, start=352, end=355), KiltDocAnchor(text='Societal and cultural aspects of autism', 
href='Societal%20and%20cultural%20aspects%20of%20autism', paragraph_id=102, start=199, end=238), KiltDocAnchor(text='autism is simply another way of being', href='Neurodiversity', paragraph_id=102, start=316, end=353), KiltDocAnchor(text='autistic rights', href='autistic%20rights', paragraph_id=104, start=52, end=67), KiltDocAnchor(text='neurodiversity', href='neurodiversity', paragraph_id=104, start=72, end=86), KiltDocAnchor(text='World Autism Awareness Day', href='World%20Autism%20Awareness%20Day', paragraph_id=104, start=113, end=139), KiltDocAnchor(text='Autism Sunday', href='Autism%20Sunday', paragraph_id=104, start=141, end=154), KiltDocAnchor(text='Autistic Pride Day', href='Autistic%20Pride%20Day', paragraph_id=104, start=156, end=174), KiltDocAnchor(text='Autreat', href='Autreat', paragraph_id=104, start=176, end=183), KiltDocAnchor(text='Autism Speaks', href='Autism%20Speaks', paragraph_id=104, start=262, end=275), KiltDocAnchor(text='Autism National Committee', href='Autism%20National%20Committee', paragraph_id=104, start=277, end=302), KiltDocAnchor(text='Autism Society of America', href='Autism%20Society%20of%20America', paragraph_id=104, start=308, end=333), KiltDocAnchor(text='autism rights movement', href='autism%20rights%20movement', paragraph_id=106, start=4, end=26), KiltDocAnchor(text='social movement', href='social%20movement', paragraph_id=106, start=32, end=47), KiltDocAnchor(text='disability right', href='disability%20rights%20movement', paragraph_id=106, start=70, end=86), KiltDocAnchor(text='neurodiversity', href='neurodiversity', paragraph_id=106, start=119, end=133), KiltDocAnchor(text='human brain', href='human%20brain', paragraph_id=106, start=204, end=215), KiltDocAnchor(text='minority group', href='minority%20group', paragraph_id=106, start=492, end=506), KiltDocAnchor(text='human genome', href='human%20genome', paragraph_id=106, start=648, end=660), KiltDocAnchor(text='fringe theories', href='fringe%20theories', paragraph_id=106, 
start=859, end=874), KiltDocAnchor(text='vaccines', href='Vaccine', paragraph_id=106, start=930, end=938), KiltDocAnchor(text='high-functioning', href='High-functioning%20autism', paragraph_id=106, start=1020, end=1036), KiltDocAnchor(text='Asperger syndrome', href='Asperger%20syndrome', paragraph_id=106, start=1046, end=1063), KiltDocAnchor(text='low-functioning', href='Low-functioning%20autism', paragraph_id=106, start=1099, end=1114)), ('Neurological disorders in children', 'Autism', 'Articles containing video clips', 'Neurological disorders', 'Communication disorders', 'Pervasive developmental disorders', 'Psychiatric diagnosis', 'Mental and behavioural disorders'), 'Q38404', '908372970', '2019-07-29T09:08:56Z', '908206160', '25', 'https://en.wikipedia.org/w/index.php?title=Autism&oldid=908372970'), 5903529: KiltDoc('5141410', 'Origin of the domestic dog', re.compile("^Origin of the domestic dog\nThe origin of the domestic dog includes the dog's evolutionary divergence.{71155} the Holarctic temperate zone hunting dogs were a widespread adaptation to forest ungulate hunting\\.\n$", flags=48), ('Origin of the domestic dog\n', 'The origin of the domestic dog includes the dog\'s evolutionary divergence from the wolf, its domestication, and its development into dog types and dog breeds. The dog is a member of the genus "Canis", which forms part of the wolf-like canids, and was the first species and the only large carnivore to have been domesticated. The dog and the extant gray wolf are sister taxa, as modern wolves are not closely related to the population of wolves that was first domesticated.\n', 'The genetic divergence between dogs and wolves occurred between 40,000–20,000 years ago, just before or during the Last Glacial Maximum. This timespan represents the upper time-limit for the commencement of domestication because it is the time of divergence and not the time of domestication, which occurred later. 
The domestication of animals commenced over 15,000 years ago, beginning with the grey wolf ("Canis lupus") by nomadic hunter-gatherers. The archaeological record and genetic analysis show the remains of the Bonn–Oberkassel dog buried beside humans 14,200 years ago to be the first undisputed dog, with disputed remains occurring 36,000 years ago. It was not until 11,000 years ago that people living in the Near East entered into relationships with wild populations of aurochs, boar, sheep, and goats.\n', 'Where the domestication of the dog took place remains debated, with the most plausible proposals spanning Western Europe, Central Asia and East Asia. This has been made more complicated by the recent proposal that an initial wolf population split into East and West Eurasian groups. These two groups, before going extinct, were domesticated independently into two distinct dog populations between 14,000 and 6,400 years ago. The Western Eurasian dog population was gradually and partially replaced by East Asian dogs introduced by humans at least 6,400 years ago. This proposal is also debated.\n', 'Section::::Canid and human evolution.\n', "Six million years ago, towards the close of the Miocene era, the earth's climate gradually cooled. This would lead to the glaciations of the Pliocene and the Pleistocene, which are commonly referred to as the Ice Age. In many areas, forests and savannahs were replaced with steppes or grasslands, and only those species of creature that adapted to these changes would survive.\n", 'In southern North America, small woodland foxes grew bigger and better adapted to running, and by the late Miocene the first of the genus "Canis" had arisen - the ancestors of coyotes, wolves and the domestic dog. In eastern Africa, a split occurred among the large primates. 
Some remained in the trees, while others came down from the trees, learned to walk upright, developed larger brains, and in the more open country learned to avoid predators while becoming predators themselves. The ancestors of humans and dogs would ultimately meet in Eurasia.\n', "Human hunter-gatherers did not live in fear of nature and knew that they posed a formidable risk to any potential predators. Today, the Ju'wasi people of Namibia share their land with prides of lions. Both species coexist with respect and without fear or hostility in a relationship that may go back to the dawn of modern humans. The lion is a much larger and far more dangerous predator than the wolf. Early modern humans entering Eurasia and first encountering packs of wolves may have been assisted in living among them because of the traditional beliefs of their African ancestors. In historical times, mutual respect and cooperation with canids can be found in the stories and traditions of the indigenous peoples of Siberia, East Asia, North America, and Australia.\n", 'Section::::Divergence from wolves.\n', 'The domestication of animals commenced over 15,000 years before present (YBP), beginning with the grey wolf ("Canis lupus") by nomadic hunter-gatherers. It was not until 11,000 YBP that people living in the Near East entered into relationships with wild populations of aurochs, boar, sheep, and goats. A domestication process then began to develop. The grey wolf most likely followed the commensal pathway to domestication. When, where, and how many times wolves may have been domesticated remains debated because only a small number of ancient specimens have been found, and both archaeology and genetics continue to provide conflicting evidence. The most widely accepted, earliest dog remains date back 15,000 YBP to the Bonn–Oberkassel dog. Earlier remains dating back to 30,000 YBP have been described as Paleolithic dogs, however their status as dogs or wolves remains debated. 
Recent studies indicate that a genetic divergence occurred between dogs and wolves 20,000-40,000 YBP, however this is the upper time-limit for domestication because it represents the time of divergence and not the time of domestication.\n', 'Section::::Divergence from wolves.:Time of genetic divergence.\n', "The date estimated for the evolutionary divergence of a domestic lineage from a wild one does not necessarily indicate the start of the domestication process but it does provide an upper boundary. The divergence of the lineage that led to the domestic horse from the lineage that led to the modern Przewalski's horse is estimated to have occurred around 45,000 YBP but the archaeological record indicates 5,500 YBP. The variance can be due to modern wild populations not being the direct ancestor of the domestic ones, or to a divergence caused by changes in the climate, topography, or other environmental influences. The evolutionary divergence time for the wolf and dog is indicated to have occurred somewhere between 20,000-60,000 YBP but this does not imply that domestication occurred during this period.\n", 'Section::::Divergence from wolves.:Time of genetic divergence.:Paleobiogeography.\n', 'During the Late Pleistocene glaciation, a vast mammoth steppe stretched from Spain eastwards across Eurasia and over the Bering land bridge into Alaska and the Yukon. The Late Pleistocene was characterized by a series of severe and rapid climate oscillations with regional temperature changes of up to , which has been correlated with megafaunal extinctions. There is no evidence of megafaunal extinctions at the height of the Last Glacial Maximum, indicating that increasing cold and glaciation were not factors. Multiple events appear to have caused the rapid replacement of one species by another one within the same genus, or one population by another within the same species, across a broad area. 
As some species became extinct, so too did the predators that depended on them.\n', 'The origin of dogs is couched in the paleobiogeography of wolf populations during the Late Pleistocene. The earliest fossils of "Canis lupus" were found in what was once eastern Beringia at Old Crow, Yukon, Canada and at Cripple Creek Sump, Fairbanks, Alaska. The age is not agreed but could date 1 million YBP. Considerable morphological diversity existed among grey wolves by the Late Pleistocene. These are regarded as having been more cranio-dentally robust than modern grey wolves, often with a shortened rostrum, the pronounced development of the temporalis muscle, and robust premolars. It is proposed that these features were specialized adaptations for the processing of carcass and bone associated with the hunting and scavenging of Pleistocene megafauna. Compared with modern wolves, some Pleistocene wolves showed an increase in tooth breakage that is similar to that seen in the extinct dire wolf. This suggests that these either often processed carcasses, or that they competed with other carnivores and needed to quickly consume their prey. The frequency and location of tooth fractures found in these wolves compared with the modern spotted hyena indicates that these wolves were habitual bone crackers.\n', 'Section::::Divergence from wolves.:Time of genetic divergence.:Timespan.\n', "Genetic studies indicate that the gray wolf is the closest living relative of the dog, with no evidence of any other canine species having contributed. Attempting to reconstruct the dog's lineage through the phylogenetic analysis of DNA sequences from modern dogs and wolves has given conflicting results for several reasons. Firstly, studies indicate that an extinct Late Pleistocene wolf is the nearest common ancestor to the dog, with modern wolves not being the dog's direct ancestor. 
Secondly, the genetic divergence between the dog and modern wolves occurred over a short period of time, so that the time of the divergence is difficult to date (referred to as incomplete lineage sorting). This is complicated further by the cross-breeding that has occurred between dogs and wolves since domestication (referred to as post-domestication gene flow). Finally, there have been only tens of thousands of generations of dogs since domestication, so that the number of mutations between the dog and the wolf are few and this makes the timing of domestication difficult to date.\n", "In 2013, the whole genome sequencing of modern dogs and wolves indicated a divergence time of 32,000 YBP. In 2014, another study indicated 16,000-11,000 YBP. The first draft genome sequence of a Pleistocene canid was published in 2015. This Taymyr Peninsula wolf belonged to a population that had diverged from the ancestors of both modern wolves and dogs. Radiocarbon dating indicates its age to be 35,000 YBP, and this age could then be used to re-calibrate the wolf's mutation rate, indicating that the genetic divergence between dogs and wolves occurred before the Last Glacial Maximum, between 40,000–27,000 YBP. When this mutation rate was applied to the timing of the 2014 study, that study gave the same result of 40,000–27,000 YBP.\n", 'Section::::Divergence from wolves.:Place of genetic divergence.\n', 'Most genetic studies conducted over the last two decades were based on modern dog breeds and extant wolf populations, with their findings dependent on a number of assumptions. These studies assumed that the extant wolf was the ancestor of the dog, did not consider genetic admixture between wolves and dogs, nor the impact of incomplete lineage sorting. These pre-genomic studies have suggested an origin of dogs in Southeast Asia, East Asia, Central Asia, the Middle East, or Europe. 
More recently, the field of Paleogenomics applies the latest molecular technologies to fossil remains that still contain useful DNA. \n', 'Section::::Divergence from wolves.:Place of genetic divergence.:Europe.\n', 'In 2013, a study sequenced the complete mitochondrial genomes of 18 fossil canids and these were then compared with the mitochondrial genomes from modern wolves and dogs in a phylogenetic tree. The study found that there exists more genetic variation between the 18 fossil canids than exists between all modern wolves and dogs. The analyses included the oldest fossils proposed as early dogs dating back as far as 36,000 YBP. However, these Goyet Cave canids were found to have formed an ancient and extinct sister group to modern dogs and wolves. The authors propose that these dog-like fossils were either an early domestication attempt that left no descendants among modern dogs, or that they were a specialized wolf ecomorph whose morphological features were part of the Late Pleistocene wolf diversity.\n', 'An earlier study of the Razboinichya Cave canid that was based on a segment of its mitochondrial DNA (mDNA) had concluded that it was an early dog. This fossil was included for the analysis of its complete mitochondrial genome and on the phylogenetic tree was assigned a much more basal position. The study inferred from the tree that dogs and wolves split 32,000-19,000 YBP and therefore the beginning of domestication occurred in the time of hunter-gatherers rather than in the time of farmers. The tree also confirmed that the ancient dogs of the Americas originated in Eurasia.\n', 'The phylogenetic analyses revealed that three of the four major mDNA clades of dogs relate most closely to the ancient canids from Europe rather than those from China or the Middle East, which supports a European origin of modern dogs. 
No modern wolf population related closer to dogs than the ancient canids from Europe, indicating that the wolf population that was the ancestor of the dog is extinct.\n', 'Section::::Divergence from wolves.:Place of genetic divergence.:Arctic northeastern Siberia.\n', 'In 2015, a study recovered mDNA from ancient canid specimens that were discovered in arctic northeastern Siberia (which was once western Beringia). These specimens included the mandible of a 360,000-400,000 YBP "Canis c.f. variabilis" (where c.f. is a Latin term meaning uncertain). Phylogenetic analyses of these canids revealed nine mDNA haplotypes not detected before. The "Canis c.f. variabilis" specimen clustered with other wolf samples from across Russia and Asia. The mDNA haplotypes of one 8,750 YBP specimen and some 28,000 YBP specimens matched with those of geographically widely-spread modern dogs. One 47,000 YBP canid was distinct from wolves but was only a few mutations away from those haplotypes found in modern dogs. The authors concluded that the structure of the modern dog gene pool was contributed to from ancient Siberian wolves and possibly from "Canis c.f. variabilis".\n', 'Section::::Divergence from wolves.:Place of genetic divergence.:Two origins.\n', 'Dogs show both ancient and modern lineages. The ancient lineages appear most in Asia but least in Europe because the Victorian era development of modern dog breeds used little of the ancient lineages. All dog populations (breed, village, and feral) show some evidence of genetic admixture between modern and ancient dogs. Some ancient dog populations that once occupied Europe and the New World no longer exist. This implies that some ancient dog populations were entirely replaced and others admixed over a long period of time. 
European dog populations have undergone extensive turnover during the last 15,000 years which has erased the genomic signature of early European dogs, the genetic heritage of the modern breeds has become blurred due to admixture, and there was the possibility of past domestication events that had gone extinct or had been largely replaced by more modern dog populations.\n', 'Siberian huskies and other northern breeds can trace at least some of their ancestry (1.4–27.3%) back to the Taimyr wolf, which indicates the possibility of more than one domestication event.\n', 'In 2016, a study compared the mitochondrial DNA and whole-genome sequences of a worldwide panel of modern dogs, the mDNA sequences of 59 ancient European dog specimens dated 14,000-3,000 YBP, and the nuclear genome sequence of a dog specimen that was found in the Late Neolithic passage grave at Newgrange, Ireland and radiocarbon dated at 4,800 YBP. A genetic analysis of the Newgrange dog showed that it was male, did not possess genetic variants associated with modern coat length nor color, was not as able to process starch as efficiently as modern dogs but more efficiently than wolves, and showed ancestry from a population of wolves that could not be found in other dogs nor wolves today. As the taxonomic classification of the "proto-dog" Paleolithic dogs as being either dogs or wolves remains controversial, they were excluded from the study. The phylogenetic tree generated from mDNA sequences found a deep division between the Sarloos wolfdog and all other dogs, indicating that breed\'s recent deriving from the German Shepherd and captive gray wolves. 
The next largest division was between eastern Asian dogs and western Eurasian (Europe and the Middle East) dogs that had occurred between 14,000-6,400 YBP, with the Newgrange dog clustering with the western Eurasian dogs.\n', 'The Newgrange and ancient European dog mDNA sequences could be largely assigned to mDNA haplogroups C and D but modern European dog sequences could be largely assigned to mDNA haplogroups A and B, indicating a turnover of dogs in the past from a place other than Europe. As this split dates older than the Newgrange dog this suggests that the replacement was only partial. The analysis showed that most modern European dogs had undergone a population bottleneck (reduction) which can be an indicator of travel. The archaeological record shows dog remains dating over 15,000 YBP in western Eurasia, over 12,500 YBP in eastern Eurasia, but none older than 8,000 YBP in Central Asia. The study proposes that dogs may have been domesticated separately in both eastern and western Eurasia from two genetically distinct and now extinct wolf populations. East Eurasian dogs then made their way with migrating people to western Europe between 14,000-6,400 YBP where they partially replaced the dogs of Europe. Two domestication events in western Eurasia and eastern Eurasia has recently been found for the domestic pig.\n', "The hypothesis is that two genetically different, and possibly now extinct, wolf populations were domesticated independently in eastern and western Eurasia to produce paleolithic dogs. The eastern Eurasian dogs then dispersed westward alongside humans, reaching western Europe 6,400–14,000 years ago where they partially replaced the western paleolithic dogs. A single domestication is thought to be due to chance, however dual domestication on different sides of the world is unlikely to have happened randomly and it suggests that external factors - an environmental driver - may have forced wolves to work together with humans for survival. 
It is possible that wolves took advantage of resources that humans had, or humans may have been introduced to wolves in an area in which they didn't previously live.\n", 'The study indicates that the western Eurasian wolf and dog populations genetically diverged 20,000-60,000 YBP. Immediately after this divergence, the dog population outnumbered the wolf population, and later the dog population underwent a population reduction to be much lower.\n', 'Section::::Divergence from wolves.:Place of genetic divergence.:Two origins disputed.\n', 'In 2017, a study compared the nuclear genome sequences of three ancient dog specimens from Germany and Ireland with sequences from over 5,000 dogs and wolves. These Neolithic dog specimens included a dog sample from the Early Neolithic site in Herxheim, Germany dated 7,000 YBP, one from the Late Neolithic site of Kirschbaum (Cherry Tree) Cave near Forchheim, Germany dated 4,700 YBP, and a dog from Newgrange, Ireland dated 4,800 YBP. The study found that modern European dogs descended from their Neolithic ancestors with no evidence of a population turnover. There was evidence of a single dog-wolf divergence occurring between 36,900-41,500 YBP, followed by a divergence between Southeast Asian and Western Eurasian dogs 17,500-23,900 YBP and this indicates a single dog domestication event occurring between 20,000-40,000 YBP. The 3 dogs indicated ancestry that could be found in South East Asian dogs. Additionally, the Cherry Tree Cave dog showed ancestry that could be found in the Middle East, India and Central Asia. The study did not support a dual domestication event, and detected admixture between the ancestors of modern European and Southeast Asian dogs.\n', 'A 2018 study of mDNA sequences shows that the pre-Neolithic dogs of Europe all fell under haplogroup C. The Neolithic and Post-Neolithic dogs from Southeastern Europe that are associated with farmers fell under haplogroup D. 
In Western and Northern Europe, haplogroup D became diluted into the native dog population. This implies that haplogroup D arrived in Europe 9,000 years ago from the Near East along with pigs, cows, sheep, and goats. Later in 2018, another study looked at the y-chromosome male lineage of the ancient fossils of the Herxheim, Kirschbaum, and Newgrange dogs along with other canines. The study identified six major dog yDNA haplogroups, of which two of these include the majority of modern dogs. The Newgrange dog fell into the most commonly occurring of these haplogroups. The two ancient German dogs fell into a haplogroup commonly found among dogs from the Middle East and Asia, with the Kirschbaum dog sharing a common male lineage with the extant Indian wolf. The study concluded that at least 2 different male haplogroups existed in ancient Europe, and that the dog male lineage diverged from its nearest common ancestor shared with the gray wolf sometime between 68,000-151,000 YBP.\n', 'Section::::Divergence from wolves.:Morphological divergence.\n', 'The questions of when and where dogs were first domesticated have taxed geneticists and archaeologists for decades. Identifying the earliest dogs is difficult because the key morphological characters that are used by zooarchaeologists to differentiate domestic dogs from their wild wolf ancestors (size and position of teeth, dental pathologies, and size and proportion of cranial and postcranial elements) were not yet fixed during the initial phases of the domestication process. The range of natural variation among these characters that may have existed in ancient wolf populations, and the time it took for these traits to appear in dogs, are unknown.\n', 'Section::::Divergence from wolves.:Morphological divergence.:Early dog specimens.\n', 'There are a number of recently discovered specimens which are proposed as being Paleolithic dogs, however their taxonomy is debated. 
These have been found in either Europe or Siberia and date 40,000-17,000 YBP. They include Hohle Fels in Germany, Goyet Caves in Belgium, Predmosti in the Czech Republic, and four sites in Russia: Razboinichya Cave in the Altai Republic, Kostyonki-8, Ulakhan Sular in the Sakha Republic, and Eliseevichi 1 on the Russian plain. Paw-prints from Chauvet Cave in France dated 26,000 YBP are suggested as being those of a dog, however these have been challenged as being left by a wolf.\n', 'There are also a number of later proposed Paleolithic dogs whose taxonomy has not been confirmed. These include a number of specimens from Germany (Kniegrotte, Oelknitz, Teufelsbrucke), Switzerland (Monruz, Kesslerloch, Champre-veyres-Hauterive), and Ukraine (Mezin, Mezhirich). A set of specimens dating 15,000-13,500 YBP have been confidently identified as domesticated dogs, based on their morphology and the archaeological sites in which they have been found. These include Spain (Erralla), France (Montespan, Le Morin, Le Closeau, Pont d’Ambon), and Germany (Bonn-Oberkassel). After this period, the remains of domesticated dogs have been identified from archaeological sites across Eurasia.\n', 'Possible dog domestication between 40,000-15,000 years ago is not clear due to the debate over what the Paleolithic dog specimens represent. This is due to the flexibility of genus "Canis" morphology, and the close morphological similarities between "Canis lupus" and "Canis familiaris". It is also due to the scarcity of Pleistocene wolf specimens available for analyses and so their morphological variation is unknown. Habitat type, climate, and prey specialization greatly modify the morphological plasticity of grey wolf populations, resulting in a range of morphologically, genetically, and ecologically distinct wolf morphotypes. 
With no baseline to work from, zooarchaeologists find it difficult to be able to differentiate between the initial indicators of dog domestication and various types of Late Pleistocene wolf ecomorphs, which can lead to the mis-identification of both early dogs and wolves. Additionally, the ongoing prehistoric admixture with local wolf populations during the domestication process may have led to canids that were domesticated in their behavior but wolflike in their morphology. Attempting to identify early tamed wolves, wolfdogs, or proto-dogs through morphological analysis alone may be impossible without the inclusion of genetic analyses.\n', 'A domestication study looked at the reasons why the archeological record that is based on the dating of fossil remains often differed from the genetic record contained within the cells of living species. The study concluded that our inability to date domestication is because domestication is a continuum and there is no single point where we can say that a species was clearly domesticated using these two techniques. The study proposes that changes in morphology across time and how humans were interacting with the species in the past needs to be considered in addition to these two techniques.\n', 'Section::::Dog domestication.\n', 'The earlier association of dogs with humans may have allowed dogs to have a profound influence on the course of early human history and the development of civilization. However, the timing, geographic locations, and ecological conditions that led to dog domestication are not agreed.\n', "There is clear evidence that dogs were derived from gray wolves during the initial phases of domestication and that no other canine species was involved. The wolf population(s) that were involved are likely to be extinct. 
Despite numerous genetic studies of both modern dogs and ancient dog remains, there is no firm consensus regarding either the timing or location(s) of domestication, the number of wolf populations that were involved, or the long-term effects domestication has had on the dog's genome.\n", "Genetic studies suggest a domestication process commencing over 25,000 YBP, in one or several wolf populations in either Europe, the high Arctic, or eastern Asia. The remains of large carcasses left by human hunter-gatherers may have led some wolves into entering a migratory relationship with humans. This could have led to their divergence from those wolves that remained in the one territory. A closer relationship between these wolves — or proto-dogs — and humans may have then developed, such as hunting together and mutual defence from other carnivores and other humans. Around 10,000 YBP agriculture was developed resulting in a sedentary lifestyle, along with phenotype divergence of the dog from its wolf ancestors, including variance in size. In the Victorian era, directed human selection developed the modern dog breeds, which resulted in a vast range of phenotypes. Each of these domestication phases have left their mark on the dog's genome.\n", 'Genetic studies support two population bottlenecks having occurred to the dog lineage, one due to the initial domestication and one due to the formation of dog breeds.\n', 'Section::::Dog domestication.:Time of domestication.\n', "An apex predator is a predator that sits on the top trophic level of the food chain, while a mesopredator sits further down the food chain and is dependent on smaller animals. Towards the end of the Pleistocene era, most of today's apex predators were mesopredators and this included the wolf. 
During the ecological upheaval associated with the close of the Late Pleistocene, one type of wolf population rose to become today's apex predator and another joined with humans to become an apex consumer.\n", 'In August 2015, a study undertook an analysis of the complete mitogenome sequences of 555 modern and ancient dogs. The sequences showed an increase in the population size approximately 23,500 YBP, which broadly coincides with the proposed separation of the ancestors of dogs and present-day wolves before the Last Glacial Maximum (refer first divergence). A ten-fold increase in the population size occurred after 15,000 YBP, which may be attributable to domestication events and is consistent with the demographic dependence of dogs on the human population.\n', 'Section::::Dog domestication.:Socialization.\n', 'Humans and wolves both exist in complex social groups. How humans and wolves got together remains unknown. One view holds that domestication as a process that is difficult to define. The term was developed by anthropologists with a human-centric view in which humans took wild animals (ungulates) and bred them to be "domestic", usually in order to provide improved food or materials for human consumption. That term may not be appropriate for a large carnivore such as the dog. This alternate view regards dogs as being either socialized and able to live among humans, or unsocialized. There exists today dogs that live with their human families but are unsocialized and will threaten strangers defensively and aggressively no different from a wild wolf. There also exists a number of cases where wild wolves have approached people in remote places, attempting to initiate play and to form companionship. One such notable wolf was Romeo, a gentle black wolf that formed relationships with the people and dogs of Juneau, Alaska. 
This view holds that before there could have been domestication of the wolf, there had to have been its socialization.\n', 'Section::::Dog domestication.:Commensal pathway.\n', 'Animal domestication is a coevolutionary process in which a population responds to selective pressure while adapting to a novel niche that included another species with evolving behaviors.\n', 'The dog is a classic example of a domestic animal that likely traveled a commensal pathway into domestication. The dog was the first domesticant, and was domesticated and widely established across Eurasia before the end of the Pleistocene, well before cultivation or the domestication of other animals. It may have been inevitable that the first domesticated animal came from the order of carnivores as these are less afraid when approaching other species. Within the carnivores, the first domesticated animal would need to exist without an all-meat diet, possess a running and hunting ability to provide its own food, and be of a controllable size to coexist with humans, indicating the family "Canidae", and the right temperament with wolves being among the most gregarious and cooperative animals on the planet.\n', 'Ancient DNA supports the hypothesis that dog domestication preceded the emergence of agriculture and was initiated close to the Last Glacial Maximum 27,000 YBP when hunter-gatherers preyed on megafauna, and when proto-dogs might have taken advantage of carcasses left on site by early hunters, assisted in the capture of prey, or provided defense from large competing predators at kill-sites. Wolves were probably attracted to human campfires by the smell of meat being cooked and discarded refuse in the vicinity, first loosely attaching themselves and then considering these as part of their home territory where their warning growls would alert humans to the approach of outsiders. 
The wolves most likely drawn to human camps were the less-aggressive, subdominant pack members with lowered flight response, higher stress thresholds, less wary around humans, and therefore better candidates for domestication. The earliest sign of domestication in dogs was the neotenization of skull morphology and the shortening of snout length that results in tooth crowding, reduction in tooth size, and a reduction in the number of teeth, which has been attributed to the strong selection for reduced aggression. This process may have begun during the initial commensal stage of dog domestication, even before humans began to be active partners in the process.\n', 'A maternal mitochondrial, paternal Y chromosome, and microsatellite assessment of two wolf populations in North America and combined with satellite telemetry data revealed significant genetic and morphological differences between one population that migrated with and preyed upon caribou, and another territorial ecotype population that remained in a boreal coniferous forest. Though these two populations spend a period of the year in the same place, and though there was evidence of gene flow between them, the difference in prey–habitat specialization has been sufficient to maintain genetic and even coloration divergence. A study has identified the remains of a population of extinct Pleistocene Beringian wolves with unique mitochondrial signatures. The skull shape, tooth wear, and isotopic signatures suggested these were specialist megafauna hunters and scavengers that became extinct while less specialized wolf ecotypes survived. 
Analogous to the modern wolf ecotype that has evolved to track and prey upon caribou, a Pleistocene wolf population could have begun following mobile hunter-gatherers, thus slowly acquiring genetic and phenotypic differences that would have allowed them to more successfully adapt to the human habitat.\n', 'Even today, the wolves on Ellesmere Island do not fear humans, which is thought to be due to them seeing humans so little, and they will approach humans cautiously, curiously and closely.\n', 'Section::::Dog domestication.:Post-domestication gene flow.\n', 'Since domestication, dogs have traveled alongside humans across most of the planet, often hybridizing\n', 'with local wild canids. This has resulted in complex patterns of ancient and recent admixture among both the wild and the domestic canids.\n', 'Some studies have found greater diversity in the genetic markers of dogs from East and Central Asia compared to Europe and have concluded that dogs originated from these regions, despite no archaeological evidence to support the conclusions. One reason for these discrepancies is the sustained admixture between different dog and wolf populations across the Old and New Worlds over at least the last 10,000 years, which has blurred the genetic signatures and confounded efforts of researchers at pinpointing the origins of dogs. Another reason is that none of the modern wolf populations are related to the Pleistocene wolves that were first domesticated. In other words, the extinction of the wolves that were the direct ancestors of dogs has muddied efforts to pinpoint the time and place of dog domestication.\n', 'Section::::Dog domestication.:Post-domestication gene flow.:Dog-Wolf admixture.\n', 'There is evidence of admixture between dog and regional wolf populations, except on the Tibetan Plateau and in the New World wolves. This admixture has occurred throughout history and as dogs expanded across the landscape. 
There are some dog populations that show recent admixture with wolves.\n', 'Phylogenetic analysis shows that modern dog mDNA haplotypes resolve into four monophyletic clades with strong statistical support, and these have been designated by researchers as clades A-D. Other studies that included a wider sample of specimens have reported two rare East Asian clades E-F with weaker statistical support. In 2009, a study found that haplogroups A, B and C included 98% of dogs and are found universally distributed across Eurasia, indicating that they were the result of a single domestication event, and that haplogroups D, E, and F were rare and appeared to be the result of regional hybridization with local wolves post-domestication. Haplogroups A and B contained subclades that appeared to be the result of hybridization with wolves post-domestication, because each haplotype within each of these subclades was the result of a female wolf/male dog pairing.\n', 'Haplogroup A: Includes 64-65% of dogs. Haplotypes of subclades a2–a6 are derived from post-domestication wolf–dog hybridization.\n', 'Haplogroup B: Includes 22-23% of dogs. haplotypes of subclade b2 are derived from post-domestication wolf–dog hybridization.\n', 'Haplogroup C: Includes 10-12% of dogs.\n', 'Haplogroup D: Derived from post-domestication wolf–dog hybridization in subclade d1 (Scandinavia) and d2 (South-West Asia). The northern Scandinavian subclade d1 hybrid haplotypes originated 480-3,000 YBP and are found in all Sami-related breeds: Finnish Lapphund, Swedish Lapphund, Lapponian Herder, Jamthund, Norwegian Elkhound and Hällefors Elkhound. The maternal wolf sequence that contributed to them has not been matched across Eurasia and its branch is phylogenetically rooted in the same sequence as the Altai dog (not a direct ancestor). 
The subclade d2 hybrid haplotypes are found in 2.6% of South-West Asian dogs.\n', 'Haplogroup E: Derived from post-domestication wolf–dog hybridization in East Asia, (rare distribution in South-East Asia, Korea and Japan).\n', 'Haplogroup F: Derived from post-domestication wolf–dog hybridization in Japan. A study of 600 dog specimens found only one dog whose sequence indicated hybridization with the extinct Japanese wolf.\n', 'It is not known whether this hybridization was the result of humans selecting for phenotypic traits from local wolf populations or the result of natural introgression as the dog expanded across Eurasia.\n', 'In 2018, a study found a small amount of dog ancestry in 62% of Eurasian wolf specimens looked at, that hybridization had occurred across a wide number of timescales and not just recently, however in contrast there was almost no admixture detected in the North American specimens. There was introgression of the male dog into the wolf, but also one hybrid detected which was the result of a male wolf crossed with a female dog. Wolves have maintained their phenotype differences from the dog, which indicates low-frequency hybridization. The conclusion is that phenotype is no indication of "purity" and the definition of pure wolves is ambiguous. Free-ranging dogs across Eurasia show introgression from wolves. Another study found that the β-defensin gene responsible for the black coat of North American wolves was the result of a single introgression from dogs in the Yukon dated between 1,600-7,200 years ago. 
The study proposes that early Native American dogs were the source.\n', 'Section::::Dog domestication.:Post-domestication gene flow.:Taimyr wolf admixture.\n', 'In May 2015, a study compared the ancestry of the Taimyr-1 wolf lineage to that of dogs and gray wolves.\n', 'Comparison to the gray wolf lineage indicated that Taimyr-1 was basal to gray wolves from the Middle East, China, Europe and North America but shared a substantial amount of history with the present-day gray wolves after their divergence from the coyote. This implies that the ancestry of the majority of gray wolf populations today stems from an ancestral population that lived less than 35,000 years ago but before the inundation of the Bering Land Bridge with the subsequent isolation of Eurasian and North American wolves.\n', 'A comparison of the ancestry of the Taimyr-1 lineage to the dog lineage indicated that some modern dog breeds have a closer association with either the gray wolf or Taimyr-1 due to admixture. The Saarloos wolfdog showed more association with the gray wolf, which is in agreement with the documented historical crossbreeding with gray wolves in this breed. Taimyr-1 shared more alleles (i.e. gene expressions) with those breeds that are associated with high latitudes - the Siberian husky and Greenland dog that are also associated with arctic human populations, and to a lesser extent the Shar Pei and Finnish spitz. An admixture graph of the Greenland dog indicates a best-fit of 3.5% shared material, although an ancestry proportion ranging between 1.4% and 27.3% is consistent with the data. This indicates admixture between the Taimyr-1 population and the ancestral dog population of these four high-latitude breeds. These results can be explained either by a very early presence of dogs in northern Eurasia or by the genetic legacy of Taimyr-1 being preserved in northern wolf populations until the arrival of dogs at high latitudes. 
This introgression could have provided early dogs living in high latitudes with phenotypic variation beneficial for adaption to a new and challenging environment. It also indicates that the ancestry of present-day dog breeds descends from more than one region.\n', 'An attempt to explore admixture between Taimyr-1 and gray wolves produced unreliable results.\n', 'As the Taimyr wolf had contributed to the genetic makeup of the Arctic breeds, a later study suggested that descendants of the Taimyr wolf survived until dogs were domesticated in Europe and arrived at high latitudes where they mixed with local wolves, and these both contributed to the modern Arctic breeds. Based on the most widely accepted oldest zooarchaeological dog remains, domestic dogs most likely arrived at high latitudes within the last 15,000 years. The mutation rates calibrated from both the Taimyr wolf and the Newgrange dog genomes suggest that modern wolf and dog populations diverged from a common ancestor between 20,000 and 60,000 YBP. This indicates that either dogs were domesticated much earlier than their first appearance in the archaeological record, or they arrived in the Arctic early, or both.\n', 'Section::::Dog domestication.:Positive selection.\n', 'Charles Darwin recognized the small number of traits that made domestic species different from their wild ancestors. He was also the first to recognize the difference between conscious selective breeding in which humans directly select for desirable traits, and unconscious selection where traits evolve as a by-product of natural selection or from selection on other traits. 
Domestic animals have variations in coat color as well as texture, dwarf and giant varieties, and changes in their reproductive cycle, and many others have tooth crowding and floppy ears.\n', 'Although it is easy to assume that each of these traits was uniquely selected for by hunter-gatherers and early farmers, beginning in 1959 Dmitry Belyayev tested the reactions of silver foxes to a hand placed in their cage and selected the tamest, least aggressive individuals to breed. His hypothesis was that, by selecting a behavioral trait, he could also influence the phenotype of subsequent generations, making them more domestic in appearance. Over the next 40 years, he succeeded in producing foxes with traits that were never directly selected for, including piebald coats, floppy ears, upturned tails, shortened snouts, and shifts in developmental timing. In the 1980s, a researcher used a set of behavioral, cognitive, and visible phenotypic markers, such as coat colour, to produce domesticated fallow deer within a few generations. Similar results for tameness and fear have been found for mink and Japanese quail. In addition to demonstrating that domestic phenotypic traits could arise through selection for a behavioral trait, and domestic behavioral traits could arise through the selection for a phenotypic trait, these experiments provided a mechanism to explain how the animal domestication process could have begun without deliberate human forethought and action.\n', 'The genetic difference between domestic and wild populations can be framed within two considerations. The first distinguishes between domestication traits that are presumed to have been essential at the early stages of domestication, and improvement traits that have appeared since the split between wild and domestic populations. 
Domestication traits are generally fixed within all domesticates and were selected during the initial episode of domestication, whereas improvement traits are present only in a proportion of domesticates, though they may be fixed in individual breeds or regional populations. A second issue is whether traits associated with the domestication syndrome resulted from a relaxation of selection as animals exited the wild environment or from positive selection resulting from intentional and unintentional human preference. Some recent genomic studies on the genetic basis of traits associated with the domestication syndrome have shed light on both of these issues. A study published in 2016 suggested that there have been negative genetic consequences of the domestication process as well, that enrichment of disease-related gene variants accompanied positively selected traits.\n', 'In 2010, a study identified 51 regions of the dog genome that were associated with phenotypic variation among breeds in 57 traits studied, which included body, cranial, dental, and long bone shape and size. There were 3 quantitative trait loci that explained most of the phenotypic variation. Indicators of recent selection were shown by many of the 51 genomic regions that were associated with traits that define a breed, which include body size, coat characteristics, and ear floppiness. Geneticists have identified more than 300 genetic loci and 150 genes associated with coat color variability. Knowing the mutations associated with different colors has allowed the correlation between the timing of the appearance of variable coat colors in horses with the timing of their domestication. Other studies have shown how human-induced selection is responsible for the allelic variation in pigs. 
Together, these insights suggest that, although natural selection has kept variation to a minimum before domestication, humans have actively selected for novel coat colors as soon as they appeared in managed populations.\n', 'In 2015, a study looked at over 100 pig genome sequences to ascertain their process of domestication. A model that fitted the data included admixture with a now extinct ghost population of wild pigs during the Pleistocene. The study also found that despite back-crossing with wild pigs, the genomes of domestic pigs have strong signatures of selection at genetic loci that affect behavior and morphology. The study concluded that human selection for domestic traits likely counteracted the homogenizing effect of gene flow from wild boars and created domestication islands in the genome. The same process may also apply to other domesticated animals.\n', 'In 2014, a whole genome study of the DNA differences between wolves and dogs found that dogs did not show a reduced fear response but did show greater synaptic plasticity. Synaptic plasticity is widely believed to be the cellular correlate of learning and memory, and this change may have altered the learning and memory abilities of dogs in comparison to wolves.\n', 'Section::::Dog domestication.:Positive selection.:Behavior.\n', 'Unlike other domestic species which were primarily selected for production-related traits, dogs were initially selected for their behaviors. In 2016, a study found that there were only 11 fixed genes that showed variation between wolves and dogs. These gene variations were unlikely to have been the result of natural evolution, and indicate selection on both morphology and behavior during dog domestication. There was evidence of selection during dog domestication of genes that affect the adrenaline and noradrenaline biosynthesis pathway. 
These genes are involved in the synthesis, transport and degradation of a variety of neurotransmitters, particularly the catecholamines, which include dopamine and noradrenaline. Recurrent selection on this pathway and its role in emotional processing and the fight-or-flight response suggests that the behavioral changes we see in dogs compared to wolves may be due to changes in this pathway, leading to tameness and an emotional processing ability. Dogs generally show reduced fear and aggression compared to wolves. Some of these genes have been associated with aggression in some dog breeds, indicating their importance in both the initial domestication and then later in breed formation.\n', 'In 2018, a study identified 429 genes that differed between modern dogs and modern wolves. As the differences in these genes could also be found in ancient dog fossils, these were regarded as being the result of the initial domestication and not from recent breed formation. These genes are linked to neural crest and central nervous system development. These genes affect embryogenesis and can confer tameness, smaller jaws, floppy ears, and diminished craniofacial development, which distinguish domesticated dogs from wolves and are considered to reflect domestication syndrome. The study proposes that domestication syndrome is caused by alterations in the migration or activity of neural crest cells during their development. The study concluded that during early dog domestication, the initial selection was for behavior. This trait is influenced by those genes which act in the neural crest, which led to the phenotypes observed in modern dogs.\n', 'Section::::Dog domestication.:Positive selection.:Dietary adaption.\n', "AMY2B (Alpha-Amylase 2B) is a gene that codes a protein that assists with the first step in the digestion of dietary starch and glycogen. An expansion of this gene in dogs would enable early dogs to exploit a starch-rich diet as they fed on refuse from agriculture. 
In a study in 2014, the data indicated that the wolves and dingo had just two copies of the gene and the Siberian Husky that is associated with hunter-gatherers had just three or four copies, whereas the Saluki that is associated with the Fertile Crescent where agriculture originated had 29 copies. The results show that on average, modern dogs have a high copy number of the gene, whereas wolves and dingoes do not. The high copy number of AMY2B variants likely already existed as a standing variation in early domestic dogs, but expanded more recently with the development of large agriculturally based civilizations. This suggests that at the beginning of the domestication process, dogs may have been characterized by a more carnivorous diet than their modern-day counterparts, a diet held in common with early hunter-gatherers. A later study indicated that because most dogs had a high copy number of the AMY2B gene but the arctic breeds and the dingo did not, the dog's dietary change may not have been caused by initial domestication but by the subsequent spread of agriculture to most - but not all - regions of the planet.\n", 'In 2016, a study of the dog genome compared to the wolf genome looked for genes that showed signs of having undergone positive selection. The study identified genes relating to brain function and behavior, and to lipid metabolism. This ability to process lipids indicates a dietary target of selection that was important when proto-dogs hunted and fed alongside hunter-gatherers. The evolution of the dietary metabolism genes may have helped process the increased lipid content of early dog diets as they scavenged on the remains of carcasses left by hunter-gatherers. Prey capture rates may have increased in comparison to wolves and with it the amount of lipid consumed by the assisting proto-dogs. 
A unique dietary selection pressure may have evolved both from the amount consumed, and the shifting composition of, tissues that were available to proto-dogs once humans had removed the most desirable parts of the carcass for themselves. A study of the mammal biomass during modern human expansion into the northern Mammoth steppe found that it had occurred under conditions of unlimited resources, and that many of the animals were killed with only a small part consumed or left unused.\n', 'Section::::Dog domestication.:Natural selection.\n', 'Dogs can infer the name of an object and have been shown to learn the names of over 1,000 objects. Dogs can follow the human pointing gesture; even nine-week-old puppies can follow a basic human pointing gesture without being taught. New Guinea singing dogs, a half-wild proto-dog endemic to the remote alpine regions of New Guinea, as well as dingoes in the remote Outback of Australia are also capable of this. These examples demonstrate an ability to read human gestures that arose early in domestication and did not require human selection. "Humans did not develop dogs, we only fine-tuned them down the road."\n', "A dog's cranium is 15% smaller than an equally heavy wolf's, and the dog is less aggressive and more playful. Other species pairs show similar differences. Bonobos, like chimpanzees, are a close genetic cousin to humans, but unlike the chimpanzees, bonobos are not aggressive and do not participate in lethal inter-group aggression or kill within their own group. The most distinctive features of a bonobo are its cranium, which is 15% smaller than a chimpanzee's, and its less aggressive and more playful behavior. In other examples, the guinea pig's cranium is 13% smaller than its wild cousin the cavy, and domestic fowl show a similar reduction to their wild cousins. Possession of a smaller cranium for holding a smaller brain is a telltale sign of domestication. Bonobos appear to have domesticated themselves. 
In the farm fox experiment, humans selectively bred foxes against aggression, causing domestication syndrome. The foxes were not selectively bred for smaller craniums and teeth, floppy ears, or skills at using human gestures, but these traits were demonstrated in the friendly foxes. Natural selection favors those that are the most successful at reproducing, not the most aggressive. Selection against aggression made possible the ability to cooperate and communicate among foxes, dogs and bonobos. Perhaps it did the same thing for humans. The more docile animals have been found to have less testosterone than their more aggressive counterparts, and testosterone controls aggression and brain size. One researcher has argued that in becoming more social, we humans have developed a smaller brain than those of humans 20,000 years ago.\n", 'Section::::Dog domestication.:Dog and human convergent evolution.\n', 'As a result of the domestication process there is also evidence of convergent evolution having occurred between dogs and humans. The history of the two is forever intertwined. Dogs suffer from the same diseases as humans, which include cancer, diabetes, heart disease, and neurological disorders. The underlying disease pathology is similar to humans, as is their responses and outcomes to treatment.\n', 'Section::::Dog domestication.:Dog and human convergent evolution.:Parallel evolution.\n', 'There are patterns of genes which are related by their function and these patterns can be found in both dogs and humans. This fact can be used to study the coevolution of gene function. Dogs accompanied humans when they first migrated into new environments. Both dogs and humans have adapted to different environmental conditions, with their genomes showing parallel evolution. These include adaptation to high altitude, low oxygen hypoxia conditions, and genes that play a role in digestion, metabolism, neurological processes, and some related to cancer. 
It can be inferred from those genes which act on the serotonin system in the brain that these have given rise to less aggressive behavior when living in a crowded environment.\n', 'In 2007, a study found that dog domestication was accompanied by selection at three genes with key roles in starch digestion: AMY2B, MGAM and SGLT1, and was a striking case of parallel evolution when coping with an increasingly starch-rich diet caused similar adaptive responses in dogs and humans.\n', 'Section::::Dog domestication.:Dog and human convergent evolution.:Behavioral evidence.\n', 'Convergent evolution is when distantly related species independently evolve similar solutions to the same problem. For example, fish, penguins and dolphins have each separately evolved flippers as a solution to the problem of moving through the water. What has been found between dogs and humans is something less frequently demonstrated: psychological convergence. Dogs have independently evolved to be cognitively more similar to humans than we are to our closest genetic relatives. Dogs have evolved specialized skills for reading human social and communicative behavior. These skills seem more flexible – and possibly more human-like – than those of other animals more closely related to humans phylogenetically, such as chimpanzees, bonobos and other great apes. This raises the possibility that convergent evolution has occurred: both "Canis familiaris" and "Homo sapiens" might have evolved some similar (although obviously not identical) social-communicative skills – in both cases adapted for certain kinds of social and communicative interactions with human beings.\n', "The pointing gesture is a human-specific signal, is referential in its nature, and is a foundation building-block of human communication. Human infants acquire it weeks before the first spoken word. In 2009, a study compared the responses to a range of pointing gestures by dogs and human infants. 
The study showed little difference in the performance of 2-year-old children and dogs, while 3-year-old children's performance was higher. The results also showed that all subjects were able to generalize from their previous experience to respond to relatively novel pointing gestures. These findings suggest that dogs demonstrating a similar level of performance as 2-year-old children can be explained as a joint outcome of their evolutionary history as well as their socialization in a human environment.\n", 'Later studies support coevolution in that dogs can discriminate the emotional expressions of human faces, and that most people can tell from a bark whether a dog is alone, being approached by a stranger, playing, or being aggressive, and can tell from a growl how big the dog is.\n', 'In 2015, a study found that when dogs and their owners interact, extended eye contact (mutual gaze) increases oxytocin levels in both the dog and its owner. As oxytocin is known for its role in maternal bonding, it is considered likely that this effect has supported the coevolution of human-dog bonding. One observer has stated, "The dog could have arisen only from animals predisposed to human society by lack of fear, attentiveness, curiosity, necessity, and recognition of advantage gained through collaboration...the humans and wolves involved in the conversion were sentient, observant beings constantly making decisions about how they lived and what they did, based on the perceived ability to obtain at a given time and place what they needed to survive and thrive. They were social animals willing, even eager, to join forces with another animal to merge their sense of group with the others\' sense and create an expanded super-group that was beneficial to both in multiple ways. 
They were individual animals and people involved, from our perspective, in a biological and cultural process that involved linking not only their lives but the evolutionary fate of their heirs in ways, we must assume, they could never have imagined. Powerful emotions were in play that many observers today refer to as love – boundless, unquestioning love."\n', 'Section::::Dog domestication.:Dog and human convergent evolution.:Human adoption of some wolf behaviors.\n', 'In 2002, a study proposed that immediate human ancestors and wolves may have domesticated each other through a strategic alliance that would change both respectively into humans and dogs. The effects of human psychology, hunting practices, territoriality and social behavior would have been profound. Early humans moved from scavenging and small-game hunting to big-game hunting by living in larger, socially more-complex groups, learning to hunt in packs, and developing powers of cooperation and negotiation in complex situations. As these are characteristics of wolves, dogs and humans, it can be argued that these behaviors were enhanced once wolves and humans began to cohabit. Communal hunting led to communal defense. Wolves actively patrol and defend their scent-marked territory, and perhaps humans had their sense of territoriality enhanced by living with wolves. One of the keys to recent human survival has been the forming of partnerships. Strong bonds exist between same-sex wolves, dogs and humans and these bonds are stronger than exist between other same-sex animal pairs. Today, the most widespread form of inter-species bonding occurs between humans and dogs. The concept of friendship has ancient origins but it may have been enhanced through the inter-species relationship to give a survival advantage.\n', 'In 2003, a study compared the behavior and ethics of chimpanzees, wolves and humans. 
Cooperation among humans\' closest genetic relative is limited to occasional hunting episodes or the persecution of a competitor for personal advantage, which had to be tempered if humans were to become domesticated. The closest approximation to human morality that can be found in nature is that of the gray wolf, "Canis lupus". Wolves are among the most gregarious and cooperative of animals on the planet, and their ability to cooperate in well-coordinated drives to hunt prey, carry items too heavy for an individual, provisioning not only their own young but also the other pack members, babysitting etc. are rivaled only by that of human societies. Similar forms of cooperation are observed in two closely related canids, the African wild dog and the Asian dhole, therefore it is reasonable to assume that canid sociality and cooperation are old traits that in terms of evolution predate human sociality and cooperation. Today\'s wolves may even be less social than their ancestors, as they have lost access to big herds of ungulates and now tend more toward a lifestyle similar to coyotes, jackals, and even foxes. Social sharing within families may be a trait that early humans learned from wolves, and with wolves digging dens long before humans constructed huts it is not clear who domesticated whom.\n', 'On the mammoth steppe the wolf\'s ability to hunt in packs, to share risk fairly among pack members, and to cooperate moved them to the top of the food chain above lions, hyenas and bears. Some wolves followed the great reindeer herds, eliminating the unfit, the weaklings, the sick and the aged, and therefore improved the herd. These wolves had become the first pastoralists hundreds of thousands of years before humans also took to this role. 
The wolves\' advantage over their competitors was that they were able to keep pace with the herds, move fast and enduringly, and make the most efficient use of their kill by their ability to "wolf down" a large part of their quarry before other predators had detected the kill. The study proposed that during the Last Glacial Maximum, some of our ancestors teamed up with those pastoralist wolves and learned their techniques. Many of our ancestors remained gatherers and scavengers, or specialized as fish-hunters, hunter-gatherers, and hunter-gardeners. However, some ancestors adopted the pastoralist wolves\' lifestyle as herd followers and herders of reindeer, horses, and other hoofed animals. They harvested the best stock for themselves while the wolves kept the herd strong, and this group of humans was to become the first herders and this group of wolves was to become the first dogs.\n', 'Section::::First dogs.\n', "The dog was the first species and the only large carnivore to have been domesticated. Over the past 200 years, dogs have undergone rapid phenotypic change and were formed into today's modern dog breeds due to artificial selection imposed by humans. These breeds can vary in size and weight from a teacup poodle to a giant mastiff. The skull, body, and limb proportions vary significantly between breeds, with dogs displaying more phenotypic diversity than can be found within the entire order of carnivores. Some breeds demonstrate outstanding skills in herding, retrieving, scent detection, and guarding, which demonstrates the functional and behavioral diversity of dogs. There have been major advances in understanding the genes that gave rise to the phenotypic traits of dogs. 
The first dogs were certainly wolflike, however the phenotypic changes that coincided with the dog–wolf genetic divergence are not known.\n", 'Section::::First dogs.:Bonn–Oberkassel dog.\n', 'In 1914, on the eve of the First World War, two human skeletons were discovered during basalt quarrying at Oberkassel, Bonn in Germany. With them were found a right mandible of a "wolf" and other animal bones. After the end of the First World War, in 1919 a full study was made of these remains. The mandible was recorded as ""Canis lupus", the wolf" and some of the other animal bones were assigned to it. The remains were then stored and forgotten for fifty years. In the late 1970s there was renewed interest in the Oberkassel remains and the mandible was re-examined and reclassified as belonging to a domesticated dog. The mitochondrial DNA sequence of the mandible was matched to "Canis lupus familiaris" - dog, and confirms that the Oberkassel dog is a direct ancestor of today\'s dogs. The bodies were dated to 14,223 YBP. This implies that in Western Europe there were morphologically and genetically "modern" dogs in existence around 14,500 years ago.\n', 'Later studies assigned more of the other animal bones to the dog until most of a skeleton could be assembled. The humans were a man aged 40 years and a woman aged 25 years. All three skeletal remains were found covered with large 20\xa0cm thick basalt blocks and were sprayed with red hematite powder. The consensus is that a dog was buried along with two humans. A tooth belonging to a smaller and older dog was also identified but it had not been sprayed with red powder. The cause of the death of the two humans is not known. A pathology study of the dog remains suggests that it had died young after suffering from canine distemper between ages 19 and 23 weeks. The dog could not have survived during this period without intensive human care. 
During this period the dog was of no utilitarian use to humans, and suggests the existence of emotional or symbolic ties between these humans and this dog. In conclusion, near the end of the Late Pleistocene at least some humans regarded dogs not just materialistically, but had developed emotional and caring bonds for their dogs.\n', 'Section::::First dogs.:First dogs as a hunting technology.\n', 'During the Upper Paleolithic (50,000-10,000 YBP), the increase in human population density, advances in blade and hunting technology, and climate change may have altered prey densities and made scavenging crucial to the survival of some wolf populations. Adaptations to scavenging such as tameness, small body size, and a decreased age of reproduction would reduce their hunting efficiency further, eventually leading to obligated scavenging. Whether these earliest dogs were simply human-commensal scavengers or they played some role as companions or hunters that hastened their spread is unknown.\n', 'Researchers have proposed that in the past a hunting partnership existed between humans and dogs that was the basis for dog domestication. \n', "Petroglyph rock art dating to 8,000 YBP at the sites of Shuwaymis and Jubbah, in northwestern Saudi Arabia, depict large numbers of dogs participating in hunting scenes with some being controlled on leashes. The transition from the Late Pleistocene into the early Holocene was marked by climatic change from cold and dry to warmer, wetter conditions and rapid shifts in flora and fauna, with much of the open habitat of large herbivores being replaced by forests. In the early Holocene, it is proposed that along with changes in arrow-head technology that hunting dogs were used by hunters to track and retrieve wounded game in thick forests. The dog's ability to chase, track, sniff out and hold prey can significantly increase the success of hunters in forests, where human senses and location skills are not as sharp as in more open habitats. 
Dogs are still used for hunting in forests today.\n", 'Section::::First dogs.:Dogs enter North America from Siberia.\n', 'In North America, the earliest dog remains were found in Illinois and radiocarbon dating indicates 9,900 YBP. These include three isolated burials at the Koster Site near the lower Illinois River in Greene County, and one burial 35 km away at the Stilwell II site in Pike County. These dogs were medium-sized adults with very active lifestyles and varied morphologies. Isotope analysis can be used to identify some chemical elements, allowing researchers to make inferences about the diet of a species. An isotope analysis of bone collagen indicates a diet consisting largely of freshwater fish. Similar dog burials across Eurasia are thought to be due to the dog’s importance in hunting to people who were trying to adapt to the changing environments and prey species during the Pleistocene-Holocene transition. In these places, the dog had gained an elevated social status. \n', 'In 2018, a study compared sequences of North American dog fossils with Siberian dog fossils and modern dogs. The nearest relative to the North American fossils was a 9,000 YBP fossil discovered on Zhokhov Island, arctic north-eastern Siberia, which was connected to the mainland at that time. The study inferred from mDNA that all of the North American dogs shared a common ancestor dated 14,600 YBP, and this ancestor had diverged along with the ancestor of the Zhokhov dog from their common ancestor 15,600 YBP. The timing of the Koster dogs shows that dogs entered North America from Siberia 4,500 years after humans did, were isolated for the next 9,000 years, and after contact with Europeans these no longer exist because they were replaced by Eurasian dogs. 
The pre-contact dogs exhibit a unique genetic signature that is now gone, with nDNA indicating that their nearest genetic relatives today are the arctic breed dogs - Alaskan malamutes, Greenland dogs, and Alaskan huskies and Siberian huskies.\n', 'Section::::First dogs.:First dog breeds developed in Siberia.\n', 'In 2017, a study showed that 9,000 YBP the domestic dog was present at what is now Zhokhov Island. The dogs were selectively bred as either sled dogs or as hunting dogs, which implies that a sled dog standard and a hunting dog standard existed at that time. The optimal maximum size for a sled dog is 20–25\xa0kg based on thermo-regulation, and the ancient sled dogs were between 16–25\xa0kg. The same standard has been found in the remains of sled dogs from this region 2,000 YBP and in the modern Siberian husky breed standard. Other dogs were more massive at 30\xa0kg and appear to be dogs that had been crossed with wolves and used for polar bear hunting. At death, the heads of the dogs had been carefully separated from their bodies by humans, probably for ceremonial reasons.\n', 'The study proposes that after having diverged from the common ancestor along with the grey wolf, the evolution of "Canis familiaris" proceeded in three stages. The first was natural selection based on feeding behavior within the ecological niche that had been formed through human activity. The second was artificial selection based on tamability. The third was directed selection based on forming breeds that possessed qualities to help with specific tasks within the human economy. The process commenced 40,000-30,000 YBP with its speed increasing with each stage until domestication became complete.\n', 'Section::::First dogs.:Dogs enter Japan.\n', "In Japan, temperate deciduous forests rapidly spread onto the main island of Honshu and caused an adaption away from hunting megafauna (Naumann's elephant and Yabe's giant deer) to hunting the quicker sika deer and wild boar in dense forest. 
With this came a change in hunting technology, including a shift to smaller, triangular points for arrows. A study of the Jōmon people that lived on the Pacific coast of Honshu during the early Holocene shows that they were conducting individual dog burials and were probably using dogs as tools for hunting sika deer and wild boar, as hunters in Japan still do today.\n", 'Hunting dogs make major contributions to forager societies and the ethnographic record shows them being given proper names, treated as family members, and considered separate to other types of dogs. This special treatment includes separate burials with markers and grave-goods, with those that were exceptional hunters or that were killed on the hunt often venerated. A dog\'s value as a hunting partner gives them status as a living weapon and the most skilled elevated to taking on a "personhood", with their social position in life and in death similar to that of the skilled hunters.\n', 'Intentional dog burials together with ungulate hunting is also found in other early Holocene deciduous forest forager societies in Europe and North America, indicating that across the Holarctic temperate zone hunting dogs were a widespread adaptation to forest ungulate hunting.\n'), (KiltDocAnchor(text='domestic dog', href='Dog', paragraph_id=1, start=18, end=30), KiltDocAnchor(text='evolutionary divergence', href='evolutionary%20divergence', paragraph_id=1, start=50, end=73), KiltDocAnchor(text='domestication', href='domestication', paragraph_id=1, start=93, end=106), KiltDocAnchor(text='dog types', href='dog%20types', paragraph_id=1, start=133, end=142), KiltDocAnchor(text='dog breeds', href='dog%20breeds', paragraph_id=1, start=147, end=157), KiltDocAnchor(text='genus', href='genus', paragraph_id=1, start=186, end=191), KiltDocAnchor(text='Canis', href='Canis', paragraph_id=1, start=193, end=198), KiltDocAnchor(text='wolf-like canids', href='Evolution%20of%20the%20wolf%23Wolf-like%20canids', paragraph_id=1, 
start=225, end=241), KiltDocAnchor(text='carnivore', href='carnivore', paragraph_id=1, start=288, end=297), KiltDocAnchor(text='sister taxa', href='sister%20taxa', paragraph_id=1, start=362, end=373), KiltDocAnchor(text='domesticated', href='Domestication', paragraph_id=1, start=459, end=471), KiltDocAnchor(text='genetic divergence', href='genetic%20divergence', paragraph_id=2, start=4, end=22), KiltDocAnchor(text='Last Glacial Maximum', href='Last%20Glacial%20Maximum', paragraph_id=2, start=115, end=135), KiltDocAnchor(text='domestication of animals', href='domestication%20of%20animals', paragraph_id=2, start=319, end=343), KiltDocAnchor(text='grey wolf', href='grey%20wolf', paragraph_id=2, start=396, end=405), KiltDocAnchor(text='hunter-gatherers', href='hunter-gatherers', paragraph_id=2, start=433, end=449), KiltDocAnchor(text='Bonn–Oberkassel dog', href='%23Bonn%E2%80%93Oberkassel%20dog', paragraph_id=2, start=522, end=541), KiltDocAnchor(text='Near East', href='Ancient%20Near%20East', paragraph_id=2, start=722, end=731), KiltDocAnchor(text='aurochs', href='aurochs', paragraph_id=2, start=784, end=791), KiltDocAnchor(text='Western Europe', href='Western%20Europe', paragraph_id=3, start=106, end=120), KiltDocAnchor(text='Central Asia', href='Central%20Asia', paragraph_id=3, start=122, end=134), KiltDocAnchor(text='East Asia', href='East%20Asia', paragraph_id=3, start=139, end=148), KiltDocAnchor(text='Eurasian', href='Eurasian', paragraph_id=3, start=266, end=274), KiltDocAnchor(text='extinct', href='Extinction', paragraph_id=3, start=314, end=321), KiltDocAnchor(text='Miocene', href='Miocene', paragraph_id=5, start=48, end=55), KiltDocAnchor(text='glaciations', href='glaciations', paragraph_id=5, start=122, end=133), KiltDocAnchor(text='Pliocene', href='Pliocene', paragraph_id=5, start=141, end=149), KiltDocAnchor(text='Pleistocene', href='Pleistocene', paragraph_id=5, start=158, end=169), KiltDocAnchor(text='Ice Age', href='Ice%20Age', paragraph_id=5, 
start=209, end=216), KiltDocAnchor(text='forests', href='forests', paragraph_id=5, start=233, end=240), KiltDocAnchor(text='savannahs', href='savannahs', paragraph_id=5, start=245, end=254), KiltDocAnchor(text='steppes', href='steppes', paragraph_id=5, start=274, end=281), KiltDocAnchor(text='grasslands', href='grasslands', paragraph_id=5, start=285, end=295), KiltDocAnchor(text='North America', href='North%20America', paragraph_id=6, start=12, end=25), KiltDocAnchor(text='fox', href='fox', paragraph_id=6, start=42, end=45), KiltDocAnchor(text='Canis', href='Canis', paragraph_id=6, start=139, end=144), KiltDocAnchor(text='coyotes', href='coyotes', paragraph_id=6, start=176, end=183), KiltDocAnchor(text='wolves', href='wolves', paragraph_id=6, start=185, end=191), KiltDocAnchor(text='domestic dog', href='domestic%20dog', paragraph_id=6, start=200, end=212), KiltDocAnchor(text='eastern Africa', href='eastern%20Africa', paragraph_id=6, start=217, end=231), KiltDocAnchor(text='primates', href='primates', paragraph_id=6, start=266, end=274), KiltDocAnchor(text='Eurasia', href='Eurasia', paragraph_id=6, start=544, end=551), KiltDocAnchor(text="Ju'wasi", href='%C7%83Kung%20people', paragraph_id=7, start=136, end=143), KiltDocAnchor(text='Namibia', href='Namibia', paragraph_id=7, start=154, end=161), KiltDocAnchor(text='canids', href='canids', paragraph_id=7, start=643, end=649), KiltDocAnchor(text='domestication of animals', href='domestication%20of%20animals', paragraph_id=9, start=4, end=28), KiltDocAnchor(text='years before present', href='years%20before%20present', paragraph_id=9, start=51, end=71), KiltDocAnchor(text='grey wolf', href='grey%20wolf', paragraph_id=9, start=98, end=107), KiltDocAnchor(text='hunter-gatherers', href='hunter-gatherers', paragraph_id=9, start=135, end=151), KiltDocAnchor(text='Near East', href='Ancient%20Near%20East', paragraph_id=9, start=207, end=216), KiltDocAnchor(text='aurochs', href='aurochs', paragraph_id=9, start=269, end=276), 
KiltDocAnchor(text='commensal pathway', href='Domestication%20of%20animals%23Commensal%20pathway', paragraph_id=9, start=388, end=405), KiltDocAnchor(text='Bonn–Oberkassel dog', href='%23Bonn%E2%80%93Oberkassel%20dog', paragraph_id=9, start=723, end=742), KiltDocAnchor(text='Paleolithic dog', href='Paleolithic%20dog', paragraph_id=9, start=809, end=824), KiltDocAnchor(text='genetic divergence', href='genetic%20divergence', paragraph_id=9, start=914, end=932), KiltDocAnchor(text='evolutionary divergence', href='Divergent%20evolution', paragraph_id=11, start=27, end=50), KiltDocAnchor(text="Przewalski's horse", href='Przewalski%27s%20horse', paragraph_id=11, start=298, end=316), KiltDocAnchor(text='Late Pleistocene glaciation', href='Quaternary%20glaciation', paragraph_id=13, start=11, end=38), KiltDocAnchor(text='mammoth steppe', href='mammoth%20steppe', paragraph_id=13, start=47, end=61), KiltDocAnchor(text='Spain', href='Spain', paragraph_id=13, start=77, end=82), KiltDocAnchor(text='Eurasia', href='Eurasia', paragraph_id=13, start=100, end=107), KiltDocAnchor(text='Bering land bridge', href='Beringia', paragraph_id=13, start=121, end=139), KiltDocAnchor(text='Alaska', href='Alaska', paragraph_id=13, start=145, end=151), KiltDocAnchor(text='Yukon', href='Yukon', paragraph_id=13, start=160, end=165), KiltDocAnchor(text='megafaunal', href='Pleistocene%20megafauna', paragraph_id=13, start=335, end=345), KiltDocAnchor(text='extinction', href='Quaternary%20extinction%20event', paragraph_id=13, start=346, end=356), KiltDocAnchor(text='Last Glacial Maximum', href='Last%20Glacial%20Maximum', paragraph_id=13, start=427, end=447), KiltDocAnchor(text='genus', href='genus', paragraph_id=13, start=620, end=625), KiltDocAnchor(text='biogeography', href='biogeography', paragraph_id=14, start=42, end=54), KiltDocAnchor(text='Beringia', href='Beringia', paragraph_id=14, start=178, end=186), KiltDocAnchor(text='Old Crow, Yukon', href='Old%20Crow%2C%20Yukon', paragraph_id=14, 
start=190, end=205), KiltDocAnchor(text='Fairbanks', href='Fairbanks', paragraph_id=14, start=241, end=250), KiltDocAnchor(text='rostrum', href='Rostrum%20%28anatomy%29%23Vertebrates', paragraph_id=14, start=510, end=517), KiltDocAnchor(text='temporalis', href='temporalis', paragraph_id=14, start=553, end=563), KiltDocAnchor(text='premolars', href='premolars', paragraph_id=14, start=583, end=592), KiltDocAnchor(text='Pleistocene megafauna', href='Pleistocene%20megafauna', paragraph_id=14, start=743, end=764), KiltDocAnchor(text='dire wolf', href='dire%20wolf', paragraph_id=14, start=900, end=909), KiltDocAnchor(text='spotted hyena', href='spotted%20hyena', paragraph_id=14, start=1149, end=1162), KiltDocAnchor(text='gray wolf', href='gray%20wolf', paragraph_id=16, start=34, end=43), KiltDocAnchor(text='canine', href='Canis', paragraph_id=16, start=117, end=123), KiltDocAnchor(text='phylogenetic analysis', href='phylogenetic%20analysis', paragraph_id=16, start=208, end=229), KiltDocAnchor(text='DNA sequences', href='DNA%20sequences', paragraph_id=16, start=233, end=246), KiltDocAnchor(text='Late Pleistocene', href='Late%20Pleistocene', paragraph_id=16, start=368, end=384), KiltDocAnchor(text='genetic divergence', href='genetic%20divergence', paragraph_id=16, start=503, end=521), KiltDocAnchor(text='incomplete lineage sorting', href='incomplete%20lineage%20sorting', paragraph_id=16, start=666, end=692), KiltDocAnchor(text='gene flow', href='gene%20flow', paragraph_id=16, start=842, end=851), KiltDocAnchor(text='mutations', href='mutations', paragraph_id=16, start=968, end=977), KiltDocAnchor(text='whole genome sequencing', href='whole%20genome%20sequencing', paragraph_id=17, start=13, end=36), KiltDocAnchor(text='genome sequence', href='genome%20sequence', paragraph_id=17, start=174, end=189), KiltDocAnchor(text='Taymyr Peninsula', href='Taymyr%20Peninsula', paragraph_id=17, start=241, end=257), KiltDocAnchor(text='Radiocarbon dating', href='Radiocarbon%20dating', 
paragraph_id=17, start=357, end=375), KiltDocAnchor(text='mutation rate', href='mutation%20rate', paragraph_id=17, start=471, end=484), KiltDocAnchor(text='genetic admixture', href='genetic%20admixture', paragraph_id=19, start=265, end=282), KiltDocAnchor(text='incomplete lineage sorting', href='incomplete%20lineage%20sorting', paragraph_id=19, start=326, end=352), KiltDocAnchor(text='Paleogenomics', href='Paleogenomics', paragraph_id=19, start=513, end=526), KiltDocAnchor(text='fossil', href='fossil', paragraph_id=19, start=572, end=578), KiltDocAnchor(text='mitochondrial genome', href='Nuclear%20DNA%23Mitochondrial%20DNA', paragraph_id=21, start=40, end=60), KiltDocAnchor(text='phylogenetic tree', href='phylogenetic%20tree', paragraph_id=21, start=175, end=192), KiltDocAnchor(text='genetic variation', href='genetic%20variation', paragraph_id=21, start=233, end=250), KiltDocAnchor(text='Goyet Cave canids', href='Paleolithic%20dog%23Goyet%20dog', paragraph_id=21, start=441, end=458), KiltDocAnchor(text='ecomorph', href='Ecomorphology', paragraph_id=21, start=720, end=728), KiltDocAnchor(text='Razboinichya Cave canid', href='Paleolithic%20dog%23Altai%20dog', paragraph_id=22, start=24, end=47), KiltDocAnchor(text='mitochondrial DNA', href='mitochondrial%20DNA', paragraph_id=22, start=83, end=100), KiltDocAnchor(text='basal', href='Basal%20%28biology%29', paragraph_id=22, start=281, end=286), KiltDocAnchor(text='phylogenetic', href='phylogenetic', paragraph_id=23, start=4, end=16), KiltDocAnchor(text='clades', href='clades', paragraph_id=23, start=69, end=75), KiltDocAnchor(text='mandible', href='mandible', paragraph_id=25, start=177, end=185), KiltDocAnchor(text='"Canis c.f. 
variabilis"', href='Canis%20mosbachensis%23Canis%20variabilis', paragraph_id=25, start=211, end=234), KiltDocAnchor(text='c.f.', href='Open%20nomenclature%23Usage%20of%20open%20nomenclature', paragraph_id=25, start=242, end=246), KiltDocAnchor(text='Latin', href='Latin', paragraph_id=25, start=252, end=257), KiltDocAnchor(text='haplotypes', href='haplotypes', paragraph_id=25, start=340, end=350), KiltDocAnchor(text='mutations', href='mutations', paragraph_id=25, start=677, end=686), KiltDocAnchor(text='gene pool', href='gene%20pool', paragraph_id=25, start=795, end=804), KiltDocAnchor(text='Victorian era', href='Victorian%20era', paragraph_id=27, start=117, end=130), KiltDocAnchor(text='genetic admixture', href='genetic%20admixture', paragraph_id=27, start=271, end=288), KiltDocAnchor(text='Siberian huskies', href='Siberian%20Husky', paragraph_id=28, start=0, end=16), KiltDocAnchor(text='mitochondrial DNA', href='mitochondrial%20DNA', paragraph_id=29, start=30, end=47), KiltDocAnchor(text='whole-genome sequences', href='Whole%20genome%20sequencing', paragraph_id=29, start=52, end=74), KiltDocAnchor(text='nuclear genome', href='nuclear%20genome', paragraph_id=29, start=200, end=214), KiltDocAnchor(text='Late Neolithic', href='Late%20Neolithic', paragraph_id=29, start=264, end=278), KiltDocAnchor(text='Newgrange', href='Newgrange', paragraph_id=29, start=296, end=305), KiltDocAnchor(text='Ireland', href='Ireland', paragraph_id=29, start=307, end=314), KiltDocAnchor(text='radiocarbon dated', href='radiocarbon%20dated', paragraph_id=29, start=319, end=336), KiltDocAnchor(text='Paleolithic dog', href='Paleolithic%20dog', paragraph_id=29, start=748, end=763), KiltDocAnchor(text='phylogenetic tree', href='phylogenetic%20tree', paragraph_id=29, start=858, end=875), KiltDocAnchor(text='Sarloos wolfdog', href='Sarloos%20wolfdog', paragraph_id=29, start=940, end=955), KiltDocAnchor(text='German Shepherd', href='German%20Shepherd', paragraph_id=29, start=1025, end=1040), 
KiltDocAnchor(text='C and D', href='%23Dog-Wolf%20admixture', paragraph_id=30, start=100, end=107), KiltDocAnchor(text='population bottleneck', href='population%20bottleneck', paragraph_id=30, start=440, end=461), KiltDocAnchor(text='Central Asia', href='Central%20Asia', paragraph_id=30, start=667, end=679), KiltDocAnchor(text='Early Neolithic', href='Early%20Neolithic', paragraph_id=34, start=220, end=235), KiltDocAnchor(text='Herxheim', href='Herxheim%20%28archaeological%20site%29', paragraph_id=34, start=244, end=252), KiltDocAnchor(text='Late Neolithic', href='Late%20Neolithic', paragraph_id=34, start=292, end=306), KiltDocAnchor(text='Forchheim', href='Forchheim', paragraph_id=34, start=350, end=359), KiltDocAnchor(text='Near East', href='Near%20East', paragraph_id=35, start=391, end=400), KiltDocAnchor(text='y-chromosome', href='y-chromosome', paragraph_id=35, start=485, end=497), KiltDocAnchor(text='Indian wolf', href='Indian%20wolf', paragraph_id=35, start=976, end=987), KiltDocAnchor(text='morphological', href='Morphology%20%28biology%29', paragraph_id=37, start=175, end=188), KiltDocAnchor(text='zooarchaeologists', href='Zooarchaeology', paragraph_id=37, start=217, end=234), KiltDocAnchor(text='postcranial', href='postcranial', paragraph_id=37, start=385, end=396), KiltDocAnchor(text='Hohle Fels', href='Hohle%20Fels', paragraph_id=39, start=224, end=234), KiltDocAnchor(text='Goyet Caves', href='Goyet%20Caves', paragraph_id=39, start=247, end=258), KiltDocAnchor(text='Predmosti', href='P%C5%99edmost%C3%AD%20u%20P%C5%99erova%20%28archeology%29', paragraph_id=39, start=271, end=280), KiltDocAnchor(text='Altai Republic', href='Altai%20Republic', paragraph_id=39, start=355, end=369), KiltDocAnchor(text='Kostyonki-8', href='Kostyonki-Borshchyovo%20archaeological%20complex', paragraph_id=39, start=371, end=382), KiltDocAnchor(text='Sakha Republic', href='Sakha%20Republic', paragraph_id=39, start=405, end=419), KiltDocAnchor(text='Chauvet Cave', 
href='Chauvet%20Cave', paragraph_id=39, start=477, end=489), KiltDocAnchor(text='Teufelsbrucke', href='Devil%27s%20Bridge', paragraph_id=40, start=170, end=183), KiltDocAnchor(text='Mezin', href='Mezine', paragraph_id=40, start=260, end=265), KiltDocAnchor(text='Mezhirich', href='Mezhyrich', paragraph_id=40, start=267, end=276), KiltDocAnchor(text='Montespan', href='Montespan', paragraph_id=40, start=503, end=512), KiltDocAnchor(text='Bonn-Oberkassel', href='Oberkassel%2C%20Bonn', paragraph_id=40, start=564, end=579), KiltDocAnchor(text='ecomorphs', href='Ecomorphology', paragraph_id=41, start=826, end=835), KiltDocAnchor(text='wolfdogs', href='wolfdogs', paragraph_id=41, start=1159, end=1167), KiltDocAnchor(text='Victorian era', href='Victorian%20era', paragraph_id=46, start=760, end=773), KiltDocAnchor(text='selection', href='Selection%20%28biology%29', paragraph_id=46, start=790, end=799), KiltDocAnchor(text='dog breeds', href='dog%20breeds', paragraph_id=46, start=821, end=831), KiltDocAnchor(text='population bottlenecks', href='population%20bottlenecks', paragraph_id=47, start=28, end=50), KiltDocAnchor(text='dog breeds', href='dog%20breeds', paragraph_id=47, start=156, end=166), KiltDocAnchor(text='apex predator', href='apex%20predator', paragraph_id=49, start=3, end=16), KiltDocAnchor(text='trophic level', href='trophic%20level', paragraph_id=49, start=52, end=65), KiltDocAnchor(text='mesopredator', href='mesopredator', paragraph_id=49, start=93, end=105), KiltDocAnchor(text='Late Pleistocene', href='Late%20Pleistocene', paragraph_id=49, start=358, end=374), KiltDocAnchor(text='mitogenome', href='mitogenome', paragraph_id=50, start=62, end=72), KiltDocAnchor(text='Last Glacial Maximum', href='Last%20Glacial%20Maximum', paragraph_id=50, start=309, end=329), KiltDocAnchor(text='first divergence', href='Origin%20of%20the%20domestic%20dog%23Time%20of%20divergence', paragraph_id=50, start=337, end=353), KiltDocAnchor(text='anthropologists', 
href='anthropologists', paragraph_id=52, start=209, end=224), KiltDocAnchor(text='ungulates', href='ungulates', paragraph_id=52, start=286, end=295), KiltDocAnchor(text='socialized', href='Socialization', paragraph_id=52, start=528, end=538), KiltDocAnchor(text='Romeo', href='Romeo%20%28wolf%29', paragraph_id=52, start=932, end=937), KiltDocAnchor(text='coevolution', href='coevolution', paragraph_id=54, start=26, end=37), KiltDocAnchor(text='niche', href='Ecological%20niche', paragraph_id=54, start=128, end=133), KiltDocAnchor(text='commensal', href='Commensalism', paragraph_id=55, start=73, end=82), KiltDocAnchor(text='Pleistocene', href='Pleistocene', paragraph_id=55, start=227, end=238), KiltDocAnchor(text='Ancient DNA', href='Ancient%20DNA', paragraph_id=56, start=0, end=11), KiltDocAnchor(text='Last Glacial Maximum', href='Last%20Glacial%20Maximum', paragraph_id=56, start=128, end=148), KiltDocAnchor(text='megafauna', href='megafauna', paragraph_id=56, start=192, end=201), KiltDocAnchor(text='mitochondrial', href='mtDNA%20control%20region', paragraph_id=57, start=11, end=24), KiltDocAnchor(text='Y chromosome', href='Y%20chromosome', paragraph_id=57, start=35, end=47), KiltDocAnchor(text='microsatellite', href='Microsatellite%23Analysis%20of%20microsatellites', paragraph_id=57, start=53, end=67), KiltDocAnchor(text='ecotype', href='ecotype', paragraph_id=57, start=313, end=320), KiltDocAnchor(text='boreal', href='Taiga', paragraph_id=57, start=351, end=357), KiltDocAnchor(text='Pleistocene', href='Pleistocene', paragraph_id=57, start=689, end=700), KiltDocAnchor(text='Beringian wolves', href='Beringian%20wolf', paragraph_id=57, start=701, end=717), KiltDocAnchor(text='megafauna', href='megafauna', paragraph_id=57, start=841, end=850), KiltDocAnchor(text='Ellesmere Island', href='Ellesmere%20Island', paragraph_id=58, start=26, end=42), KiltDocAnchor(text='Pleistocene', href='Pleistocene', paragraph_id=62, start=607, end=618), KiltDocAnchor(text='Tibetan 
Plateau', href='Tibetan%20Plateau', paragraph_id=64, start=88, end=103), KiltDocAnchor(text='New World', href='New%20World', paragraph_id=64, start=115, end=124), KiltDocAnchor(text='Phylogenetic', href='Phylogenetic', paragraph_id=65, start=0, end=12), KiltDocAnchor(text='mDNA', href='mitochondrial%20DNA', paragraph_id=65, start=44, end=48), KiltDocAnchor(text='haplotype', href='haplotype', paragraph_id=65, start=49, end=58), KiltDocAnchor(text='monophyletic', href='Monophyly', paragraph_id=65, start=78, end=90), KiltDocAnchor(text='clades', href='clades', paragraph_id=65, start=91, end=97), KiltDocAnchor(text='haplogroups', href='haplogroups', paragraph_id=65, start=354, end=365), KiltDocAnchor(text='Sami', href='Sami%20people', paragraph_id=69, start=226, end=230), KiltDocAnchor(text='Finnish Lapphund', href='Finnish%20Lapphund', paragraph_id=69, start=247, end=263), KiltDocAnchor(text='Swedish Lapphund', href='Swedish%20Lapphund', paragraph_id=69, start=265, end=281), KiltDocAnchor(text='Lapponian Herder', href='Lapponian%20Herder', paragraph_id=69, start=283, end=299), KiltDocAnchor(text='Jamthund', href='Jamthund', paragraph_id=69, start=301, end=309), KiltDocAnchor(text='Norwegian Elkhound', href='Norwegian%20Elkhound', paragraph_id=69, start=311, end=329), KiltDocAnchor(text='Hällefors Elkhound', href='H%C3%A4llefors%20Elkhound', paragraph_id=69, start=334, end=352), KiltDocAnchor(text='Altai dog', href='Origin%20of%20the%20domestic%20dog%23Altai%20dog%20%26amp%3Bndash%3B%2033%2C000%20BP', paragraph_id=69, start=512, end=521), KiltDocAnchor(text='Japanese wolf', href='Honshu%20Wolf', paragraph_id=71, start=183, end=196), KiltDocAnchor(text='phenotypic', href='phenotype', paragraph_id=72, start=82, end=92), KiltDocAnchor(text='introgression', href='introgression', paragraph_id=72, start=153, end=166), KiltDocAnchor(text='β-defensin', href='Beta%20defensin%23genes', paragraph_id=73, start=742, end=752), KiltDocAnchor(text='Bering Land Bridge', 
href='Beringia', paragraph_id=76, start=439, end=457), KiltDocAnchor(text='admixture', href='Genetic%20admixture', paragraph_id=77, start=181, end=190), KiltDocAnchor(text='Saarloos wolfdog', href='Saarloos%20wolfdog', paragraph_id=77, start=196, end=212), KiltDocAnchor(text='Siberian husky', href='Siberian%20husky', paragraph_id=77, start=473, end=487), KiltDocAnchor(text='Greenland dog', href='Greenland%20dog', paragraph_id=77, start=492, end=505), KiltDocAnchor(text='Shar Pei', href='Shar%20Pei', paragraph_id=77, start=589, end=597), KiltDocAnchor(text='Finnish spitz', href='Finnish%20spitz', paragraph_id=77, start=602, end=615), KiltDocAnchor(text='introgression', href='introgression', paragraph_id=77, start=1144, end=1157), KiltDocAnchor(text='Newgrange dog', href='Origin%20of%20the%20domestic%20dog%23Two%20domestication%20events', paragraph_id=79, start=527, end=540), KiltDocAnchor(text='Charles Darwin', href='Charles%20Darwin', paragraph_id=81, start=0, end=14), KiltDocAnchor(text='selective breeding', href='selective%20breeding', paragraph_id=81, start=185, end=203), KiltDocAnchor(text='natural selection', href='natural%20selection', paragraph_id=81, start=323, end=340), KiltDocAnchor(text='Dmitry Belyayev', href='Dmitry%20Belyayev%20%28zoologist%29', paragraph_id=82, start=139, end=154), KiltDocAnchor(text='mink', href='mink', paragraph_id=82, start=902, end=906), KiltDocAnchor(text='Japanese quail', href='Japanese%20quail', paragraph_id=82, start=911, end=925), KiltDocAnchor(text='positive selection', href='positive%20selection', paragraph_id=83, start=770, end=788), KiltDocAnchor(text='quantitative trait loci', href='Quantitative%20trait%20locus', paragraph_id=84, start=220, end=243), KiltDocAnchor(text='ghost population', href='ghost%20population', paragraph_id=85, start=169, end=185), KiltDocAnchor(text='Pleistocene', href='Pleistocene', paragraph_id=85, start=210, end=221), KiltDocAnchor(text='genetic loci', href='Locus%20%28genetics%29', 
paragraph_id=85, start=355, end=367), KiltDocAnchor(text='domestication islands', href='domestication%20islands', paragraph_id=85, start=551, end=572), KiltDocAnchor(text='synaptic plasticity', href='synaptic%20plasticity', paragraph_id=86, start=151, end=170), KiltDocAnchor(text='adrenaline', href='Epinephrine%23Mechanism%20of%20action', paragraph_id=88, start=492, end=502), KiltDocAnchor(text='noradrenaline', href='Neuromodulation', paragraph_id=88, start=507, end=520), KiltDocAnchor(text='biosynthesis', href='biosynthesis', paragraph_id=88, start=521, end=533), KiltDocAnchor(text='catecholamine', href='catecholamine', paragraph_id=88, start=664, end=677), KiltDocAnchor(text='dopamine', href='dopamine', paragraph_id=88, start=694, end=702), KiltDocAnchor(text='noradrenaline', href='noradrenaline', paragraph_id=88, start=707, end=720), KiltDocAnchor(text='neural crest', href='neural%20crest', paragraph_id=89, start=301, end=313), KiltDocAnchor(text='central nervous system', href='central%20nervous%20system', paragraph_id=89, start=318, end=340), KiltDocAnchor(text='embryogenesis', href='embryogenesis', paragraph_id=89, start=373, end=386), KiltDocAnchor(text='domestication syndrome', href='domestication%20syndrome', paragraph_id=89, start=558, end=580), KiltDocAnchor(text='AMY2B', href='AMY2B', paragraph_id=91, start=0, end=5), KiltDocAnchor(text='starch', href='starch', paragraph_id=91, start=117, end=123), KiltDocAnchor(text='glycogen', href='glycogen', paragraph_id=91, start=128, end=136), KiltDocAnchor(text='Saluki', href='Saluki', paragraph_id=91, start=470, end=476), KiltDocAnchor(text='Fertile Crescent', href='Fertile%20Crescent', paragraph_id=91, start=505, end=521), KiltDocAnchor(text='carnivorous', href='carnivorous', paragraph_id=91, start=996, end=1007), KiltDocAnchor(text='dingo', href='dingo', paragraph_id=91, start=1218, end=1223), KiltDocAnchor(text='genome', href='genome', paragraph_id=92, start=28, end=34), KiltDocAnchor(text='lipid', 
href='lipid', paragraph_id=92, start=213, end=218), KiltDocAnchor(text='Mammoth steppe', href='Mammoth%20steppe', paragraph_id=92, start=1018, end=1032), KiltDocAnchor(text='New Guinea singing dog', href='New%20Guinea%20singing%20dog', paragraph_id=94, start=234, end=256), KiltDocAnchor(text='New Guinea', href='New%20Guinea', paragraph_id=94, start=321, end=331), KiltDocAnchor(text='dingo', href='dingo', paragraph_id=94, start=344, end=349), KiltDocAnchor(text='Outback', href='Outback', paragraph_id=94, start=366, end=373), KiltDocAnchor(text='Bonobo', href='Bonobo', paragraph_id=95, start=156, end=162), KiltDocAnchor(text='chimpanzee', href='Common%20chimpanzee', paragraph_id=95, start=170, end=180), KiltDocAnchor(text='guinea pig', href='guinea%20pig', paragraph_id=95, start=539, end=549), KiltDocAnchor(text='cavy', href='cavy', paragraph_id=95, start=600, end=604), KiltDocAnchor(text='convergent evolution', href='convergent%20evolution', paragraph_id=97, start=67, end=87), KiltDocAnchor(text='pathology', href='pathology', paragraph_id=97, start=320, end=329), KiltDocAnchor(text='genes', href='genes', paragraph_id=99, start=22, end=27), KiltDocAnchor(text='parallel evolution', href='parallel%20evolution', paragraph_id=99, start=358, end=376), KiltDocAnchor(text='hypoxia', href='Hypoxia%20%28environmental%29', paragraph_id=99, start=432, end=439), KiltDocAnchor(text='serotonin', href='serotonin', paragraph_id=99, start=610, end=619), KiltDocAnchor(text='Convergent evolution', href='Convergent%20evolution', paragraph_id=102, start=0, end=20), KiltDocAnchor(text='penguin', href='penguin', paragraph_id=102, start=134, end=141), KiltDocAnchor(text='dolphin', href='dolphin', paragraph_id=102, start=147, end=154), KiltDocAnchor(text='flippers', href='flipper%20%28anatomy%29', paragraph_id=102, start=185, end=193), KiltDocAnchor(text='great ape', href='great%20ape', paragraph_id=102, start=756, end=765), KiltDocAnchor(text='human communication', 
href='human%20communication', paragraph_id=103, start=117, end=136), KiltDocAnchor(text='coevolution', href='coevolution', paragraph_id=104, start=22, end=33), KiltDocAnchor(text='eye contact', href='eye%20contact', paragraph_id=105, start=74, end=85), KiltDocAnchor(text='gaze', href='gaze', paragraph_id=105, start=94, end=98), KiltDocAnchor(text='oxytocin', href='oxytocin', paragraph_id=105, start=110, end=118), KiltDocAnchor(text='maternal bond', href='maternal%20bond', paragraph_id=105, start=194, end=207), KiltDocAnchor(text='sentient', href='sentient', paragraph_id=105, start=572, end=580), KiltDocAnchor(text='love', href='love', paragraph_id=105, start=1308, end=1312), KiltDocAnchor(text='territoriality', href='territoriality', paragraph_id=107, start=240, end=254), KiltDocAnchor(text='African wild dog', href='African%20wild%20dog', paragraph_id=108, start=816, end=832), KiltDocAnchor(text='dhole', href='dhole', paragraph_id=108, start=847, end=852), KiltDocAnchor(text='ungulate', href='ungulate', paragraph_id=108, start=1113, end=1121), KiltDocAnchor(text='mammoth steppe', href='mammoth%20steppe', paragraph_id=109, start=7, end=21), KiltDocAnchor(text='food chain', href='food%20chain', paragraph_id=109, start=146, end=156), KiltDocAnchor(text='reindeer', href='reindeer', paragraph_id=109, start=219, end=227), KiltDocAnchor(text='Last Glacial Maximum', href='Last%20Glacial%20Maximum', paragraph_id=109, start=757, end=777), KiltDocAnchor(text='carnivore', href='carnivore', paragraph_id=111, start=49, end=58), KiltDocAnchor(text='phenotypic', href='Phenotype', paragraph_id=111, start=137, end=147), KiltDocAnchor(text='dog breeds', href='dog%20breeds', paragraph_id=111, start=191, end=201), KiltDocAnchor(text='artificial selection', href='artificial%20selection', paragraph_id=111, start=209, end=229), KiltDocAnchor(text='poodle', href='poodle', paragraph_id=111, start=304, end=310), KiltDocAnchor(text='mastiff', href='mastiff', paragraph_id=111, start=322, 
end=329), KiltDocAnchor(text='genetic divergence', href='genetic%20divergence', paragraph_id=111, start=885, end=903), KiltDocAnchor(text='First World War', href='World%20War%20I', paragraph_id=113, start=27, end=42), KiltDocAnchor(text='Oberkassel, Bonn', href='Oberkassel%2C%20Bonn', paragraph_id=113, start=107, end=123), KiltDocAnchor(text='mitochondrial DNA', href='mitochondrial%20DNA', paragraph_id=113, start=628, end=645), KiltDocAnchor(text='pathology', href='pathology', paragraph_id=114, start=528, end=537), KiltDocAnchor(text='canine distemper', href='canine%20distemper', paragraph_id=114, start=616, end=632), KiltDocAnchor(text='Upper Paleolithic', href='Upper%20Paleolithic', paragraph_id=116, start=11, end=28), KiltDocAnchor(text='Petroglyph', href='Petroglyph', paragraph_id=118, start=0, end=10), KiltDocAnchor(text='Shuwaymis', href='Rock%20Art%20in%20the%20Ha%27il%20Region', paragraph_id=118, start=56, end=65), KiltDocAnchor(text='Jubbah', href='Jubbah%2C%20Saudi%20Arabia', paragraph_id=118, start=70, end=76), KiltDocAnchor(text='Late Pleistocene', href='Late%20Pleistocene', paragraph_id=118, start=232, end=248), KiltDocAnchor(text='Holocene', href='Holocene', paragraph_id=118, start=264, end=272), KiltDocAnchor(text='Holocene', href='Holocene', paragraph_id=118, start=477, end=485), KiltDocAnchor(text='Illinois', href='Illinois', paragraph_id=120, start=57, end=65), KiltDocAnchor(text='radiocarbon dating', href='radiocarbon%20dating', paragraph_id=120, start=70, end=88), KiltDocAnchor(text='Koster Site', href='Koster%20Site', paragraph_id=120, start=154, end=165), KiltDocAnchor(text='Illinois River', href='Illinois%20River', paragraph_id=120, start=181, end=195), KiltDocAnchor(text='Greene County', href='Greene%20County%2C%20Illinois', paragraph_id=120, start=199, end=212), KiltDocAnchor(text='Pike County', href='Pike%20County%2C%20Illinois', paragraph_id=120, start=267, end=278), KiltDocAnchor(text='Isotope analysis', href='Isotope%20analysis', 
            paragraph_id=120, start=408, end=424),
            KiltDocAnchor(text='collagen', href='collagen', paragraph_id=120, start=570, end=578),
            KiltDocAnchor(text='Zhokhov Island', href='Zhokhov%20Island', paragraph_id=121, start=197, end=211),
            KiltDocAnchor(text='Siberian husky', href='Siberian%20husky', paragraph_id=123, start=493, end=507),
            KiltDocAnchor(text="Naumann's elephant", href='Palaeoloxodon%20naumanni', paragraph_id=126, start=136, end=154),
            KiltDocAnchor(text='sika deer', href='sika%20deer', paragraph_id=126, start=201, end=210),
            KiltDocAnchor(text='wild boar', href='wild%20boar', paragraph_id=126, start=215, end=224),
            KiltDocAnchor(text='Jōmon', href='J%C5%8Dmon', paragraph_id=126, start=364, end=369),
            KiltDocAnchor(text='Honshu', href='Honshu', paragraph_id=126, start=412, end=418),
            KiltDocAnchor(text='Holarctic', href='Holarctic', paragraph_id=128, start=184, end=193)),
            # trailing fields of the expected doc record — presumably section titles,
            # ids/revision/timestamp and source url; confirm against the KiltDoc* type
            ('Origins', 'Dogs'), 'Q39110', '908221312', '2019-07-28T08:49:25Z', '908221228', '5141410',
            'https://en.wikipedia.org/w/index.php?title=Origin%20of%20the%20domestic%20dog&oldid=908221312'),
        })

    def test_queries(self):
        """Spot-check query records (by position) and total query counts for
        kilt/codec and each of its per-domain subsets."""
        self._test_queries('kilt/codec', count=42, items={
            0: CodecQuery('economics-1', "How has the UK's Open Banking Regulation benefited challenger banks?", 'finance', 'UK’s Open Banking regulation, which has parallels to the EU’s second payment service directive (PSD2), went live in January 2018. This piece of legislation “will require banks to open their payments infrastructure and customer data assets to third parties”. As a result, banks no longer have a monopoly on user data if clients grant permission. \n\nChallenger banks are small, recently created retail banks that compete directly with the longer-established banks in the UK. Specifically, seeking market share from the "big four" UK retail banks (Barclays, HSBC, Lloyds Banking Group, and NatWest Group). The banks distinguish themselves from the historic banks by modern financial technology practices, such as online-only operations, that avoid the costs and complexities of traditional banking. The largest UK-operating challenger banks include Atom Bank, Revolut, Starling, N26, and Tide.\n\nRelevant documents and entities will discuss how challenger banks have used open banking to develop new products or capture market share from traditional retail banks in the UK.'),
            9: CodecQuery('economics-18', 'Was the crash that followed the dot-com bubble an overreaction considering the ultimate success of the internet?', 'finance', 'The dot-com buddle from 1995-2000 saw incredible growth in stocks that were considered to have anything to do with the internet. NASDAQ rose 400% only to fall 78% from its peak by 2002. Several high-profile failures included Pets.com, Boo.com, Woldcom, and Global Crossing. However, many internet-based companies survived and went on to thrive, i.e. eBay, Amazon, Qualcomm, Cisco Systems. Venture Capital pulled back from the internet space for a period.\n\nHowever, with FANNG stocks and incredible software/internet-based companies performance from the mid-2000s until the present day, it could be argued that the direction of the dot-com bubble was absolutely correct. Nonetheless, the magnitude of equity growth and capital allocation to companies with limited commercial value was irrational and required a market correction.'),
            41: CodecQuery('politics-23', 'Is the rise of European populism a threat to the European Union?', 'politics', '"Populist" is a broad term that describes, typically a politician, who targets people who feel that established elite groups disregard their concerns. Some critics highlight negative connotations, including criticising foreign migration or minorities.\n\nIn recent years, populism has been stronger in Eastern Europe, i.e. Bulgaria, Hungary, Austrian, and Poland, etc. have seen the rise of populist politicians. Right-wing populist movements have also gained momentum in France, Spain, the United Kingdom and other parts of Europe. In Hungary and Poland, some critics argue this has led to an erosion of the rule of law, increased persecution, and authoritarianism.\n\nEuropean Union is a political union of democratic nations. However, radical right-wing politics reject what the EU stands for and how it works, i.e. against European supranational integration and push for national policies. Populists also criticise the EU\'s perceived bureaucracy and failures - common arguments during the Brexit Leave campaign. The EU is founded based on shared democratic values that countries need to be relatively ideologically aligned to function within a political union. There is also the threat that disillusioned Eastern European countries will turn away from the EU and toward Russia.'),
        })
        self._test_queries('kilt/codec/economics', count=14, items={
            0: CodecQuery('economics-1', "How has the UK's Open Banking Regulation benefited challenger banks?", 'finance', 'UK’s Open Banking regulation, which has parallels to the EU’s second payment service directive (PSD2), went live in January 2018. This piece of legislation “will require banks to open their payments infrastructure and customer data assets to third parties”. As a result, banks no longer have a monopoly on user data if clients grant permission. \n\nChallenger banks are small, recently created retail banks that compete directly with the longer-established banks in the UK. Specifically, seeking market share from the "big four" UK retail banks (Barclays, HSBC, Lloyds Banking Group, and NatWest Group). The banks distinguish themselves from the historic banks by modern financial technology practices, such as online-only operations, that avoid the costs and complexities of traditional banking. The largest UK-operating challenger banks include Atom Bank, Revolut, Starling, N26, and Tide.\n\nRelevant documents and entities will discuss how challenger banks have used open banking to develop new products or capture market share from traditional retail banks in the UK.'),
            9: CodecQuery('economics-18', 'Was the crash that followed the dot-com bubble an overreaction considering the ultimate success of the internet?', 'finance', 'The dot-com buddle from 1995-2000 saw incredible growth in stocks that were considered to have anything to do with the internet. NASDAQ rose 400% only to fall 78% from its peak by 2002. Several high-profile failures included Pets.com, Boo.com, Woldcom, and Global Crossing. However, many internet-based companies survived and went on to thrive, i.e. eBay, Amazon, Qualcomm, Cisco Systems. Venture Capital pulled back from the internet space for a period.\n\nHowever, with FANNG stocks and incredible software/internet-based companies performance from the mid-2000s until the present day, it could be argued that the direction of the dot-com bubble was absolutely correct. Nonetheless, the magnitude of equity growth and capital allocation to companies with limited commercial value was irrational and required a market correction.'),
            13: CodecQuery('economics-23', 'Offering non-accounting services arguably creates a conflict of interest for the Big Four. Is this the reason for their inability to uncover recent financial scandals?', 'finance', 'The Big Four are the four largest global accounting firms that dominate corporate accounting, i.e. Deloitte, Ernst & Young (EY), PricewaterhouseCoopers (PwC), and Klynveld Peat Marwick Goerdeler (KPMG). As well as offering accounting services, these firms also over other many higher-margin services such as tax, consultancy, and technology services (~80% revenue). \n\nInternal controls (firewalls, departments, etc.) are meant to prohibit any cooperation between audit and non-audit services in winning customer contracts or favour. However, critics would argue that this does not go far enough and can lead to less rigorous audits, calling for the Big Four to be broken up. Critics highlight poor practices in recent accounting scandals, including Wirecard, Carillion, Satyam Computer Services, South Africa examples, etc. \n\nHowever, it should be highlighted that even if the Big Four were separated, this would not necessarily lead to no accounting scandals. Some fraudulent practices may be hard to catch even if objective and following best practices. Also, there are other areas of possible conflict of interest that non-accounting work, including how the corporate hires the auditor.'),
        })
        self._test_queries('kilt/codec/history', count=14, items={
            0: CodecQuery('history-1', 'Would the United Kingdom have been ready for WWII without the time gained through Appeasement?', 'history', "Many argue Britain's army was depleted in the early 1930s and stretched across the globe. UK defence spending had fallen significantly during the 1920s, from over £700 million in 1919 to 100 million in 1931.\n\nBetween 1934 and 1939, the UK launched a substantial programme of re-arming, recognising that war with Hitler was becoming increasingly likely. Although Appeasement was also motivated by Chamberlain's desire to end war, some argue this meant that the UK was more prepared in 1939 when war eventually broke out. \n\nDespite these efforts, Germany was still better prepared for war under Hilter's single-minded preparation since he came to power in 1933. However, without Appeasement, the differential might have been much worse."),
            9: CodecQuery('history-19', 'How close did the world come to nuclear war during the Cuban Missile Crisis?', 'history', 'During the Cuban Missile Crisis, leaders of the United States and the Soviet Union engaged in a 13-day political and military standoff in October 1962 over the installation of nuclear-armed Soviet missiles on Cuba. This was the peak of the Cold War and a high-stakes political and military situation, given the potential devastation of nuclear weapons. However, exactly how close we came to nuclear armageddon is still debated. \n\nThe political leaders within this crisis were JFK for the United States, Nikita Khrushchev of the Soviet Union, and Fidel Casto of Cuba. Many highlights that JFK and Khrushchev were measured in their leadership styles and understood nuclear war meant mutual destruction. Some highlight exchanging of letters and other communications to prevent a nuclear war. Nonetheless, many highlight the mistrust and fear between both sides and how a single false move could have led to a disaster. Both sides were actively preparing for a nuclear war, and some within each camp through nuclear strikes was likely.'),
            13: CodecQuery('history-25', 'How responsible was Rasputin for the fall of the Romanov dynasty?', 'history', "The Russian Tsar, Nicholas II, abdicated from power in 1917, bringing the 300-year-old Romanov dynasty to an end. Some historians suggest that Grigori Rasputin's scandalous reputation helped discredit the Tsarist government and helped to lay the foundation for the Russian Revolution. \n\nHowever, many historians argue that although Rasputin was a useful propaganda tool against the Tsar, much larger factors were at play. For example, Nicholas II was viewed as a weak leader, the Russo-Japanese War, Bloody Sunday, Tsarina unpopularity, and WWI. There was also significant economic issues, including inflation and food shortages."),
        })
        self._test_queries('kilt/codec/politics', count=14, items={
            0: CodecQuery('politics-1', 'Is Scottish Independence inevitable?', 'politics', "This questions focuses on the long-term political, economic and social reasoning behind whether Scotland will likely become independent. Short-term facts and opinions are less central to this question.\n\nSome argue that Scottish independence is inevitable given the surge of support towards SNP in recent decades. Labour has lost political weight in Scotland since the Blair era, and Conservatives historically struggle to penetrate a more left-wing Scottish demographic. Brexit further exacerbated this political unease, i.e. right-leaning Britain and left-leaning Scotland. While the younger demographic is more likely to be independence supporters and the older demographic is more likely to be pro-union.\n\nThere are several arguments that Scotland independence is not inevitable. Historically there has been a union for 300+ years covering a full range of circumstances (world wars, successes). Economically, Britain funds lots of Scottish spending through Barnett-based funding, Scotland's oil is less valuable than previously, and there is much economic uncertainly around currency and debt. Some also argue that Scotland has large political independence due to devolution; thus, independence is unnecessary."),
            9: CodecQuery('politics-16', 'Why did Hilary Clinton lose the 2016 US presidential election?', 'politics', 'Hillary Clinton lost the 2016 US presidential election to Donald Trump in 2016. Political commentators highlight many reasons for Clinton\'s loss. \n\nFor example, Donald Trump managed to craft a strong populist message that resonated with many voters who were disenfranchised with current politics, particularly with the "political elite" who some felt Clinton represented. The Democratats were also divided, specifically far-left factions led by Bernie Sanders. Political gridlock under an Obama Administration that, rightly or wrongly, some felt the public wanting change. Hilary also had relatively low personal approval ratings, which were not helped by the FBI investigation into her use of email. Some also point to external factors, including Russian interference.'),
            13: CodecQuery('politics-23', 'Is the rise of European populism a threat to the European Union?', 'politics', '"Populist" is a broad term that describes, typically a politician, who targets people who feel that established elite groups disregard their concerns. Some critics highlight negative connotations, including criticising foreign migration or minorities.\n\nIn recent years, populism has been stronger in Eastern Europe, i.e. Bulgaria, Hungary, Austrian, and Poland, etc. have seen the rise of populist politicians. Right-wing populist movements have also gained momentum in France, Spain, the United Kingdom and other parts of Europe. In Hungary and Poland, some critics argue this has led to an erosion of the rule of law, increased persecution, and authoritarianism.\n\nEuropean Union is a political union of democratic nations. However, radical right-wing politics reject what the EU stands for and how it works, i.e. against European supranational integration and push for national policies. Populists also criticise the EU\'s perceived bureaucracy and failures - common arguments during the Brexit Leave campaign. The EU is founded based on shared democratic values that countries need to be relatively ideologically aligned to function within a political union. There is also the threat that disillusioned Eastern European countries will turn away from the EU and toward Russia.'),
        })

    def test_qrels(self):
        """Spot-check qrels (query id, doc id, relevance, iteration) and total
        qrel counts for kilt/codec and each per-domain subset."""
        self._test_qrels('kilt/codec', count=11323, items={
            0: TrecQrel('economics-8', '10489969', 0, 'Q0'),
            9: TrecQrel('economics-8', '1380940', 1, 'Q0'),
            11322: TrecQrel('politics-23', '9800598', 1, 'Q0'),
        })
        self._test_qrels('kilt/codec/economics', count=1970, items={
            0: TrecQrel('economics-8', '089a22846f6ba15fb4ef4cca0a884dd4', 2, 'Q0'),
            9: TrecQrel('economics-8', '24de81ea95c7df32941e8bd200d3528a', 2, 'Q0'),
            1969: TrecQrel('economics-23', 'ebd36155b9808933bbbfc26af6d18dec', 1, 'Q0'),
        })
        self._test_qrels('kilt/codec/history', count=2024, items={
            0: TrecQrel('history-20', '00aa648a657bdf73369bcb093030cc41', 0, 'Q0'),
            9: TrecQrel('history-20', '0d239d8dea605cd079d1a0144aa6ed46', 1, 'Q0'),
            2023: TrecQrel('history-25', 'dba9433750c2505f2a6a69661d4eb2fd', 0, 'Q0'),
        })
        self._test_qrels('kilt/codec/politics', count=2192, items={
            0: TrecQrel('politics-12', '02d8176599da0bfc15caaf7e0b3bba6b', 3, 'Q0'),
            9: TrecQrel('politics-12', '0e23e45de303e5f440a5a17ccd7974ec', 1, 'Q0'),
            2191: TrecQrel('politics-23', 'f7975664f7fc45416c7256d12d06fbdf', 1, 'Q0'),
        })


if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/integration/lotte.py
================================================
import re
import unittest

import ir_datasets
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
from .base import DatasetIntegrationTest


class TestLotte(DatasetIntegrationTest):
    """Integration tests for the LoTTE datasets: verifies document, query and
    qrel counts plus exact records at spot-checked positions for each
    domain (lifestyle/recreation/science/technology/writing/pooled) and
    split (dev/test). Long documents are matched via regexes that pin the
    prefix and suffix and elide the middle with `.{N}`; flags=48 is
    re.DOTALL | re.UNICODE."""

    def test_docs(self):
        self._test_docs('lotte/lifestyle/dev', count=268893, items={
            0: GenericDoc('0', re.compile('^In my experience rabbits are very easy to housebreak\\. They like to pee and poop in the same place ev.{1292}oiding kicking soiled litter out of the box, which is the biggest cause of failure in my experience\\.$', flags=48)),
            9: GenericDoc('9', re.compile('^There was a trick Cesar Milan \\(the Dog Whisperer\\) used to break a puppy of their fear of living in a.{445}sperer: http://movies\\.netflix\\.com/WiMovie/The_Very_Best_of_Dog_Whisperer_with_Cesar_Millan/70270440\\.$', flags=48)),
            268892: GenericDoc('268892', re.compile("^I would say there are two good choices, and several other choices\\. Find another junk yard or search .{117}d flat several years ago and it must not have been very expensive because I can't remember the cost\\.$", flags=48)),
        })
        self._test_docs('lotte/lifestyle/test', count=119461, items={
            0: GenericDoc('0', re.compile('^Normal double\\-acting baking powder makes CO2 \\(thus giving a rising effect\\) in two ways: when it gets.{581}ween mixing and baking less critical, and this is the type most widely available to consumers today\\.$', flags=48)),
            9: GenericDoc('9', 'Try bacon! It smells wonderful cooked on a plank.'),
            119460: GenericDoc('119460', re.compile('^\\[Will try to find some links supporting my answer\\] Although goalies are awarded shutouts when their .{519}is a goal allowed while no goalie in the net, neither the team or the goalie is awarded the shutout\\.$', flags=48)),
        })
        self._test_docs('lotte/recreation/dev', count=263025, items={
            0: GenericDoc('0', re.compile('^Multiclassing no longer takes an XP hit, and your favored class gives you one of two bonuses at ever.{1} level: \\+1 hp \\+1 skill point Advanced Players Guide add other options for specific Race/Class combos$', flags=48)),
            9: GenericDoc('9', re.compile("^Well, if we're only counting the WotC ones, in no particular order: Dark Sun \\-\\- 3\\.x material in Drag.{1609}not listing all the third party settings\\. Lots of those! A full list can also be found on Wikipedia\\.$", flags=48)),
            263024: GenericDoc('263024', re.compile('^Reflected light meters by necessity assume a normal reflectance level\\. At one time this was referred.{493} dress exposure more toward gray\\. Incident metering removes the guesswork as to reflectance effects\\.$', flags=48)),
        })
        self._test_docs('lotte/recreation/test', count=166975, items={
            0: GenericDoc('0', "No, you can fight the sword master without knowing all the comebacks. It's all a matter of luck which insults she throws at you."),
            9: GenericDoc('9', re.compile('^My suggestion would be to install DOSBox, and run it through the emulator\\. I use DOSBox for several .{146}er=S Here is a good guide for getting DOSBox setup: http://vogons\\.zetafleet\\.com/viewtopic\\.php\\?t=2502$', flags=48)),
            166974: GenericDoc('166974', re.compile("^As explained by EW: He's ultimately arrested because Cassie had a backup plan \\(a cache of incriminat.{332}\\. Whatever Cassie sent as proof of her disappearance \\(we never see it\\), the police find her remains\\.$", flags=48)),
        })
        self._test_docs('lotte/science/dev', count=343642, items={
            0: GenericDoc('0', re.compile("^As you move from left to right across a period, the number of protons in the nucleus increases\\. The .{347}lence electrons from the attractive effects of the atom's nucleus, so the atomic radius gets larger\\.$", flags=48)),
            9: GenericDoc('9', re.compile("^It's a very general statement, but it's not always true\\. I'll explain why it's often true, and give .{917}lly as the mix changes from pure silver to pure gold\\. The corresponding phase diagram is as follows:$", flags=48)),
            343641: GenericDoc('343641', re.compile('^Let\'s take a cold, hard \\(quasi\\-Kantian\\) look at what you\'re suggesting\\. So, if there is a one\\-in\\-a\\-b.{1851} to be\\." If we stop making up our minds to be miserable, suffering wretches, that\'s half the battle\\.$', flags=48)),
        })
        self._test_docs('lotte/science/test', count=1694164, items={
            0: GenericDoc('0', "More or Less is a BBC Radio 4 programme about maths and statistics in the news, and there is a free podcast. It's presented by Tim Harford, the Undercover Economist from the Financial Times."),
            9: GenericDoc('9', re.compile("^You can use Binet's formula, described at http://mathworld\\.wolfram\\.com/BinetsFibonacciNumberFormula\\..{5}\\(see also Wikipedia for a proof: http://en\\.wikipedia\\.org/wiki/Binet_formula\\#Closed_form_expression \\)$", flags=48)),
            1694163: GenericDoc('1694163', re.compile('^I think that "morph" is a substantially generic term that it would cover environmental as well as ge.{432}his is a well\\-known concept in entomology and no one will be angry/surprised when you use this term\\.$', flags=48)),
        })
        self._test_docs('lotte/technology/dev', count=1276222, items={
            0: GenericDoc('0', re.compile('^You definitely need some sort of software to filter out the noise\\. Some of the other answers here ca.{142}aker\\. By the way, just in case anyone is curious, these can be used for small desktop speakers, too\\.$', flags=48)),
            9: GenericDoc('9', re.compile("^su is a command to change to another user, either to run a shell or execute a specific command\\. You .{1647}meone leaves, or assuming that it's ok for them to have full access to your systems after you leave\\.$", flags=48)),
            1276221: GenericDoc('1276221', 'Try these formulas: Max: =MAXIFS(B3:B10,B3:B10,"<>#N/A") Min: =MINIFS(B3:B10,B3:B10,"<>#N/A") Sample'),
        })
        self._test_docs('lotte/technology/test', count=638509, items={
            0: GenericDoc('0', re.compile("^One option would be to clone your startup drive to an external disk using something like SuperDuper!.{217}y to run the repair\\. After you're done, re\\-select the internal drive as the Startup Disk and reboot\\.$", flags=48)),
            9: GenericDoc('9', re.compile('^This is so called "Hibernation" \\(my first met in windows\\)\\. When battery dies, the OS dumps whole RAM.{90}p, it loads the information back from HDD to RAM \\(hence you see the progress with those white bars\\)\\.$', flags=48)),
            638508: GenericDoc('638508', re.compile("^First of all, you don't use NAT between private subnets\\. You only use NAT when it is required, most .{1247} 19\\) permitting everything \\- remember that ACL rules are applied on a first\\-hit base, top to bottom\\.$", flags=48)),
        })
        self._test_docs('lotte/writing/dev', count=277072, items={
            0: GenericDoc('0', 'A native speaker would interpret them as having the same meaning. You could say "I\'m ill," or you could say "I\'m sick". "I\'m ill" could be classed as more formal language.'),
            9: GenericDoc('9', re.compile("^From a British perspective, I'm ill is more common and general term for when you're unwell\\. Being si.{1}k can refer to actually throwing up or vomiting, but it can also be used for being generally unwell\\.$", flags=48)),
            277071: GenericDoc('277071', re.compile("^Any investor who gets his hands on too many is targeted by the others, who aren't too concerned with.{442}a\\. Three almost ensures that one of them will figure it out\\. Much better to be faithful to one book\\.$", flags=48)),
        })
        self._test_docs('lotte/writing/test', count=199994, items={
            0: GenericDoc('0', "It's the fifth element after earth, air, fire, and water, so it is presumably superior to those or completing those."),
            9: GenericDoc('9', re.compile('^Here is a good description of when to use shall: \\.\\.\\.shall is used for the future tense with the firs.{761}nd which social echelons using "shall" in statements is actually still practiced by native speakers\\.$', flags=48)),
            199993: GenericDoc('199993', re.compile('^The following is a summary of the entry for the suffix "\\-ee" in the OED: The \\-ee suffix has two func.{526}"bootee", "setee", "goatee", etc\\., and does not fit your context\\. In short \\- modifyee does not work\\.$', flags=48)),
        })
        # pooled subsets merge the per-domain corpora; spot-checked records match
        # the corresponding per-domain records above
        self._test_docs('lotte/pooled/dev', count=2428854, items={
            0: GenericDoc('0', 'A native speaker would interpret them as having the same meaning. You could say "I\'m ill," or you could say "I\'m sick". "I\'m ill" could be classed as more formal language.'),
            9: GenericDoc('9', re.compile("^From a British perspective, I'm ill is more common and general term for when you're unwell\\. Being si.{1}k can refer to actually throwing up or vomiting, but it can also be used for being generally unwell\\.$", flags=48)),
            2428853: GenericDoc('2428853', re.compile("^I would say there are two good choices, and several other choices\\. Find another junk yard or search .{117}d flat several years ago and it must not have been very expensive because I can't remember the cost\\.$", flags=48)),
        })
        self._test_docs('lotte/pooled/test', count=2819103, items={
            0: GenericDoc('0', "It's the fifth element after earth, air, fire, and water, so it is presumably superior to those or completing those."),
            9: GenericDoc('9', re.compile('^Here is a good description of when to use shall: \\.\\.\\.shall is used for the future tense with the firs.{761}nd which social echelons using "shall" in statements is actually still practiced by native speakers\\.$', flags=48)),
            2819102: GenericDoc('2819102', re.compile('^\\[Will try to find some links supporting my answer\\] Although goalies are awarded shutouts when their .{519}is a goal allowed while no goalie in the net, neither the team or the goalie is awarded the shutout\\.$', flags=48)),
        })

    def test_queries(self):
        # Each LoTTE subset has a 'forum' and a 'search' query set per split.
        self._test_queries('lotte/lifestyle/dev/forum', count=2076, items={
            0: GenericQuery('0', 'Why does my cat keep patting my face?'),
            9: GenericQuery('9', 'Is this normal first day home behavior for my kitten, or should I be concerned?'),
            2075: GenericQuery('2075', 'Direct Pull (V-Brake) vs. Center Pull Cantilevers (pros and cons)'),
        })
        self._test_queries('lotte/lifestyle/dev/search', count=417, items={
            0: GenericQuery('0', 'how much should i feed my 1 year old english mastiff?'),
            9: GenericQuery('9', 'is my corn snake male or female?'),
            416: GenericQuery('416', 'is there a difference between red and clear power steering fluid?'),
        })
        self._test_queries('lotte/lifestyle/test/forum', count=2002, items={
            0: GenericQuery('0', 'OK were all adults here, so really, how on earth should I use a squat toilet?'),
            9: GenericQuery('9', 'I dont know my nationality. How can I visit Denmark?'),
            2001: GenericQuery('2001', 'What is each side of a 4-sided grater for?'),
        })
        self._test_queries('lotte/lifestyle/test/search', count=661, items={
            0: GenericQuery('0', 'are clear pomegranate seeds good to eat?'),
            9: GenericQuery('9', 'is lumpy coconut milk ok?'),
            660: GenericQuery('660', 'is zone allowed in the nba?'),
        })
        self._test_queries('lotte/recreation/dev/forum', count=2002, items={
            0: GenericQuery('0', 'Would the One Ring even work for anyone but Sauron?'),
            9: GenericQuery('9', 'Which 2015 technologies were correctly predicted by Back to the Future II?'),
            2001: GenericQuery('2001', 'Does priority matter in Magic?'),
        })
        self._test_queries('lotte/recreation/dev/search', count=563, items={
            0: GenericQuery('0', 'do bards have to sing?'),
            9: GenericQuery('9', 'do attacks of opportunity stop movement?'),
            562: GenericQuery('562', 'are nikon and minolta lenses interchangeable?'),
        })
        self._test_queries('lotte/recreation/test/forum', count=2002, items={
            0: GenericQuery('0', 'How did they make cars fall apart in old movies?'),
            9: GenericQuery('9', 'Is the title The Last Jedi singular or plural?'),
            2001: GenericQuery('2001', 'Is there any specific reason why female voice actors act for male roles in anime?'),
        })
        self._test_queries('lotte/recreation/test/search', count=924, items={
            0: GenericQuery('0', 'how can you tell if someone blocked you on xbox one?'),
            9: GenericQuery('9', 'are xbox games compatible with ps4?'),
            923: GenericQuery('923', 'are laurel and hardy jewish?'),
        })
        self._test_queries('lotte/science/dev/forum', count=2013, items={
            0: GenericQuery('0', 'Making sense of principal component analysis, eigenvectors & eigenvalues'),
            9: GenericQuery('9', 'Bayesian and frequentist reasoning in plain English'),
            2012: GenericQuery('2012', 'How can I tell if I have simplified my talk too much?'),
        })
        self._test_queries('lotte/science/dev/search', count=538, items={
            0: GenericQuery('0', 'is sudan iv hydrophobic or hydrophilic?'),
            9: GenericQuery('9', 'how many atoms are present in one molecule of urea?'),
            537: GenericQuery('537', 'is both objective and subjective?'),
        })
        self._test_queries('lotte/science/test/forum', count=2017, items={
            0: GenericQuery('0', 'Cooling a cup of coffee with help of a spoon'),
            9: GenericQuery('9', 'Why dont metals bond when touched together?'),
            2016: GenericQuery('2016', 'Why does cracking a joint make noise?'),
        })
        self._test_queries('lotte/science/test/search', count=617, items={
            0: GenericQuery('0', 'mutually exclusive events are independent?'),
            9: GenericQuery('9', 'is tan x a function?'),
            616: GenericQuery('616', 'what is the relationship between polarity and hydrophobicity?'),
        })
        self._test_queries('lotte/technology/dev/forum', count=2003, items={
            0: GenericQuery('0', 'Strikethrough with GitHub Markdown'),
            9: GenericQuery('9', 'GitHub - Whats this Pro tag on my profile?'),
            2002: GenericQuery('2002', 'I have a hardware detection problem, what logs do I need to look into?'),
        })
        self._test_queries('lotte/technology/dev/search', count=916, items={
            0: GenericQuery('0', 'how many devices can you connect to bluetooth?'),
            9: GenericQuery('9', 'do docking stations have mac addresses?'),
            915: GenericQuery('915', 'what does it mean when someone is active but no green dot?'),
        })
        self._test_queries('lotte/technology/test/forum', count=2004, items={
            0: GenericQuery('0', 'Why does man print gimme gimme gimme at 00:30?'),
            9: GenericQuery('9', 'How do I grep for multiple patterns with pattern having a pipe character?'),
            2003: GenericQuery('2003', 'Can I automatically log in to open WiFi that requires web login/password?'),
        })
        self._test_queries('lotte/technology/test/search', count=596, items={
            0: GenericQuery('0', 'which ipods are no longer supported?'),
            9: GenericQuery('9', 'how to change the name of my apple pencil?'),
            595: GenericQuery('595', 'is ping tcp or udp?'),
        })
        self._test_queries('lotte/writing/dev/forum', count=2003, items={
            0: GenericQuery('0', 'The Rules of Writing'),
            9: GenericQuery('9', 'How do I translate into a gendered language where the gender would be a spoiler?'),
            2002: GenericQuery('2002', 'Can I say I Java, or does it have to be I do Java?'),
        })
        self._test_queries('lotte/writing/dev/search', count=497, items={
            0: GenericQuery('0', 'how are you doing lately meaning?'),
            9: GenericQuery('9', 'what is the difference between sign in and sign up?'),
            496: GenericQuery('496', 'can a tv screen be used as a camera?'),
        })
        self._test_queries('lotte/writing/test/forum', count=2000, items={
            0: GenericQuery('0', 'How do you quote a passage that has used [sic] mistakenly?'),
            9: GenericQuery('9', 'Is there a word or phrase for the feeling you get after looking at a word for too long?'),
            1999: GenericQuery('1999', 'Opposite of a diet'),
        })
        self._test_queries('lotte/writing/test/search', count=1071, items={
            0: GenericQuery('0', 'what is the difference between a college and an academy?'),
            9: GenericQuery('9', 'what is the difference between present continuous tense and past continuous tense?'),
            1070: GenericQuery('1070', 'what is the difference between pricey and pricey?'),
        })
        # pooled query sets aggregate the per-domain query sets
        self._test_queries('lotte/pooled/dev/forum', count=10097, items={
            0: GenericQuery('0', 'The Rules of Writing'),
            9: GenericQuery('9', 'How do I translate into a gendered language where the gender would be a spoiler?'),
            10096: GenericQuery('10096', 'Direct Pull (V-Brake) vs. Center Pull Cantilevers (pros and cons)'),
        })
        self._test_queries('lotte/pooled/dev/search', count=2931, items={
            0: GenericQuery('0', 'how are you doing lately meaning?'),
            9: GenericQuery('9', 'what is the difference between sign in and sign up?'),
            2930: GenericQuery('2930', 'is there a difference between red and clear power steering fluid?'),
        })
        self._test_queries('lotte/pooled/test/forum', count=10025, items={
            0: GenericQuery('0', 'How do you quote a passage that has used [sic] mistakenly?'),
            9: GenericQuery('9', 'Is there a word or phrase for the feeling you get after looking at a word for too long?'),
            10024: GenericQuery('10024', 'What is each side of a 4-sided grater for?'),
        })
        self._test_queries('lotte/pooled/test/search', count=3869, items={
            0: GenericQuery('0', 'what is the difference between a college and an academy?'),
            9: GenericQuery('9', 'what is the difference between present continuous tense and past continuous tense?'),
            3868: GenericQuery('3868', 'is zone allowed in the nba?'),
        })

    def test_qrels(self):
        # Spot-checks qrels for each LoTTE subset (method continues below).
        self._test_qrels('lotte/lifestyle/dev/forum', count=12823, items={
            0: TrecQrel('0', '116', 1, '0'),
            9: TrecQrel('1', '9573', 1, '0'),
            12822: TrecQrel('2075', '229252', 1, '0'),
        })
        self._test_qrels('lotte/lifestyle/dev/search', count=1376, items={
            0: TrecQrel('0', '1615', 1, '0'),
            9: TrecQrel('1', '7323', 1, '0'),
            1375: TrecQrel('416', '255775', 1, '0'),
        })
        self._test_qrels('lotte/lifestyle/test/forum', count=10278, items={
            0: TrecQrel('0', '50103', 1, '0'),
            9: TrecQrel('0', '60904', 1, '0'),
            10277: TrecQrel('2001', '29723', 1, '0'),
        })
        self._test_qrels('lotte/lifestyle/test/search', count=1804, items={
            0: TrecQrel('0', '14700', 1, '0'),
            9: TrecQrel('2', '5365', 1, '0'),
            1803: TrecQrel('660', '112612', 1, '0'),
        })
        self._test_qrels('lotte/recreation/dev/forum', count=12752, items={
            0: TrecQrel('0', '130975', 1, '0'),
            9: TrecQrel('0', '135480', 1, '0'),
            12751: TrecQrel('2001', '107742', 1, '0'),
        })
        self._test_qrels('lotte/recreation/dev/search',
count=1754, items={ 0: TrecQrel('0', '37577', 1, '0'), 9: TrecQrel('4', '26042', 1, '0'), 1753: TrecQrel('562', '224939', 1, '0'), }) self._test_qrels('lotte/recreation/test/forum', count=6947, items={ 0: TrecQrel('0', '156504', 1, '0'), 9: TrecQrel('2', '137362', 1, '0'), 6946: TrecQrel('2001', '128682', 1, '0'), }) self._test_qrels('lotte/recreation/test/search', count=1991, items={ 0: TrecQrel('0', '38021', 1, '0'), 9: TrecQrel('5', '67168', 1, '0'), 1990: TrecQrel('923', '145431', 1, '0'), }) self._test_qrels('lotte/science/dev/forum', count=12271, items={ 0: TrecQrel('0', '41234', 1, '0'), 9: TrecQrel('0', '43389', 1, '0'), 12270: TrecQrel('2012', '245863', 1, '0'), }) self._test_qrels('lotte/science/dev/search', count=1480, items={ 0: TrecQrel('0', '17292', 1, '0'), 9: TrecQrel('5', '28427', 1, '0'), 1479: TrecQrel('537', '331634', 1, '0'), }) self._test_qrels('lotte/science/test/forum', count=15515, items={ 0: TrecQrel('0', '1468504', 1, '0'), 9: TrecQrel('0', '1468548', 1, '0'), 15514: TrecQrel('2016', '1677617', 1, '0'), }) self._test_qrels('lotte/science/test/search', count=1738, items={ 0: TrecQrel('0', '396636', 1, '0'), 9: TrecQrel('1', '417869', 1, '0'), 1737: TrecQrel('616', '1675847', 1, '0'), }) self._test_qrels('lotte/technology/dev/forum', count=15741, items={ 0: TrecQrel('0', '1248849', 1, '0'), 9: TrecQrel('1', '1274089', 1, '0'), 15740: TrecQrel('2002', '658976', 1, '0'), }) self._test_qrels('lotte/technology/dev/search', count=2676, items={ 0: TrecQrel('0', '281401', 1, '0'), 9: TrecQrel('3', '314964', 1, '0'), 2675: TrecQrel('915', '1270172', 1, '0'), }) self._test_qrels('lotte/technology/test/forum', count=15890, items={ 0: TrecQrel('0', '319429', 1, '0'), 9: TrecQrel('1', '310098', 1, '0'), 15889: TrecQrel('2003', '133429', 1, '0'), }) self._test_qrels('lotte/technology/test/search', count=2045, items={ 0: TrecQrel('0', '101347', 1, '0'), 9: TrecQrel('1', '101391', 1, '0'), 2044: TrecQrel('595', '637834', 1, '0'), }) 
self._test_qrels('lotte/writing/dev/forum', count=15098, items={ 0: TrecQrel('0', '113455', 1, '0'), 9: TrecQrel('0', '113612', 1, '0'), 15097: TrecQrel('2002', '83828', 1, '0'), }) self._test_qrels('lotte/writing/dev/search', count=1287, items={ 0: TrecQrel('0', '9032', 1, '0'), 9: TrecQrel('4', '29678', 1, '0'), 1286: TrecQrel('496', '246118', 1, '0'), }) self._test_qrels('lotte/writing/test/forum', count=12906, items={ 0: TrecQrel('0', '14298', 1, '0'), 9: TrecQrel('0', '14356', 1, '0'), 12905: TrecQrel('1999', '179657', 1, '0'), }) self._test_qrels('lotte/writing/test/search', count=3546, items={ 0: TrecQrel('0', '84481', 1, '0'), 9: TrecQrel('4', '13250', 1, '0'), 3545: TrecQrel('1070', '19407', 1, '0'), }) self._test_qrels('lotte/pooled/dev/forum', count=68685, items={ 0: TrecQrel('0', '113455', 1, '0'), 9: TrecQrel('0', '113612', 1, '0'), 68684: TrecQrel('10096', '2389213', 1, '0'), }) self._test_qrels('lotte/pooled/dev/search', count=8573, items={ 0: TrecQrel('0', '9032', 1, '0'), 9: TrecQrel('4', '29678', 1, '0'), 8572: TrecQrel('2930', '2415736', 1, '0'), }) self._test_qrels('lotte/pooled/test/forum', count=61536, items={ 0: TrecQrel('0', '14298', 1, '0'), 9: TrecQrel('0', '14356', 1, '0'), 61535: TrecQrel('10024', '2729365', 1, '0'), }) self._test_qrels('lotte/pooled/test/search', count=11124, items={ 0: TrecQrel('0', '84481', 1, '0'), 9: TrecQrel('4', '13250', 1, '0'), 11123: TrecQrel('3868', '2812254', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/medline.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.medline import MedlineDoc, TrecGenomicsQuery, TrecPmQuery, TrecPm2017Query from ir_datasets.formats import GenericQuery, TrecQrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestMedline(DatasetIntegrationTest): def test_medline_docs(self): 
self._test_docs('medline/2004', count=3672808, items={ 0: MedlineDoc('10605436', 'Concerning the localization of steroids in centrioles and basal bodies by immunofluorescence.', re.compile('^Specific steroid antibodies, by the immunofluorescence technique, regularly reveal fluorescent centr.{374}y affect cell growth and differentiation in some way different from the two\\-step receptor mechanism\\.$', flags=48)), 9: MedlineDoc('10605445', 'Intracellular divalent cation release in pancreatic acinar cells during stimulus-secretion coupling. II. Subcellular localization of the fluorescent probe chlorotetracycline.', re.compile('^Subcellular distribution of the divalent cation\\-sensitive probe chlorotetracycline \\(CTC\\) was observe.{1665}ase of calcium from either mitochondria or another organelle that requires ATP to sequester calcium\\.$', flags=48)), 3672807: MedlineDoc('9864604', '[Is the treatment of left ventricular systolic dysfunction different according to the etiology?]', re.compile('^Cardiac failure is the terminal stage of evolution, the finality of many valvular, vascular, myocard.{836}negligible: the medications are based on the results of large scale, controlled, therapeutic trials\\.$', flags=48)), }) self._test_docs('medline/2017', count=26740025, items={ 0: MedlineDoc('AACR_2014-3448', re.compile('^Fibroblast growth factor receptor is expressed as a constitutively active receptor tyrosine kinase i.{2}chronic lymphocytic leukemia B cells and exists in an active complex with Axl: Dual targeting in CLL$', flags=48), re.compile('^B cell chronic lymphocytic leukemia \\(CLL\\) is an incurable disease and represents a significant healt.{2636}gation as a way to develop a more effective and efficient therapeutic intervention for CLL patients\\.$', flags=48)), 9: MedlineDoc('ASCO_189985-199', 'Comprehensive molecular and immune profiling of non-small cell lung cancer and matched distant metastases to suggest distinct molecular mechanisms underlying 
metastasis.', re.compile('^Background: Despite complete resection, many non\\-small cell lung cancer \\(NSCLC\\) patients still devel.{1949}nts\\. Furthermore, immune suppression may be a characteristic of cancer cells of metastatic capacity\\.$', flags=48)), 26740024: MedlineDoc('27868941', 'Vote of no confidence in trust board.', 'PROPOSALS TO close two medical wards have led nurses at Llandough Hospital, South Glamorgan, to pass a vote of no confidence in their trust board.'), }) def test_medline_queries(self): self._test_queries('medline/2004/trec-genomics-2004', count=50, items={ 0: TrecGenomicsQuery('1', 'Ferroportin-1 in humans', 'Find articles about Ferroportin-1, an iron transporter, in humans.', 'Ferroportin1 (also known as SLC40A1; Ferroportin 1; FPN1; HFE4; IREG1; Iron regulated gene 1; Iron-regulated transporter 1; MTP1; SLC11A3; and Solute carrier family 11 (proton-coupled divalent metal ion transporters), member 3) may play a role in iron transport.'), 9: TrecGenomicsQuery('10', 'NEIL1', 'Find articles about the role of NEIL1 in repair of DNA.', 'Interested in role that NEIL1 plays in DNA repair.'), 49: TrecGenomicsQuery('50', 'Low temperature protein expression in E. coli', 'Find research on improving protein expressions at low temperature in Escherichia coli bacteria.', 'The researcher is not satisfied with the yield of expressing a protein in E. coli when grown at low temperature and is searching for a better solution. 
The researcher is willing to try a different organism and/or method.'), }) self._test_queries('medline/2004/trec-genomics-2005', count=50, items={ 0: GenericQuery('100', 'Describe the procedure or methods for how to "open up" a cell through a process called "electroporation."'), 9: GenericQuery('109', "Describe the procedure or methods for fluorogenic 5'-nuclease assay."), 49: GenericQuery('149', 'Provide information about Mutations of the alpha 4-GABAA receptor and its/their impact on behavior.'), }) self._test_queries('medline/2017/trec-pm-2017', count=30, items={ 0: TrecPm2017Query('1', 'Liposarcoma', 'CDK4 Amplification', '38-year-old male', 'GERD'), 9: TrecPm2017Query('10', 'Lung adenocarcinoma', 'KRAS (G12C)', '61-year-old female', 'Hypertension, Hypercholesterolemia'), 29: TrecPm2017Query('30', 'Pancreatic adenocarcinoma', 'RB1, TP53, KRAS', '57-year-old female', 'None'), }) self._test_queries('medline/2017/trec-pm-2018', count=50, items={ 0: TrecPmQuery('1', 'melanoma', 'BRAF (V600E)', '64-year-old male'), 9: TrecPmQuery('10', 'melanoma', 'KIT (L576P)', '65-year-old female'), 49: TrecPmQuery('50', 'acute myeloid leukemia', 'FLT3', '13-year-old male'), }) def test_medline_qrels(self): self._test_qrels('medline/2004/trec-genomics-2004', count=8268, items={ 0: TrecQrel('1', '10077651', 2, '0'), 9: TrecQrel('1', '10449402', 2, '0'), 8267: TrecQrel('50', '9951698', 1, '0'), }) self._test_qrels('medline/2004/trec-genomics-2005', count=39958, items={ 0: TrecQrel('100', '10023709', 0, '0'), 9: TrecQrel('100', '10138840', 0, '0'), 39957: TrecQrel('149', '9989364', 0, '0'), }) self._test_qrels('medline/2017/trec-pm-2017', count=22642, items={ 0: TrecQrel('1', '10065107', 0, '0'), 9: TrecQrel('1', '10755400', 2, '0'), 22641: TrecQrel('30', 'ASCO_88462-115', 2, '0'), }) self._test_qrels('medline/2017/trec-pm-2018', count=22429, items={ 0: TrecQrel('1', '1007359', 0, '0'), 9: TrecQrel('1', '13188512', 0, '0'), 22428: TrecQrel('50', 'ASCO_35470-65', 2, '0'), }) if 
__name__ == '__main__': unittest.main() ================================================ FILE: test/integration/miracl.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.miracl import MiraclDoc from ir_datasets.formats import GenericQuery, TrecQrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestMiracl(DatasetIntegrationTest): def test_docs(self): self._test_docs('miracl/ar', count=2061414, items={ 0: MiraclDoc('7#0', 'ماء', re.compile('^الماء مادةٌ شفافةٌ عديمة اللون والرائحة، وهو المكوّن الأساسي للجداول والبحيرات والبحار والمحيطات وكذ.{230} يكون الماء سائلاً، ولكنّ حالاته الأخرى شائعة الوجود أيضاً؛ وهي حالة الجليد الصلبة والبخار الغازيّة\\.$', flags=48)), 9: MiraclDoc('7#9', 'ماء', 'كما يوجد الجليد على شكل صفائح جليديّة في الأرض وفي الفوّهات والصخور البركانيّة في القمر، وفي أقمار أخرى مثل قمر شارون.'), 2061413: MiraclDoc('5272574#0', 'شاهيه', re.compile('^الشاهيه او الشوهيه هو مرادف للكافيه وهو مكان لتقديم الشاي بدلاً من القهوة \\. 
الشاهيه غير منتشرة عالمي.{208}ث العرش و والي الممالك السبعة ايمن بن فهد اول من وضع لبنة لهذا النوع من المحلات و من مهد الطريق لهم\\.$', flags=48)), }) self._test_docs('miracl/bn', count=297265, items={ 0: MiraclDoc('608#0', 'বাংলা ভাষা', re.compile('^বাংলা ভাষা \\(; \\) দক্ষিণ এশিয়ার বঙ্গ অঞ্চলের মানুষের স্থানীয় ভাষা, এই অঞ্চলটি বর্তমানে রাজনৈতিকভাবে .{410}, এবং ভারতের জাতীয় স্তোত্র এই ভাষাতেই রচিত এবং তা থেকেই দক্ষিণ এশিয়ায় এই ভাষার গুরুত্ব বোঝা যায়।$', flags=48)), 9: MiraclDoc('608#9', 'বাংলা ভাষা', re.compile('^বাংলাদেশ রাষ্ট্রের রাষ্ট্রভাষা ও সরকারি ভাষা হলো বাংলা। এছাড়াও ভারতীয় সংবিধান দ্বারা স্বীকৃত ২৩টি .{481} বাহিনীর ৫,৩০০ বাংলাদেশি সৈনিকের সেবার স্বীকৃতিস্বরূপ বাংলা ভাষাকে সরকারি ভাষার মর্যাদা প্রদান করেন।$', flags=48)), 297264: MiraclDoc('719190#0', 'কোনরাড এলস্ট', re.compile('^কোনরাড এলস্ট \\(জন্ম 7 অগাস্ট 1959\\) একজন বেলজীয় প্রাচ্যবিদ এবং ভারতবিদ যিনি তুলনামূলক ধর্মতত্ত্ব, হিন.{454}ভূত। এলস্ট হিন্দু জাতীয়তাবাদের বিষয়ে ডক্টরেট করেছেন, এবং হিন্দু জাতীয়তাবাদ আন্দোলনের সমর্থন করেন।$', flags=48)), }) self._test_docs('miracl/de', count=15866222, items={ 0: MiraclDoc('1#0', 'Alan Smithee', re.compile('^Alan Smithee steht als Pseudonym für einen fiktiven Regisseur, der Filme verantwortet, bei denen der.{93}on 1968 bis 2000 wurde es von der Directors Guild of America \\(DGA\\) für solche Situationen empfohlen\\.$', flags=48)), 9: MiraclDoc('1#9', 'Alan Smithee', 'Zu den Drehbuchautoren, die das Pseudonym benutzt haben, gehören Sam Raimi und Ivan Raimi, die das Drehbuch zu "Die total beknackte Nuß" als "Alan Smithee, Jr." und "Alan Smithee, Sr." schrieben.'), 15866221: MiraclDoc('12150081#3', 'Marcel Lotka', re.compile('^Nach seinem Vertragsende bei Hertha BSC kehrt Lotka zur Saison 2022/23 in das Ruhrgebiet zurück und .{7}t zur zweiten Mannschaft von Borussia Dortmund\\. Er unterschrieb einen Vertrag bis zum 30\\. 
Juni 2024\\.$', flags=48)), }) self._test_docs('miracl/en', count=32893221, items={ 0: MiraclDoc('12#0', 'Anarchism', re.compile('^Anarchism is a political philosophy that advocates self\\-governed societies based on voluntary, coope.{285}lds capitalism, the state, and representative democracy to be undesirable, unnecessary, and harmful\\.$', flags=48)), 9: MiraclDoc('12#9', 'Anarchism', re.compile('^The French Pierre\\-Joseph Proudhon is regarded as the first self\\-proclaimed anarchist, a label he ado.{1030}ractices inspired subsequent anarchists and made him one of the leading social thinkers of his time\\.$', flags=48)), 32893220: MiraclDoc('59828278#1', 'Jacqueline Casalegno', 'Casalegno died on 23 January 2019 at the age of 93.'), }) self._test_docs('miracl/es', count=10373953, items={ 0: MiraclDoc('7#0', 'Andorra', re.compile('^Andorra, oficialmente Principado de Andorra \\(\\), es un micro\\-Estado soberano sin litoral ubicado en e.{573}a —con los departamentos de Ariège y Pirineos Orientales \\(Occitania\\)—\\. Pertenece a la Europa latina\\.$', flags=48)), 9: MiraclDoc('7#9', 'Andorra', re.compile('^Una hipótesis relaciona a la palabra Andorra con "andurrial", palabra cuyo origen sigue siendo incie.{91}rrales", o bien, según Francisco Martínez Marina, con el árabe "al darra"h, que significa "boscosa"\\.$', flags=48)), 10373952: MiraclDoc('10172220#4', 'Diario de SSR', re.compile('^A día de hoy ella gana su propio dinero, y con tan solo 16 años colabora con algunas fundaciones con.{11}er y niños sin fronteras\\. También es apadrinadora de chimpancés del país natal de su madre, Senegal\\.$', flags=48)), }) self._test_docs('miracl/fa', count=2207172, items={ 0: MiraclDoc('594#0', 'ویکی\u200cپدیا', re.compile('^ویکی\u200cپدیا \\(کوته\u200cنوشت به\u200cصورت «وپ» و «WP»\\) یک دانشنامه برخط چندزبانه مبتنی بر وب با محتوای آزاد و همک.{277}ست\\. 
هدف ویکی\u200cپدیا آفرینش و انتشار جهانی یک دانشنامه با محتوای آزاد به تمامی زبان\u200cهای زندهٔ دنیا است\\.$', flags=48)), 9: MiraclDoc('594#9', 'ویکی\u200cپدیا', re.compile('^کاربران اسپانیایی در فوریه ۲۰۰۲ از ترس تبلیغات تجاری و نبود کنترل در یک ویکی\u200cپدیای انگلیسی\u200cمحور از ".{208}ص به جاهای دیگری نظیر ویکی\u200cاینفو انتقال داده شده\u200cاند\\. در ویکی\u200cاینفو خبری از «دیدگاه بی\u200cطرفانه» نیست\\.$', flags=48)), 2207171: MiraclDoc('5934583#0', 'ادوکسودین', 'ادوکسودین () یک داروی ضد ویروسی است. این دارو یک آنالوگ نوکلئوزیدی از تیمیدین است، این دارو در برابر ویروس هرپس سیمپلکس اثربخشی نشان داده است.'), }) self._test_docs('miracl/fi', count=1883509, items={ 0: MiraclDoc('1#0', 'Amsterdam', re.compile('^Amsterdam on Alankomaiden pääkaupunki\\. Amsterdam on väkiluvultaan Alankomaiden suurin kaupunki, huht.{399} pääkaupunki, sijaitsevat niin kuningashuone, hallitus, parlamentti kuin korkein oikeuskin Haagissa\\.$', flags=48)), 9: MiraclDoc('1#9', 'Amsterdam', re.compile('^Amsterdamia johtaa muiden Alankomaiden kuntien tapaan valtuusto\\. Amsterdamin kaupunginvaltuustoon va.{281}ginvaltuuston että raatimieskollegion puheenjohtaja, mutta hänellä ei ole äänioikeutta valtuustossa\\.$', flags=48)), 1883508: MiraclDoc('1494441#5', 'Suomen urheilu 1990', 'Vuoden valmentajaksi valittiin Päivi Alafrantin valmentaja Eino Maksimainen.'), }) self._test_docs('miracl/fr', count=14636953, items={ 0: MiraclDoc('3#0', 'Antoine Meillet', 'Paul Jules Antoine Meillet, né le à Moulins (Allier) et mort le à Châteaumeillant (Cher), est le principal linguiste français des premières décennies du . Il est aussi philologue.'), 9: MiraclDoc('3#9', 'Antoine Meillet', re.compile("^Selon le linguiste allemand Walter Porzig, Meillet est un « grand précurseur »\\. 
Il montre, par exemp.{26}s indo\\-européens, les groupes indo\\-européens sont le résultat historique d'une variation diatopique\\.$", flags=48)), 14636952: MiraclDoc('14821800#8', 'Exploitation aurifère au Tchad', re.compile("^La dangerosité de la prospection est telle que les employeurs recourent à la torture pour forcer les.{107}rs recrutés vivent en outre une situation d'esclavage, sans paie, avec insuffisamment de nourriture\\.$", flags=48)), }) self._test_docs('miracl/hi', count=506264, items={ 0: MiraclDoc('10#0', 'हम होंगे कामयाब', re.compile('^हम होंगे कामयाब \\( का गिरिजा कुमार माथुर द्वारा किया गया हिंदी भावानुवाद\\) एक प्रतिरोध गीत है। यह गीत .{144}ाता है, जो चार्ल्स अल्बर्ट टिंडले द्वारा गाया गया था और जिसे 1900 में पहली बार प्रकाशित किया गया था।$', flags=48)), 9: MiraclDoc('14#7', 'दैनिक पूजा', 'अगर सम्भव हो तो गणेश के 108 नाम जपें :'), 506263: MiraclDoc('1355429#0', 'चिर्रावूरु यज्ञेश्वर चिन्तामणि', re.compile('^चिर्रावूरु यज्ञेश्वर चिन्तामणि \\(10 अप्रैल 1880 – 1 जुलाई 1941\\) भारत के एक सम्पादक, पत्रकार, उदार राज.{16}। उनका जन्म युगादि \\(तेलुगु नववर्ष दिवस\\) को विजयनगरम् में हुआ था जो वर्तमान में आन्ध्र प्रदेश में है।$', flags=48)), }) self._test_docs('miracl/id', count=1446315, items={ 0: MiraclDoc('1#0', 'Asam deoksiribonukleat', re.compile('^Asam deoksiribonukleat, lebih dikenal dengan singkatan DNA \\(bahasa Inggris: "deoxyribonucleic acid"\\).{1083}n G\\), ikatan hidrogen mengikat basa\\-basa dari kedua unting polinukleotida membentuk DNA unting ganda$', flags=48)), 9: MiraclDoc('1#9', 'Asam deoksiribonukleat', re.compile('^Nukleobasa diklasifikasikan ke dalam dua jenis: purina \\(A dan G\\) yang berupa fusi senyawa heteroling.{371}atan yang telah disintesis untuk mengkaji sifat\\-sifat asam nukleat dan digunakan dalam bioteknologi\\.$', flags=48)), 1446314: MiraclDoc('2733009#0', 'Poggio a Caiano', re.compile('^Poggio a Caiano merupakan sebuah munisipalitas di provinsi Prato, Toskana \\(Italia\\)\\. 
Di wilayah kotam.{10}erdapat salah satu vila Medici yang disebut Villa di Poggio a Caiano yang mendominasi kota tersebut\\.$', flags=48)), }) self._test_docs('miracl/ja', count=6953614, items={ 0: MiraclDoc('5#0', 'アンパサンド', 'アンパサンド (&、英語名:) とは並立助詞「…と…」を意味する記号である。ラテン語の の合字で、Trebuchet MSフォントでは、と表示され "et" の合字であることが容易にわかる。ampersa、すなわち "and per se and"、その意味は"and [the symbol which] by itself [is] and"である。'), 9: MiraclDoc('10#0', '言語', 'この記事では言語(げんご)、特に自然言語について述べる。'), 6953613: MiraclDoc('3899088#4', 'DJ SAAT', '2017年 音楽プロデューサーとして始動。'), }) self._test_docs('miracl/ko', count=1486752, items={ 0: MiraclDoc('5#0', '지미 카터', '제임스 얼 "지미" 카터 주니어(, 1924년 10월 1일 ~ )는 민주당 출신 미국 39번째 대통령 (1977년 ~ 1981년)이다.'), 9: MiraclDoc('5#9', '지미 카터', '퇴임 이후 민간 자원을 적극 활용한 비영리 기구인 카터 재단을 설립한 뒤 민주주의 실현을 위해 제 3세계의 선거 감시 활동 및 기니 벌레에 의한 드라쿤쿠르스 질병 방재를 위해 힘썼다. 미국의 빈곤층 지원 활동, 사랑의 집짓기 운동, 국제 분쟁 중재 등의 활동도 했다.'), 1486751: MiraclDoc('2431764#0', '갤러거', '갤러거의 다른 뜻은 다음과 같다.'), }) self._test_docs('miracl/ru', count=9543918, items={ 0: MiraclDoc('7#0', 'Литва', 'Литва́ (), официальное название\xa0— Лито́вская Респу́блика ()\xa0— государство, расположенное в северо-восточной части Европы. Столица страны\xa0— Вильнюс.'), 9: MiraclDoc('7#9', 'Литва', re.compile('^Традиционно считается, что этническая основа Литвы сформирована носителями археологочической культур.{94}ной Литвы и Северо\\-Западной Белоруссии\\. Около VII века н\\.\xa0э\\. 
литовский язык отделился от латышского\\.$', flags=48)), 9543917: MiraclDoc('7761282#4', 'Грин, Роберт Фрэнсис', 'Скончался Роберт Грин 5 октября 1946 года.'), }) self._test_docs('miracl/sw', count=131924, items={ 0: MiraclDoc('2#0', 'Akiolojia', re.compile('^Akiolojia \\(kutoka Kiyunani αρχαίος = "zamani" na λόγος = "neno, usemi"\\) ni somo linalohusu mabaki ya.{94}kwa kuchimba ardhi na kutafuta mabaki ya majengo, makaburi, silaha, vifaa, vyombo na mifupa ya watu\\.$', flags=48)), 9: MiraclDoc('10#2', 'Daktari', '2) Kwa kufuata tabia za lugha nyingine "daktari" hutumiwa pia kama jina la heshima kwa mtu aliyepata shahada ya uzamivu au "PhD" ambayo ni shahada ya juu kabisa.'), 131923: MiraclDoc('108712#3', 'Nafanikiwa Inspiration', 'Instagram: @nafanikiwa_inspiration'), }) self._test_docs('miracl/te', count=518079, items={ 0: MiraclDoc('1#0', 'మొదటి పేజీ', '__NOEDITSECTION__'), 9: MiraclDoc('786#8', 'గుంటూరు జిల్లా', re.compile('^గుంటూరు జిల్లా సగటున 33 మీటర్లు ఎత్తులో ఉంది\\. చాలవరకు సమతల ప్రదేశం\\. కొన్ని కొండలు కూడా ఉన్నాయి\\. కృష్.{408}ో ఎత్తిపోతల అనబడే జలపాతం నల్లమలై కొండలపై చంద్రవంక నదిపై ఉంది\\. 
దీనిలో 21మీ ఎత్తునుండి నీరు పారుతుంది\\.$', flags=48)), 518078: MiraclDoc('273584#1', 'లక్ష్మణ్\u200cచాందా మండలం', '2011 భారత జనగణన గణాంకాల ప్రకారం - మొత్తం 34,739 - పురుషులు 17,515 - స్త్రీలు 19,224'), }) self._test_docs('miracl/th', count=542166, items={ 0: MiraclDoc('1#0', 'หน้าหลัก', 'วิกิพีเดียดำเนินการโดยมูลนิธิวิกิมีเดีย องค์กรไม่แสวงผลกำไร ผู้ดำเนินการอีกหลาย ได้แก่'), 9: MiraclDoc('545#7', 'ดาราศาสตร์', re.compile('^เมื่อสังคมมีวิวัฒนาการขึ้นในดินแดนต่าง ๆ การสังเกตการณ์ทางดาราศาสตร์ก็ซับซ้อนมากขึ้น โดยเฉพาะอย่างยิ.{553}ดาวต่าง ๆ เคลื่อนที่ไปโดยรอบ แนวคิดนี้เรียกว่า แบบจำลองแบบโลกเป็นศูนย์กลางจักรวาล \\(geocentric model\\)$', flags=48)), 542165: MiraclDoc('1001591#3', 'มาเลยาเซมเลีย', "การต่อสู้ยังเป็นส่วนหนึ่งของหนังสือเล่มแรกของ Brezhnev's trilogy ซึ่งเป็นการที่ทำให้เกินจริงต่อบทบาทของเลโอนิด เบรจเนฟ ในช่วงมหาสงครามของผู้รักชาติ"), }) self._test_docs('miracl/yo', count=49043, items={ 0: MiraclDoc('598#0', 'A', 'aa ab ac ad ae af ag ah ai aj ak al am an ao ap aq ar as at au av aw ax ay az'), 9: MiraclDoc('604#0', 'G', 'ga gb gc gd ge gf gg gh gi gj gk gl gm gn go gp gq gr gs gt gu gv gw gx gy gz'), 49042: MiraclDoc('71570#11', 'Federal University of Technology', '• Uche Jombo Rodriguez'), }) self._test_docs('miracl/zh', count=4934368, items={ 0: MiraclDoc('13#0', '数学', '数学,是研究數量、结构以及空间等概念及其变化的一門学科,从某种角度看屬於形式科學的一種。數學利用抽象化和邏輯推理,從計數、計算、量度、對物體形狀及運動的觀察發展而成。數學家們拓展這些概念,以公式化新的猜想,以及從選定的公理及定義出發,嚴謹地推導出一些定理。'), 9: MiraclDoc('13#9', '数学', '到了16世纪,算术、初等代数以及三角学等初等数学已大体完备。17世纪变量概念的产生使人们开始研究变化中的量与量的互相关系和图形间的互相变换,微积分的概念也在此時形成。随着數學轉向形式化,为研究数学基础而产生的集合论和数理逻辑等也开始发展。数学的重心从求解实际问题转变到对一般形式上的思考。'), 4934367: MiraclDoc('7887526#10', '嘉寶潭之役', '三日之期已到,陳耀見部署已定,收下紅旗之後,仍豎立白旗,叛軍黨怒不可遏,攻勢洶洶而來,大有剿滅陳耀部隊的態勢,唯剛交戰沒多久,柑仔井林家、湳仔阮家收拾餘兵,再度集結而來,高舉白旗夾殺叛軍部隊。首領陳耀親上戰場,奮勇殺敵,陳姓家丁亦不落人後。待到自鹿港商船運來的四尊大炮,炮擊敵軍,賊黨鑒於頹勢已現,倉促收兵,嘉寶潭之危遂解。'), }) def test_queries(self): self._test_queries('miracl/ar/train', count=3495, items={ 0: GenericQuery('1', 'ما هي المسألة الشرقية ؟'), 9: GenericQuery('12', 'ماهو الأمن البشري ؟'), 
3494: GenericQuery('15481', 'من قاد الثورة العرابية في مصر؟'), }) self._test_queries('miracl/ar/dev', count=2896, items={ 0: GenericQuery('0', 'هل عدم القيام بجهد جسماني ممكن ان يسبب الأرق؟'), 9: GenericQuery('60', 'من هو مؤلف سلسلة حرب النجوم؟'), 2895: GenericQuery('15511', 'من ابتكر المثلجات؟'), }) self._test_queries('miracl/ar/test-a', count=936, items={ 0: GenericQuery('15513', 'كم عدد مرات فوز الأوروغواي ببطولة كاس العالم لكرو القدم؟'), 9: GenericQuery('15523', 'من رئيس ألمانيا النازية في الحرب العالمية الثانية؟'), 935: GenericQuery('16594', 'ماهى أول مؤلفة لي ج. ك. رولينغ ؟'), }) self._test_queries('miracl/ar/test-b', count=1405, items={ 0: GenericQuery('1002902#0', 'ما هي قوة انتشار الدمار الذي تحدثه القنبلة النووية؟'), 9: GenericQuery('1034420#0', 'أين تقع دولة تونس في أفريقيا ؟'), 1404: GenericQuery('999091#0', 'من هو الحاكم في السعوديه؟'), }) self._test_queries('miracl/bn/train', count=1631, items={ 0: GenericQuery('0', 'চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?'), 9: GenericQuery('12', 'বাংলাদেশের অষ্টম রাষ্ট্রপতি জিয়াউর রহমানের বাবার নাম কী ?'), 1630: GenericQuery('2151', 'জামাতি ইসলাম দলটির প্রতিষ্ঠাতা কে ?'), }) self._test_queries('miracl/bn/dev', count=411, items={ 0: GenericQuery('4', 'ইংরেজ আন্তর্জাতিক ক্রিকেট তারকা জর্জ গিবসন ম্যাকাউলি কি একজন ডানহাতি ব্যাটসম্যান ছিলেন ?'), 9: GenericQuery('63', 'বাংলাদেশের অষ্টম রাষ্ট্রপতি জিয়াউর রহমান কবে জন্মগ্রহণ করেন ?'), 410: GenericQuery('2152', 'বাংলা ব্যাকরণ মতে বিশেষণ কয় প্রকার ?'), }) self._test_queries('miracl/bn/test-a', count=102, items={ 0: GenericQuery('2153', 'পশ্চিম ভারতের মহারাষ্ট্র রাজ্যের মুম্বাই শহরে নির্মিত গেটওয়ে অব ইন্ডিয়া স্থাপত্যটির ভিত্তিপ্রস্তর স্থাপন করেন কে ?'), 9: GenericQuery('2162', 'মাওলানা ভাসানী বিজ্ঞান ও প্রযুক্তি বিশ্ববিদ্যালয়টির মোট আয়তন কত ?'), 101: GenericQuery('2263', 'চট্টগ্রাম কি কখনো ধর্মপালের অধীনে ছিল ?'), }) self._test_queries('miracl/bn/test-b', count=1130, items={ 0: GenericQuery('597183#0', 'বর্তমান সালে বাংলাদেশের জনসংখ্যা কত?'), 9: GenericQuery('572931#0', 
'এরিস্টটল কোন দেশের দার্শনিক ছিলেন?'), 1129: GenericQuery('78447#0', 'কোন সাল থেকে কোন সাল পর্যন্ত ব্রিটিশ রাজ চলেছিল?'), }) self._test_queries('miracl/de/dev', count=305, items={ 0: GenericQuery('81674#0', 'Wo ist das Gebiet der Irokesen-Indianer in Kanada?'), 9: GenericQuery('7484600#0', 'Welcher Bahnhof ist der älteste in den USA?'), 304: GenericQuery('3223301#0', 'Wie viele Sprachen gibt es insgesamt auf der Welt?'), }) self._test_queries('miracl/de/test-b', count=712, items={ 0: GenericQuery('11531728#0', 'Wie lange muss man ein Polizist sein bevor man ein Detektiv werden kann in Deutschland?'), 9: GenericQuery('11700929#0', 'Welche Religion ist am größten in Japan?'), 711: GenericQuery('6333890#0', 'In welcher Sprache kommunizierten russische und norwegische Fischer miteinander?'), }) self._test_queries('miracl/en/train', count=2863, items={ 0: GenericQuery('1', 'When was quantum field theory developed?'), 9: GenericQuery('13', 'When were bluebonnets named the state flower of Texas?'), 2862: GenericQuery('4446', 'What is the population of Mahwah, NJ?'), }) self._test_queries('miracl/en/dev', count=799, items={ 0: GenericQuery('0', 'Is Creole a pidgin of French?'), 9: GenericQuery('47', 'When did Aristagoras become leader of Miletus?'), 798: GenericQuery('4443', 'Who was costume designer for the first Star Wars movie?'), }) self._test_queries('miracl/en/test-a', count=734, items={ 0: GenericQuery('4447', 'Do zebra finches have stripes?'), 9: GenericQuery('4456', 'When was Quezon City founded?'), 733: GenericQuery('5193', 'When did the Bundaberg Central State School become a heritage-listed site?'), }) self._test_queries('miracl/en/test-b', count=1790, items={ 0: GenericQuery('23819476#0', "Which summer dessert is often eaten during Wimbledon's tennis matches?"), 9: GenericQuery('1765998#0', 'John F. 
Kennedy was assassinated in which city?'), 1789: GenericQuery('56426977#0', 'When did Théodore Chabert enlisted in Frencg Royal Army?'), }) self._test_queries('miracl/es/train', count=2162, items={ 0: GenericQuery('1769696#0', '¿Cuáles son las principales plantas fanerógamas?'), 9: GenericQuery('5810974#0', '¿Cómo calcular el vértice de un triángulo?'), 2161: GenericQuery('508901#0', '¿Qué trata de explicar los conceptos de la filosofía?'), }) self._test_queries('miracl/es/dev', count=648, items={ 0: GenericQuery('1177652#0', '¿Qué es Anónimo de Rávena?'), 9: GenericQuery('188191#0', '¿Es Rumanía una monarquía?'), 647: GenericQuery('7523671#0', '¿En qué categoría fue campeón el piloto Ricardo Tormo?'), }) self._test_queries('miracl/es/test-b', count=1515, items={ 0: GenericQuery('8541928#0', '¿En qué país se encuentra Santa Cruz de la Sierra?'), 9: GenericQuery('8324682#0', '¿En qué continente está Azerbaiyán?'), 1514: GenericQuery('2534161#0', '¿En qué países se publica la revista ¡HOLA!?'), }) self._test_queries('miracl/fa/train', count=2107, items={ 0: GenericQuery('5682911#0', 'خواننده سنتی در چه دستگاهی آواز می خواتد؟'), 9: GenericQuery('4334823#0', 'کاربرد برنامه نویسی در کامپیوتر چیست؟'), 2106: GenericQuery('2528634#0', 'گیاه اسطخودوس از چه گونه ای است؟'), }) self._test_queries('miracl/fa/dev', count=632, items={ 0: GenericQuery('5428682#0', 'سیاستمداران آمریکایی را چه کسی انتخاب می کند؟'), 9: GenericQuery('4809078#0', 'از فناوری\u200cهای فیلمبرداری در کجا استفاده می شود؟'), 631: GenericQuery('5174269#0', 'سینمای میانماربه چه فیلم هایی شهرت دارد؟'), }) self._test_queries('miracl/fa/test-b', count=1476, items={ 0: GenericQuery('239796#0', 'کاربرد آمار در ریاضیات چیست؟'), 9: GenericQuery('5090616#0', 'چرا سن قانونی برای اجرای حکم مهم است؟'), 1475: GenericQuery('58872#0', 'از فتوشاپ در چه مواردی استفاده میشود؟'), }) self._test_queries('miracl/fi/train', count=2897, items={ 0: GenericQuery('0', 'Milloin Charles Fort syntyi?'), 9: GenericQuery('10', 'Montako 
kuuta Saturnuksella on?'), 2896: GenericQuery('8271', 'Montako kaupunkia Suomessa on?'), }) self._test_queries('miracl/fi/dev', count=1271, items={ 0: GenericQuery('1', 'Mitä on altruismi?'), 9: GenericQuery('52', 'Mistä nimitys markka tulee?'), 1270: GenericQuery('8305', 'Onko Bowser Marion arkkivihollinen?'), }) self._test_queries('miracl/fi/test-a', count=1060, items={ 0: GenericQuery('8318', 'Milloin Kokemäki on perustettu?'), 9: GenericQuery('8327', 'Minä vuonna Charles Darwin julkaisi kuuluisan teoriansa lajien synnystä?'), 1059: GenericQuery('9571', 'Milloin käytiin Persianlahden sota?'), }) self._test_queries('miracl/fi/test-b', count=711, items={ 0: GenericQuery('870432#0', 'Mitä ovat tilannekomediat?'), 9: GenericQuery('347723#0', 'Mitä kieltä käytettiin tuohi kirjeissä?'), 710: GenericQuery('3896#0', 'Mitä tarkoittaa käyttää strategista silmää?'), }) self._test_queries('miracl/fr/train', count=1143, items={ 0: GenericQuery('10756354#0', 'Qui était Princesse Diana?'), 9: GenericQuery('5873198#0', 'Qui est Walt Disney?'), 1142: GenericQuery('8564179#0', 'Quelles sont les caractéristiques d’un vrai dictateur?'), }) self._test_queries('miracl/fr/dev', count=343, items={ 0: GenericQuery('5607241#0', "Qu'est-ce que c'est une famille patricienne?"), 9: GenericQuery('13910718#0', 'Où est situé Chūgoku au Japon?'), 342: GenericQuery('10579479#0', "Combien y a-t-il d'éléments chimiques au total?"), }) self._test_queries('miracl/fr/test-b', count=801, items={ 0: GenericQuery('4716921#0', "Qu'est-ce que c'est un explosif?"), 9: GenericQuery('13260096#0', 'Qui est le président de Brésil?'), 800: GenericQuery('11003060#0', 'Qui est la personne mongole historique la plus célèbre?'), }) self._test_queries('miracl/hi/train', count=1169, items={ 0: GenericQuery('231591#0', 'भारत में कुल कितने राज्य है?'), 9: GenericQuery('5138#0', 'विश्व में सबसे ज़ादा बोली जाने वाली भाषा क्या है ?'), 1168: GenericQuery('1058385#0', 'संयुक्त राष्ट्र ने किस अमेरिकी अभिनेत्री को अपना विशेष 
दूत बनाया है?'), }) self._test_queries('miracl/hi/dev', count=350, items={ 0: GenericQuery('1033752#0', 'कांग्रेस दल का नेता कौन है ?'), 9: GenericQuery('229394#0', 'ईसा मसीह ने किस देश में जन्म लिया था?'), 349: GenericQuery('727434#0', 'हाल ही में पद्म श्री पुरस्कार से सम्मानित किस भोजपुरी गायक का निधन हो गया है?'), }) self._test_queries('miracl/hi/test-b', count=819, items={ 0: GenericQuery('217614#0', 'अमेरिका में कितने राज्य है?'), 9: GenericQuery('1343627#0', 'अमेरिका के सबसे ऊँचे पर्वत का नाम क्या है?'), 818: GenericQuery('575512#0', 'सबसे पहली मोटर गाढ़ी किस देश ने बनाई थी?'), }) self._test_queries('miracl/id/train', count=4071, items={ 0: GenericQuery('5', 'siapakah orang tua John Fitzgerald Kennedy?'), 9: GenericQuery('23', 'Kapan Operasi hari Kiamat terjadi ?'), 4070: GenericQuery('6147', 'Kapan Kaisar Tang Gaozu mulai menjabat ?'), }) self._test_queries('miracl/id/dev', count=960, items={ 0: GenericQuery('3', 'Dimana James Hepburn meninggal?'), 9: GenericQuery('41', 'Kapan perdagangan melalui armada mulai dilakukan oleh bangsa Eropa ?'), 959: GenericQuery('6138', 'Kapan sepeda motor ditemukan ?'), }) self._test_queries('miracl/id/test-a', count=731, items={ 0: GenericQuery('6148', 'Siapakah yang menemuka benua Amerika ?'), 9: GenericQuery('6158', 'Berapa luas kota Blitar?'), 730: GenericQuery('6976', 'Berapakah berat Ikan pari manta yag terbesar?'), }) self._test_queries('miracl/id/test-b', count=611, items={ 0: GenericQuery('2684524#0', 'Gimaba cara membuat atap?'), 9: GenericQuery('1044017#0', 'Bagiamana cara mesin bekerja?'), 610: GenericQuery('1200317#0', 'Pada tahun berapa Amerika Serikat memiliki presiden pertama mereka?'), }) self._test_queries('miracl/ja/train', count=3477, items={ 0: GenericQuery('1', 'サー・マイケル・フィリップ・ジャガーの出身は?'), 9: GenericQuery('19', '桂 銀淑のデビュー曲は何?'), 3476: GenericQuery('4630', 'ブラームスの出身はどこ?'), }) self._test_queries('miracl/ja/dev', count=860, items={ 0: GenericQuery('0', '“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ'), 9: GenericQuery('26', 
'パメラ・コールマン・スミスはいつ生まれた?'), 859: GenericQuery('4629', '彭 徳懐はいつ生まれた?'), }) self._test_queries('miracl/ja/test-a', count=650, items={ 0: GenericQuery('4631', '化学兵器禁止条約はどこで採択された?'), 9: GenericQuery('4640', 'ダニエル・レイ・エインジはどこのプロ野球チームに所属した?'), 649: GenericQuery('5352', 'クイズ番組『アメリカ横断ウルトラクイズ』の出場者はどのようなひとでしたか?'), }) self._test_queries('miracl/ja/test-b', count=1141, items={ 0: GenericQuery('832900#0', 'シカゴの市長は誰ですか?'), 9: GenericQuery('1213346#0', '飛行機を発明したのは誰ですか。'), 1140: GenericQuery('900297#0', '名古屋市長は誰ですか。'), }) self._test_queries('miracl/ko/train', count=868, items={ 0: GenericQuery('1', '로마의 면적은 서울시의 2배인가요?'), 9: GenericQuery('16', '오버워치 출시일은 언제인가요?'), 867: GenericQuery('1597', '오행이라는 것은 목·화·토·금·수의 오원소를 말하나요?'), }) self._test_queries('miracl/ko/dev', count=213, items={ 0: GenericQuery('2', '합성생물학을 연구하는 방식은 탑다운 외 다른 방식은 무엇이 있나요?'), 9: GenericQuery('58', '나폴레옹 1세는 언제 황제로 즉위하나요?'), 212: GenericQuery('1582', '인체에는 몇 개의 세포가 있는가?'), }) self._test_queries('miracl/ko/test-a', count=263, items={ 0: GenericQuery('1602', '유비 출생일은 언제인가요?'), 9: GenericQuery('1615', '매슬로우의 5단계 욕구 중 제 1수준은 생리적 욕구인가요?'), 262: GenericQuery('2016', '광주 비엔날레는 몇년마다 열리나요?'), }) self._test_queries('miracl/ko/test-b', count=1417, items={ 0: GenericQuery('2386699#0', '일본은 어떤 언어를 사용하나요?'), 9: GenericQuery('1229231#0', '어느 나라 소금이 유명한가요?'), 1416: GenericQuery('279678#0', '털 제거는 어떻게 하나?'), }) self._test_queries('miracl/ru/train', count=4683, items={ 0: GenericQuery('1', 'Когда был спущен на воду первый миноносец «Спокойный»?'), 9: GenericQuery('21', 'Какой процент населения Земли ездит на правостороннем движении?'), 4682: GenericQuery('6761', 'Какая площадь Аппенинского полуострова?'), }) self._test_queries('miracl/ru/dev', count=1252, items={ 0: GenericQuery('0', 'Когда начался Кари́бский кризис?'), 9: GenericQuery('33', 'Возле какой реки произошло Мианкальское восстание?'), 1251: GenericQuery('6762', 'Кто убил Зорана Джинджича?'), }) self._test_queries('miracl/ru/test-a', count=911, items={ 0: GenericQuery('6763', 
'Когда появился термин правово́е госуда́рство?'), 9: GenericQuery('6772', 'Какая максимальная скорость ЭД9?'), 910: GenericQuery('7762', 'Когда впервые начали применять компьютерную графику в кинематографе?'), }) self._test_queries('miracl/ru/test-b', count=718, items={ 0: GenericQuery('2514517#0', 'Когда вышел мультфильм “Король Лев”?'), 9: GenericQuery('4130787#0', 'Как юридически оформить наследство?'), 717: GenericQuery('269810#0', 'Сколько лет прожил Лев Гумилёв?'), }) self._test_queries('miracl/sw/train', count=1901, items={ 0: GenericQuery('0', 'Je,nchi gani yenye kuzalisha chungwa kwa wingi zaidi duniani?'), 9: GenericQuery('15', 'Je,nani mwanzilishi wa kampuni ya Visa Inc?'), 1900: GenericQuery('2600', 'Nini maana ya data?'), }) self._test_queries('miracl/sw/dev', count=482, items={ 0: GenericQuery('6', 'Bandari kubwa nchini Kenya iko wapi?'), 9: GenericQuery('38', 'Jina kamili la Pelé ni lipi?'), 481: GenericQuery('2597', 'Paul Schulze alizaliwa mwaka upi?'), }) self._test_queries('miracl/sw/test-a', count=638, items={ 0: GenericQuery('2601', 'Je,Sarah Wayne Callies ana mume?'), 9: GenericQuery('2611', 'Kuna kundinyota ngapi?'), 637: GenericQuery('3270', 'Michael Wamalwa Kijana alifariki akiwa na miaka mingapi?'), }) self._test_queries('miracl/sw/test-b', count=465, items={ 0: GenericQuery('20328#0', 'Dar-Es Salaam ni mji wa nchi gani?'), 9: GenericQuery('39828#0', 'Helikopta ina gurudumu ngapi?'), 464: GenericQuery('89888#0', 'Ni nini hutumiwa kupanga lugha kwa vikundi?'), }) self._test_queries('miracl/te/train', count=3452, items={ 0: GenericQuery('0', 'వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?'), 9: GenericQuery('11', 'తమరాం గ్రామ పిన్ కోడ్ ఏంటి?'), 3451: GenericQuery('4867', 'ప్యూమా సంస్థని ఎప్పుడు స్థాపించారు?'), }) self._test_queries('miracl/te/dev', count=828, items={ 0: GenericQuery('2', 'ఆర్మేనియా దేశంలో మొత్తం జిల్లాలు ఉన్నాయి?'), 9: GenericQuery('50', 'మొట్టమొదటి పజెరో నమూనాను ఎక్కడ ప్రవేశపెట్టారు?'), 827: GenericQuery('4869', 'తమన్నా భాటియా 
నటించిన మొదటి తెలుగు సినిమా ఏది?'), }) self._test_queries('miracl/te/test-a', count=594, items={ 0: GenericQuery('4870', 'మహా సముద్రాలు ఎన్ని ఉన్నాయి?'), 9: GenericQuery('4879', 'నండవ గ్రామ విస్తీర్ణత ఎంత?'), 593: GenericQuery('5516', 'తాడేపల్లి రాఘవ నారాయణ శాస్త్రి ఎక్కడ జన్మించాడు?'), }) self._test_queries('miracl/te/test-b', count=793, items={ 0: GenericQuery('28113#0', 'విశాఖపట్నం జిల్లా ఏ రాష్ట్రంలో ఉంది?'), 9: GenericQuery('27851#0', 'అరకులోయ ఎత్తు ఏ ఎంత?'), 792: GenericQuery('33638#0', 'పాలవంచ నుంచి హైదరాబాద్ కు గల దూరం ఎంత?'), }) self._test_queries('miracl/th/train', count=2972, items={ 0: GenericQuery('0', 'มหาวิทยาลัยมหาสารคาม เปิดสอนกี่สาขาวิชา?'), 9: GenericQuery('16', 'พระเจ้าบรมวงศ์เธอ กรมขุนสุพรรณภาควดี มีพี่น้องกี่คน ?'), 2971: GenericQuery('4130', 'ใครเป็นผู้ก่อตั้ง โรงเรียนสาธิตแห่งมหาวิทยาลัยเกษตรศาสตร์?'), }) self._test_queries('miracl/th/dev', count=733, items={ 0: GenericQuery('4', 'บันทึกเหตุการณ์และเรื่องราวต่าง ๆ ในยุคสามก๊กฉบับแรก ที่มีการบันทึกเป็นลายลักษณ์อักษรเรียกว่าอะไร?'), 9: GenericQuery('51', 'สมาชิกสภาผู้แทนราษฎร มีหน้าที่หลักคืออะไร ?'), 732: GenericQuery('4113', 'กรุงเทพมหานครมีกี่เขต?'), }) self._test_queries('miracl/th/test-a', count=992, items={ 0: GenericQuery('4131', 'อยุธยามีกี่อำเภอ ?'), 9: GenericQuery('4140', 'เชฟกระทะเหล็ก ประเทศไทย ออกอากาศครั้งแรกเมื่อใด ?'), 991: GenericQuery('5321', 'หม่อมราชวงศ์สุขุมพันธุ์ บริพัตร เกิดเมื่อไหร่?'), }) self._test_queries('miracl/th/test-b', count=650, items={ 0: GenericQuery('895001#0', 'วันชาติของประเทศไทยตรงกับวันที่เท่าไหร่'), 9: GenericQuery('781364#0', 'วันวาเลนไทน์อยู่ในเดือนอะไร'), 649: GenericQuery('140245#0', 'ปลานิล มีชื่อวิทยาศาตร์ว่าอย่างไร'), }) self._test_queries('miracl/yo/dev', count=119, items={ 0: GenericQuery('10020#0', 'Odun wo ni wọn ṣe idije Olympiiki akọkọ?'), 9: GenericQuery('13935#0', 'Awon orile ede wo ni won pe ni Balkani?'), 118: GenericQuery('9593#0', 'Kí ni orúkọ bibi akọrin Naira Marley?'), }) self._test_queries('miracl/yo/test-b', count=288, items={ 
# Continuation of test_queries: expected query fixtures for the remaining MIRACL splits (yo, zh).
0: GenericQuery('10494#0', 'ọdun wo ni wọn pa funsho william?'), 9: GenericQuery('12525#0', 'Ki ni oluilu orile ede Hungari?'), 287: GenericQuery('8837#0', 'ẹgbẹ oloṣelu wo ni Boris Johnson wa pẹlu?'), }) self._test_queries('miracl/zh/train', count=1312, items={ 0: GenericQuery('5455987#0', '月球到地球的距离是多少?'), 9: GenericQuery('986470#0', '四川美食有哪些?'), 1311: GenericQuery('1885930#0', '世界上哪些国家信天主教?'), }) self._test_queries('miracl/zh/dev', count=393, items={ 0: GenericQuery('1719936#0', '二战是什么时候开始的?'), 9: GenericQuery('67347#0', '电视剧《继承者们》的女主角是谁?'), 392: GenericQuery('5517620#0', '中国女足有哪些成绩?'), }) self._test_queries('miracl/zh/test-b', count=920, items={ 0: GenericQuery('59350#0', '最后一任法国国王是谁?'), 9: GenericQuery('596861#0', '非洲有多少个国家?'), 919: GenericQuery('3534377#0', '浙江有哪些旅游景点?'), })
# Spot-check the relevance judgments (qrels) for each MIRACL language/split:
# every call asserts the split's total record count plus the exact records at
# positions 0, 9, and last. Argument order here appears to be
# TrecQrel(query_id, doc_id, relevance, iteration) with the constant TREC
# iteration tag 'Q0' -- NOTE(review): confirm field order against
# ir_datasets.formats.TrecQrel.
def test_qrels(self): self._test_qrels('miracl/ar/train', count=25382, items={ 0: TrecQrel('1', '26569#0', 1, 'Q0'), 9: TrecQrel('1', '26569#4', 0, 'Q0'), 25381: TrecQrel('15481', '120888#3', 0, 'Q0'), }) self._test_qrels('miracl/ar/dev', count=29197, items={ 0: TrecQrel('0', '151236#1', 1, 'Q0'), 9: TrecQrel('0', '3645940#4', 0, 'Q0'), 29196: TrecQrel('15511', '1295397#0', 0, 'Q0'), }) self._test_qrels('miracl/bn/train', count=16754, items={ 0: TrecQrel('0', '59523#1', 1, 'Q0'), 9: TrecQrel('0', '296930#46', 0, 'Q0'), 16753: TrecQrel('2151', '483238#12', 0, 'Q0'), }) self._test_qrels('miracl/bn/dev', count=4206, items={ 0: TrecQrel('4', '717942#0', 1, 'Q0'), 9: TrecQrel('4', '713734#0', 0, 'Q0'), 4205: TrecQrel('2152', '14254#2', 0, 'Q0'), }) self._test_qrels('miracl/de/dev', count=3144, items={ 0: TrecQrel('10055536#0', '2653#14', 1, 'Q0'), 9: TrecQrel('10055536#0', '7006449#14', 0, 'Q0'), 3143: TrecQrel('9943409#0', '173950#12', 0, 'Q0'), }) self._test_qrels('miracl/en/train', count=29416, items={ 0: TrecQrel('1', '2078963#10', 1, 'Q0'), 9: TrecQrel('1', '25312#23', 0, 'Q0'), 29415: TrecQrel('4446', '124989#21', 0, 'Q0'), }) self._test_qrels('miracl/en/dev', count=8350, items={ 0: 
TrecQrel('0', '462221#4', 1, 'Q0'), 9: TrecQrel('0', '11236157#2', 0, 'Q0'), 8349: TrecQrel('4443', '8132416#3', 0, 'Q0'), }) self._test_qrels('miracl/es/train', count=21531, items={ 0: TrecQrel('10000698#0', '541735#4', 1, 'Q0'), 9: TrecQrel('10000698#0', '6383339#7', 0, 'Q0'), 21530: TrecQrel('9996500#0', '39277#14', 0, 'Q0'), }) self._test_qrels('miracl/es/dev', count=6443, items={ 0: TrecQrel('10036600#0', '8156619#0', 1, 'Q0'), 9: TrecQrel('10036600#0', '4779177#0', 0, 'Q0'), 6442: TrecQrel('9997781#0', '8493995#1', 0, 'Q0'), }) self._test_qrels('miracl/fa/train', count=21844, items={ 0: TrecQrel('1030103#0', '88822#0', 1, 'Q0'), 9: TrecQrel('1030103#0', '910079#32', 0, 'Q0'), 21843: TrecQrel('998155#0', '521645#12', 0, 'Q0'), }) self._test_qrels('miracl/fa/dev', count=6571, items={ 0: TrecQrel('10163#0', '10162#31', 1, 'Q0'), 9: TrecQrel('10163#0', '5352636#11', 0, 'Q0'), 6570: TrecQrel('981101#0', '1206494#0', 0, 'Q0'), }) self._test_qrels('miracl/fi/train', count=20350, items={ 0: TrecQrel('0', '254561#0', 1, 'Q0'), 9: TrecQrel('2', '34783#36', 0, 'Q0'), 20349: TrecQrel('8271', '16787#1', 0, 'Q0'), }) self._test_qrels('miracl/fi/dev', count=12008, items={ 0: TrecQrel('1', '18044#1', 1, 'Q0'), 9: TrecQrel('1', '18044#8', 0, 'Q0'), 12007: TrecQrel('8305', '122347#4', 0, 'Q0'), }) self._test_qrels('miracl/fr/train', count=11426, items={ 0: TrecQrel('1000816#0', '2797090#3', 1, 'Q0'), 9: TrecQrel('1000816#0', '2797090#1', 0, 'Q0'), 11425: TrecQrel('998997#0', '71269#10', 0, 'Q0'), }) self._test_qrels('miracl/fr/dev', count=3429, items={ 0: TrecQrel('10037625#0', '148020#0', 1, 'Q0'), 9: TrecQrel('10037625#0', '148020#6', 0, 'Q0'), 3428: TrecQrel('9970463#0', '936867#66', 0, 'Q0'), }) self._test_qrels('miracl/hi/train', count=11668, items={ 0: TrecQrel('100443#0', '1051327#15', 1, 'Q0'), 9: TrecQrel('100443#0', '91720#8', 0, 'Q0'), 11667: TrecQrel('997148#0', '10866#1', 0, 'Q0'), }) self._test_qrels('miracl/hi/dev', count=3494, items={ 0: TrecQrel('1004960#0', 
'69380#11', 1, 'Q0'), 9: TrecQrel('1004960#0', '538245#7', 0, 'Q0'), 3493: TrecQrel('9978#0', '221542#9', 0, 'Q0'), }) self._test_qrels('miracl/id/train', count=41358, items={ 0: TrecQrel('5', '12980#4', 1, 'Q0'), 9: TrecQrel('5', '12980#9', 0, 'Q0'), 41357: TrecQrel('6147', '436242#3', 0, 'Q0'), }) self._test_qrels('miracl/id/dev', count=9668, items={ 0: TrecQrel('3', '115796#6', 1, 'Q0'), 9: TrecQrel('3', '279066#2', 0, 'Q0'), 9667: TrecQrel('6138', '41402#28', 0, 'Q0'), }) self._test_qrels('miracl/ja/train', count=34387, items={ 0: TrecQrel('1', '119071#0', 1, 'Q0'), 9: TrecQrel('1', '3317224#0', 0, 'Q0'), 34386: TrecQrel('4630', '3846925#1', 0, 'Q0'), }) self._test_qrels('miracl/ja/dev', count=8354, items={ 0: TrecQrel('0', '2681119#1', 1, 'Q0'), 9: TrecQrel('0', '2681119#4', 0, 'Q0'), 8353: TrecQrel('4629', '197987#3', 0, 'Q0'), }) self._test_qrels('miracl/ko/train', count=12767, items={ 0: TrecQrel('1', '3228#0', 1, 'Q0'), 9: TrecQrel('1', '128525#10', 0, 'Q0'), 12766: TrecQrel('1597', '108520#9', 0, 'Q0'), }) self._test_qrels('miracl/ko/dev', count=3057, items={ 0: TrecQrel('2', '317339#6', 1, 'Q0'), 9: TrecQrel('2', '317339#3', 0, 'Q0'), 3056: TrecQrel('1582', '76034#30', 0, 'Q0'), }) self._test_qrels('miracl/ru/train', count=33921, items={ 0: TrecQrel('1', '2183682#1', 1, 'Q0'), 9: TrecQrel('1', '1937636#2', 0, 'Q0'), 33920: TrecQrel('6761', '89740#1', 1, 'Q0'), }) self._test_qrels('miracl/ru/dev', count=13100, items={ 0: TrecQrel('0', '105156#0', 1, 'Q0'), 9: TrecQrel('0', '6560940#1', 0, 'Q0'), 13099: TrecQrel('6762', '5244374#14', 0, 'Q0'), }) self._test_qrels('miracl/sw/train', count=9359, items={ 0: TrecQrel('0', '18032#10', 1, 'Q0'), 9: TrecQrel('0', '35063#11', 0, 'Q0'), 9358: TrecQrel('2600', '33987#0', 1, 'Q0'), }) self._test_qrels('miracl/sw/dev', count=5092, items={ 0: TrecQrel('6', '32589#0', 1, 'Q0'), 9: TrecQrel('6', '32589#3', 0, 'Q0'), 5091: TrecQrel('2597', '34586#1', 0, 'Q0'), }) self._test_qrels('miracl/te/train', count=18608, items={ 0: 
# Continuation: expected qrels for miracl/te/train (positions 9 and last) and the remaining splits.
TrecQrel('0', '59628#4', 1, 'Q0'), 9: TrecQrel('0', '40216#0', 0, 'Q0'), 18607: TrecQrel('4867', '109661#0', 1, 'Q0'), }) self._test_qrels('miracl/te/dev', count=1606, items={ 0: TrecQrel('2', '259006#0', 1, 'Q0'), 9: TrecQrel('50', '108202#2', 1, 'Q0'), 1605: TrecQrel('4869', '132304#0', 1, 'Q0'), }) self._test_qrels('miracl/th/train', count=21293, items={ 0: TrecQrel('0', '12146#1', 1, 'Q0'), 9: TrecQrel('12', '548193#17', 0, 'Q0'), 21292: TrecQrel('4130', '32375#1', 1, 'Q0'), }) self._test_qrels('miracl/th/dev', count=7573, items={ 0: TrecQrel('4', '9800#4', 1, 'Q0'), 9: TrecQrel('4', '928787#0', 0, 'Q0'), 7572: TrecQrel('4113', '235995#23', 0, 'Q0'), }) self._test_qrels('miracl/yo/dev', count=1188, items={ 0: TrecQrel('10020#0', '10020#1', 1, 'Q0'), 9: TrecQrel('10020#0', '68760#255', 0, 'Q0'), 1187: TrecQrel('9593#0', '36891#0', 0, 'Q0'), }) self._test_qrels('miracl/zh/train', count=13113, items={ 0: TrecQrel('1000222#0', '453#50', 1, 'Q0'), 9: TrecQrel('1000222#0', '7345501#3', 0, 'Q0'), 13112: TrecQrel('997826#0', '7757109#2', 0, 'Q0'), }) self._test_qrels('miracl/zh/dev', count=3928, items={ 0: TrecQrel('1009493#0', '1016228#22', 1, 'Q0'), 9: TrecQrel('1009493#0', '66771#5', 0, 'Q0'), 3927: TrecQrel('992843#0', '293786#0', 0, 'Q0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/mmarco.py ================================================ import re import unittest import ir_datasets from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel, GenericScoredDoc, GenericDocPair from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy()
# Integration tests for the mMARCO (multilingual MS MARCO) datasets.
# test_docs spot-checks each translated passage collection (count=8841823 docs
# per language): for every corpus it pins the documents at positions 0, 9, and
# last, either as an exact string or as a regex anchored on the start/end of
# the text (the `.{N}` in the middle elides the body).
# NOTE: flags=48 on the compiled patterns is re.DOTALL | re.UNICODE (16 | 32).
class TestMMarco(DatasetIntegrationTest): def test_docs(self): self._test_docs('mmarco/es', count=8841823, items={ 0: GenericDoc('0', re.compile('^La presencia de la comunicación entre las mentes científicas fue igualmente importante para el éxito.{317}micos es lo que su éxito realmente significó; cientos de miles de vidas inocentes fueron 
destruidas\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Una de las principales razones por las que Hanford fue seleccionado como sitio para el Reactor B del.{38}al río Columbia, el río más grande que fluye hacia el Océano Pacífico desde la costa norteamericana\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Ver imagen a tamaño completo\\. Detrás de las escenas de la luz deslumbrante muestra que los espectado.{344}cipalmente sales metálicas y óxidos metálicos, que reaccionan para producir una variedad de colores\\.$', flags=48)), }) self._test_docs('mmarco/fr', count=8841823, items={ 0: GenericDoc('0', re.compile('^La présence de la communication au milieu des esprits scientifiques était tout aussi importante pour.{355}st ce que leur succès signifiait vraiment; des centaines de milliers de vies innocentes ont disparu\\.$', flags=48)), 9: GenericDoc('9', re.compile("^L'une des principales raisons pour lesquelles Hanford a été choisi comme site du réacteur B du proje.{42}re Columbia, la plus grande rivière qui coule dans l'océan Pacifique depuis la côte nord\\-américaine\\.$", flags=48)), 8841822: GenericDoc('8841822', re.compile("^Affichage de l'image pleine taille\\. 
Dans les coulisses de la lumière éblouissante montre que les spe.{421} des sels métalliques et des oxydes métalliques, qui réagissent pour produire une gamme de couleurs\\.$", flags=48)), }) self._test_docs('mmarco/pt', count=8841823, items={ 0: GenericDoc('0', re.compile('^A presença da comunicação em meio às mentes científicas era igualmente importante para o sucesso do .{294}micos é o que seu sucesso realmente significava; centenas de milhares de vidas inocentes destruídas\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Uma das principais razões pelas quais Hanford foi selecionado como um local para o Reator B do Proje.{22}roximidade com o Rio Columbia, o maior rio que flui para o Oceano Pacífico da costa norte\\-americana\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Ver imagem em tamanho completo\\. Atrás das cenas da luz deslumbrante mostra que os espectadores ooh e.{332}, principalmente sais metálicos e óxidos metálicos, que reagem para produzir uma variedade de cores\\.$', flags=48)), }) self._test_docs('mmarco/it', count=8841823, items={ 0: GenericDoc('0', re.compile('^La presenza della comunicazione tra le menti scientifiche era altrettanto importante per il successo.{327}ò che il loro successo ha veramente significato; centinaia di migliaia di vite innocenti annientati\\.$', flags=48)), 9: GenericDoc('9', re.compile("^Uno dei motivi principali per cui Hanford è stato scelto come sito per il reattore B del Progetto Ma.{28} al fiume Columbia, il più grande fiume che scorreva nell'Oceano Pacifico dalla costa nordamericana\\.$", flags=48)), 8841822: GenericDoc('8841822', re.compile('^Visualizza immagine a grandezza naturale\\. 
Dietro le quinte della luce abbagliante mostra che gli spe.{353} principalmente sali metallici e ossidi di metallo, che reagiscono per produrre una serie di colori\\.$', flags=48)), }) self._test_docs('mmarco/id', count=8841823, items={ 0: GenericDoc('0', re.compile('^Kehadiran komunikasi di tengah pikiran ilmiah sama pentingnya dengan keberhasilan Proyek Manhattan s.{249} adalah apa sebenarnya tujuan kesuksesan mereka; ratusan ribu nyawa yang tidak bersalah dilenyapkan\\.$', flags=48)), 9: GenericDoc('9', 'Salah satu alasan utama Hanford dipilih sebagai lokasi untuk Reaktor B Proyek Manhattan adalah dekatnya ke Sungai Columbia, sungai terbesar yang mengalir ke Samudra Pasifik dari pesisir Amerika Utara.'), 8841822: GenericDoc('8841822', re.compile('^Tilik gambar ukuran penuh\\. Di balik layar cahaya yang menyilaukan itu, para penonton memperlihatkan .{356}a khusus, terutama garam logam dan logam oksida, yang bereaksi untuk menghasilkan serangkaian warna\\.$', flags=48)), }) self._test_docs('mmarco/de', count=8841823, items={ 0: GenericDoc('0', re.compile('^Die Präsenz der Kommunikation inmitten wissenschaftlicher Köpfe war für den Erfolg des Manhattan\\-Pro.{306}ängt, ist, was ihr Erfolg wirklich bedeutete; Hunderttausende unschuldiger Leben wurden ausgelöscht\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Einer der Hauptgründe, weshalb Hanford als Standort für den B Reactor des Manhattan\\-Projekts ausgewä.{34}mbia River, dem größten Fluss, der von der nordamerikanischen Küste in den Pazifischen Ozean fließt\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Sehen Sie das Bild in voller Größe an\\. 
Hinter den Kulissen des blendenden Lichts zeigt sich, dass di.{332}üllt, vor allem Metallsalze und Metalloxide, die reagieren, um eine Reihe von Farben zu produzieren\\.$', flags=48)), }) self._test_docs('mmarco/ru', count=8841823, items={ 0: GenericDoc('0', re.compile('^▪ Присутствие общения в научных кругах имело не менее важное значение для успеха Манхэттенского прое.{286} то, что их успех действительно имел в виду; сотни тысяч ни в чем не повинных людей были уничтожены\\.$', flags=48)), 9: GenericDoc('9', re.compile('^• Одной из главных причин, по которой Хэнфорд был выбран в качестве объекта для реактора B Манхэттен.{28}ость к реке Колумбия, крупнейшей реке, которая течет в Тихий океан от североамериканского побережья\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Посмотрите на изображение полного размера\\. За сценами ослепляющего света видно, что зрители 4 июля т.{329}бразом металлическими солями и оксидами металлов, которые реагируют на появление целого ряда цветов\\.$', flags=48)), }) self._test_docs('mmarco/zh', count=8841823, items={ 0: GenericDoc('0', '科学思想中的交流对曼哈顿项目的成功同样重要,就像科学智慧一样。 科学思想中的交流对曼哈顿项目的成功同样重要,就像科学智慧一样。 原子研究者与工程师们唯一令人印象深刻的成就是他们的成功真正意味着什么;数十万无辜生命被毁灭。'), 9: GenericDoc('9', 'Hanford被选为曼哈顿项目B反应堆场地的主要原因之一是它靠近哥伦比亚河,这是从北美海岸流入太平洋的最大河流。'), 8841822: GenericDoc('8841822', '查看全尺寸图像 。 @ action: inmenu 7月4日的觀眾Oh和Ahh是精心策劃的煙火。 無論是紅、白、藍噴泉或紫煙花, 在每件手工制的烟火中,都有装满特殊化学品的小包,主要是金属盐和氧化金属,它们的反应是产生多种颜色。'), })
# Expectations for the mmarco/v2/* (version 2) document translations follow.
self._test_docs('mmarco/v2/ar', count=8841823, items={ 0: GenericDoc('0', re.compile('^كان وجود التواصل وسط العقول العلمية مهمًا بنفس القدر لنجاح مشروع مانهاتن مثل الفكر العلمي\\. 
السحابة ا.{18} الإنجاز الرائع للباحثين والمهندسين الذريين هو ما يعنيه نجاحهم حقًا ؛ مئات الآلاف من الأبرياء طمسوا\\.$', flags=48)), 9: GenericDoc('9', 'كان أحد الأسباب الرئيسية لاختيار هانفورد كموقع لمشروع مانهاتن B Reactor هو قربه من نهر كولومبيا ، أكبر نهر يتدفق إلى المحيط الهادئ من ساحل أمريكا الشمالية.'), 8841822: GenericDoc('8841822', re.compile('^عرض الصورة بالحجم الكامل\\. خلف الكواليس ، يظهر الضوء المبهر أن المتفرجين في الرابع من يوليو ، هم ألعا.{226}بمواد كيميائية خاصة ، خاصة الأملاح المعدنية وأكاسيد المعادن ، والتي تتفاعل لإنتاج مجموعة من الألوان\\.$', flags=48)), }) self._test_docs('mmarco/v2/zh', count=8841823, items={ 0: GenericDoc('0', '科学头脑中的交流对于曼哈顿计划的成功与科学智力同等重要。笼罩着原子研究人员和工程师令人印象深刻的成就的唯一乌云是他们的成功真正意味着什么;数十万无辜的生命被抹杀。'), 9: GenericDoc('9', '汉福德被选为曼哈顿项目 B 反应堆选址的主要原因之一是它靠近哥伦比亚河,哥伦比亚河是从北美海岸流入太平洋的最大河流。'), 8841822: GenericDoc('8841822', '查看全尺寸图像。 在耀眼的灯光背后,七月四日的观众哦和啊,都是精心制作的烟花。 无论是红色、白色和蓝色的喷泉,还是紫色的烟花,每个烟花都含有恰到好处的化学物质组合,以创造出这些五颜六色的灯光。 每个手工制作的烟花里面都有小包,里面装满了特殊的化学物质,主要是金属盐和金属氧化物,它们会发生反应,产生一系列颜色。'), }) self._test_docs('mmarco/v2/dt', count=8841823, items={ 0: GenericDoc('0', re.compile('^De aanwezigheid van communicatie te midden van wetenschappelijke geesten was even belangrijk voor he.{157}enieurs hangt, is wat hun succes werkelijk betekende; honderdduizenden onschuldige levens uitgewist\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Een van de belangrijkste redenen waarom Hanford werd geselecteerd als locatie voor de B Reactor van .{46} Columbia River, de grootste rivier die vanaf de Noord\\-Amerikaanse kust in de Stille Oceaan stroomt\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Bekijk de afbeelding op volledige grootte\\. 
Achter de schermen van het verblindende licht is te zien .{329}aliën, voornamelijk metaalzouten en metaaloxiden, die reageren om een \u200b\u200breeks kleuren te produceren\\.$', flags=48)), }) self._test_docs('mmarco/v2/fr', count=8841823, items={ 0: GenericDoc('0', re.compile('^La présence de la communication parmi les esprits scientifiques était tout aussi importante pour le .{158}est ce que leur succès signifiait vraiment ; des centaines de milliers de vies innocentes anéanties\\.$', flags=48)), 9: GenericDoc('9', re.compile("^L'une des principales raisons pour lesquelles Hanford a été choisi comme site pour le réacteur B du .{46}euve Columbia, le plus grand fleuve se jetant dans l'océan Pacifique depuis la côte nord\\-américaine\\.$", flags=48)), 8841822: GenericDoc('8841822', re.compile("^Voir l'image en taille réelle\\. Dans les coulisses des spectacles de lumière éblouissante que les spe.{414} des sels métalliques et des oxydes métalliques, qui réagissent pour produire une gamme de couleurs\\.$", flags=48)), }) self._test_docs('mmarco/v2/de', count=8841823, items={ 0: GenericDoc('0', re.compile('^Die Präsenz der Kommunikation unter wissenschaftlichen Köpfen war für den Erfolg des Manhattan\\-Proje.{118}eure schwebt nur, was ihr Erfolg wirklich bedeutete; Hunderttausende unschuldiger Leben ausgelöscht\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Einer der Hauptgründe, warum Hanford als Standort für den B\\-Reaktor des Manhattan\\-Projekts ausgewähl.{30}mbia River, dem größten Fluss, der von der nordamerikanischen Küste in den Pazifischen Ozean mündet\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Bild in voller Größe anzeigen\\. 
Hinter den Kulissen der gleißenden Lichtershows, die die Zuschauer am.{349}d, hauptsächlich Metallsalzen und Metalloxiden, die reagieren, um eine Reihe von Farben zu erzeugen\\.$', flags=48)), }) self._test_docs('mmarco/v2/hi', count=8841823, items={ 0: GenericDoc('0', re.compile('^वैज्ञानिक बुद्धि के रूप में मैनहट्टन परियोजना की सफलता के लिए वैज्ञानिक दिमाग के बीच संचार की उपस्थि.{111}का हुआ एकमात्र बादल उनकी सफलता का सही मायने में मतलब है; सैकड़ों हजारों निर्दोष लोगों की जान चली गई।$', flags=48)), 9: GenericDoc('9', re.compile('^मैनहट्टन प्रोजेक्ट के बी रिएक्टर के लिए हनफोर्ड को एक साइट के रूप में चुने जाने के मुख्य कारणों में .{9}ंबिया नदी से इसकी निकटता थी, जो उत्तरी अमेरिकी तट से प्रशांत महासागर में बहने वाली सबसे बड़ी नदी थी।$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^पूर्ण आकार की छवि देखें। चकाचौंध रोशनी के दृश्यों के पीछे से पता चलता है कि चार जुलाई को दर्शक ऊह और.{284} से धातु के लवण और धातु के आक्साइड, जो रंगों की एक सरणी का उत्पादन करने के लिए प्रतिक्रिया करते हैं।$', flags=48)), }) self._test_docs('mmarco/v2/id', count=8841823, items={ 0: GenericDoc('0', re.compile('^Kehadiran komunikasi di tengah pikiran ilmiah sama pentingnya dengan keberhasilan Proyek Manhattan s.{130} atom adalah apa arti kesuksesan mereka yang sebenarnya; ratusan ribu nyawa tak berdosa dilenyapkan\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Salah satu alasan utama Hanford dipilih sebagai lokasi Reaktor B Proyek Manhattan adalah kedekatanny.{1} dengan Sungai Columbia, sungai terbesar yang mengalir ke Samudra Pasifik dari pantai Amerika Utara\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Lihat gambar ukuran penuh\\. 
Di balik layar cahaya yang menyilaukan menunjukkan bahwa penonton ooh dan.{326}imia khusus, terutama garam logam dan oksida logam, yang bereaksi untuk menghasilkan berbagai warna\\.$', flags=48)), }) self._test_docs('mmarco/v2/it', count=8841823, items={ 0: GenericDoc('0', re.compile('^La presenza della comunicazione tra le menti scientifiche era altrettanto importante per il successo.{164} ciò che significava veramente il loro successo; centinaia di migliaia di vite innocenti cancellate\\.$', flags=48)), 9: GenericDoc('9', re.compile("^Uno dei motivi principali per cui Hanford è stata scelta come sito per il reattore B del Progetto Ma.{30}za al fiume Columbia, il più grande fiume che scorre nell'Oceano Pacifico dalla costa nordamericana\\.$", flags=48)), 8841822: GenericDoc('8841822', re.compile("^Visualizza l'immagine a dimensione intera\\. Dietro le quinte degli spettacoli di luce abbagliante che.{380}, principalmente sali metallici e ossidi metallici, che reagiscono per produrre una serie di colori\\.$", flags=48)), }) self._test_docs('mmarco/v2/ja', count=8841823, items={ 0: GenericDoc('0', 'マンハッタン計画の成功には、科学的知性と同様に、科学的精神の中でのコミュニケーションの存在も同様に重要でした。原子研究者とエンジニアの印象的な業績にぶら下がっている唯一の雲は、彼らの成功が本当に意味したことです。何十万もの罪のない命が失われました。'), 9: GenericDoc('9', 'ハンフォードがマンハッタン計画のB原子炉の場所として選ばれた主な理由のひとつは、北米沿岸から太平洋に流れ込む最大の川であるコロンビア川に近いことでした。'), 8841822: GenericDoc('8841822', re.compile('^フルサイズの画像を表示します。 まばゆいばかりの光の舞台裏では、7月4日の観客が慎重に作られた花火であることがわかります。 赤、白、青の噴水でも紫の線香花火でも、各花火にはこれらのカラフルなライトを作.{2}るための化学物質の適切な組み合わせが詰め込まれています。 それぞれの手作りの花火の中には、特殊な化学物質、主に金属塩と金属酸化物で満たされた小さなパケットがあり、これらは反応して一連の色を生成します。$', flags=48)), }) self._test_docs('mmarco/v2/pt', count=8841823, items={ 0: GenericDoc('0', re.compile('^A presença de comunicação entre mentes científicas foi tão importante para o sucesso do Projeto Manh.{128}ômicos é o que seu sucesso realmente significou; centenas de milhares de vidas inocentes destruídas\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Um dos principais motivos pelos quais Hanford 
foi selecionado como local para o Reator B do Projeto .{24}idade com o Rio Columbia, o maior rio que deságua no Oceano Pacífico vindo da costa norte\\-americana\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Veja a imagem em tamanho grande\\. Nos bastidores da luz deslumbrante mostra que os espectadores ooh e.{329}s especiais, principalmente sais e óxidos de metal, que reagem para produzir uma variedade de cores\\.$', flags=48)), }) self._test_docs('mmarco/v2/ru', count=8841823, items={ 0: GenericDoc('0', re.compile('^Наличие связи между научными умами было столь же важно для успеха Манхэттенского проекта, как и науч.{100}ов\\-атомщиков, \\- это то, что на самом деле означало их успех; уничтожены сотни тысяч невинных жизней\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Одной из основных причин, по которой Хэнфорд был выбран в качестве площадки для реактора B Манхэттен.{20}его близость к реке Колумбия, крупнейшей реке, впадающей в Тихий океан с побережья Северной Америки\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Посмотреть полноразмерное изображение\\. За кулисами ослепительный свет показывает, что зрители ох и о.{334}, в основном солями металлов и оксидами металлов, которые реагируют с образованием множества цветов\\.$', flags=48)), }) self._test_docs('mmarco/v2/es', count=8841823, items={ 0: GenericDoc('0', re.compile('^La presencia de comunicación entre mentes científicas fue tan importante para el éxito del Proyecto .{135}ros atómicos es lo que realmente significó su éxito; cientos de miles de vidas inocentes destruidas\\.$', flags=48)), 9: GenericDoc('9', re.compile('^Una de las principales razones por las que Hanford fue seleccionado como sitio para el Reactor B del.{45}Columbia, el río más grande que desemboca en el Océano Pacífico desde la costa de América del Norte\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^Ver imagen a tamaño completo\\. 
Detrás de las escenas de los espectáculos de luces deslumbrantes que l.{390}cipalmente sales metálicas y óxidos metálicos, que reaccionan para producir una variedad de colores\\.$', flags=48)), }) self._test_docs('mmarco/v2/vi', count=8841823, items={ 0: GenericDoc('0', re.compile('^Sự hiện diện của giao tiếp giữa những bộ óc khoa học cũng quan trọng không kém đối với sự thành công.{139}n tử là thành công của họ thực sự có ý nghĩa như thế nào; hàng trăm ngàn sinh mạng vô tội bị xóa sổ\\.$', flags=48)), 9: GenericDoc('9', 'Một trong những lý do chính khiến Hanford được chọn làm nơi đặt Lò phản ứng B của Dự án Manhattan là nó nằm gần sông Columbia, con sông lớn nhất đổ ra Thái Bình Dương từ bờ biển Bắc Mỹ.'), 8841822: GenericDoc('8841822', re.compile('^Xem hình ảnh kích thước đầy đủ\\. Phía sau hậu trường của ánh sáng rực rỡ cho thấy khán giả ooh và ahh.{283} đầy hóa chất đặc biệt, chủ yếu là muối kim loại và oxit kim loại, phản ứng tạo ra một loạt màu sắc\\.$', flags=48)), }) def test_queries(self): self._test_queries('mmarco/es/train', count=808731, items={ 0: GenericQuery('121352', 'Definir extrema'), 9: GenericQuery('492875', 'Temperatura del desinfectante'), 808730: GenericQuery('50393', 'Beneficios de hervir limones y beber jugo.'), }) self._test_queries('mmarco/es/dev', count=101092, items={ 0: GenericQuery('1048578', 'coste de interminables piscinas / spa de natación'), 9: GenericQuery('1048587', '¿Qué es el patrón?'), 101091: GenericQuery('524285', 'Cinta de correr inclinada significado'), }) self._test_queries('mmarco/es/dev/small', count=6980, items={ 0: GenericQuery('1048585', '¿Qué es el hermano de Paula Deen?'), 9: GenericQuery('524699', 'Número del servicio de tricare'), 6979: GenericQuery('1048565', 'que juega michaelis sebastian'), }) self._test_queries('mmarco/fr/train', count=808731, items={ 0: GenericQuery('121352', 'Définition extrême'), 9: GenericQuery('492875', 'Température du désinfectant'), 808730: GenericQuery('50393', 'Les bienfaits des 
citrons bouillants et du jus de boisson.'), }) self._test_queries('mmarco/fr/dev', count=101093, items={ 0: GenericQuery('1048578', 'Coût des piscines sans fin / spa de natation'), 9: GenericQuery('1048587', "Qu'est-ce que le patron"), 101092: GenericQuery('524285', "Sens de l'inclinaison du tapis roulant"), }) self._test_queries('mmarco/fr/dev/small', count=6980, items={ 0: GenericQuery('1048585', "Qu'est-ce que le frère de Paula Deen?"), 9: GenericQuery('524699', 'Numéro de service tricare'), 6979: GenericQuery('1048565', 'Qui joue des michaélis sébastes'), }) self._test_queries('mmarco/pt/train', count=811690, items={ 0: GenericQuery('121352', 'Definir extremo'), 9: GenericQuery('492875', 'Temperatura do desinfectante'), 760: GenericQuery('1009273', 'Qual evangelista se tornou um ateu depois de realizar muitas reuniões?'), 761: GenericQuery('1009273', 'O nome dele era Chuck...'), 811689: GenericQuery('50393', 'Benefícios de ferver limão e beber suco.'), }) self._test_queries('mmarco/pt/dev', count=101619, items={ 0: GenericQuery('1048578', 'custo de piscinas intermináveis / spa de banho'), 9: GenericQuery('1048587', 'O que é patrono?'), 29: GenericQuery('52', 'Pneumo é um prefixo que significa ar.'), 30: GenericQuery('52', 'Sabendo disso, explique por que essa condição é chamada pneumotórax.'), 101618: GenericQuery('524285', 'Esteira inclinada significando'), }) self._test_queries('mmarco/pt/dev/small', count=7000, items={ 0: GenericQuery('1048585', 'O que é o irmão de Paula Deen?'), 9: GenericQuery('524699', 'Número do serviço tricare.'), 6999: GenericQuery('1048565', 'que joga sebastiano michaelis'), }) self._test_queries('mmarco/pt/train/v1.1', count=808731, items={ 0: GenericQuery('121352', 'Definir extremo'), 9: GenericQuery('492875', 'Temperatura do desinfectante'), 760: GenericQuery('1009273', 'Qual evangelista se tornou um ateu depois de realizar muitas reuniões? 
O nome dele era Chuck...'), 808730: GenericQuery('50393', 'Benefícios de ferver limão e beber suco.'), }) self._test_queries('mmarco/pt/dev/v1.1', count=101093, items={ 0: GenericQuery('1048578', 'custo de piscinas intermináveis / spa de banho'), 9: GenericQuery('1048587', 'O que é patrono?'), 29: GenericQuery('52', 'Pneumo é um prefixo que significa ar. Sabendo disso, explique por que essa condição é chamada pneumotórax.'), 101092: GenericQuery('524285', 'Esteira inclinada significando'), }) self._test_queries('mmarco/pt/dev/small/v1.1', count=6980, items={ 0: GenericQuery('1048585', 'O que é o irmão de Paula Deen?'), 9: GenericQuery('524699', 'Número do serviço tricare.'), 6979: GenericQuery('1048565', 'que joga sebastiano michaelis'), }) self._test_queries('mmarco/it/train', count=808731, items={ 0: GenericQuery('121352', "L'ente creditizio definisce l'estremo"), 9: GenericQuery('492875', 'Temperatura di sanificante di polistirolo'), 808730: GenericQuery('50393', 'Benefici di limone bollente e succo di bevuta.'), }) self._test_queries('mmarco/it/dev', count=101093, items={ 0: GenericQuery('1048578', 'Costo delle infinite piscine/piscine termali'), 9: GenericQuery('1048587', '[47] ciò che è patrono'), 101092: GenericQuery('524285', 'Tapis roulant inclinazione significato'), }) self._test_queries('mmarco/it/dev/small', count=6980, items={ 0: GenericQuery('1048585', "Cos'è il fratello di Paula deen?"), 9: GenericQuery('524699', 'Numero di servizio di tricare del benzile'), 6979: GenericQuery('1048565', 'Coccodrillo che gioca a michaelis Sebastian'), }) self._test_queries('mmarco/id/train', count=808731, items={ 0: GenericQuery('121352', 'Definisi ekstrim'), 9: GenericQuery('492875', 'Suhu pembersih'), 808730: GenericQuery('50393', 'Manfaat lemon mendidih dan jus minum.'), }) self._test_queries('mmarco/id/dev', count=101093, items={ 0: GenericQuery('1048578', 'Biaya kolam renang tak berujung/swim spa'), 9: GenericQuery('1048587', 'Apa yang menjadi pelindungnya?'), 
101092: GenericQuery('524285', 'Arti dari treadmill treadmill'), }) self._test_queries('mmarco/id/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'Apa itu kakak paula deen'), 9: GenericQuery('524699', 'Nomor layanan tricare'), 6979: GenericQuery('1048565', 'Yang memainkan Sebastian Michaelis.'), }) self._test_queries('mmarco/de/train', count=808731, items={ 0: GenericQuery('121352', 'Bestimmen Sie extreme'), 9: GenericQuery('492875', 'Temperatur des Reinigungsmittels'), 808730: GenericQuery('50393', 'Die Vorteile von kochenden Zitronen und Trinksaft.'), }) self._test_queries('mmarco/de/dev', count=101093, items={ 0: GenericQuery('1048578', 'Kosten für endlose Pools/Schwimmbad'), 9: GenericQuery('1048587', 'Was ist der Schutzpatron?'), 101092: GenericQuery('524285', 'Laufband Neigung Bedeutung'), }) self._test_queries('mmarco/de/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'Was ist Paula deens Bruder?'), 9: GenericQuery('524699', 'Nummer des Tricare-Services'), 6979: GenericQuery('1048565', 'Wer spielt sebastian michaelis'), }) self._test_queries('mmarco/ru/train', count=808731, items={ 0: GenericQuery('121352', '□ определить крайность'), 9: GenericQuery('492875', '▸ Температура обеззараживающих веществ'), 808730: GenericQuery('50393', '- выгоды от кипящего лимона и сока.'), }) self._test_queries('mmarco/ru/dev', count=101093, items={ 0: GenericQuery('1048578', '- стоимость бесконечных пулов/плавучих спа'), 9: GenericQuery('1048587', '□ что является покровителем'), 101092: GenericQuery('524285', '" беговая дорожка " означает " наклонная дорожка ".'), }) self._test_queries('mmarco/ru/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'Что такое брат Паулы Дина?'), 9: GenericQuery('524699', '□ номер службы трехразового ухода'), 6979: GenericQuery('1048565', 'который играет себастьяна Михаэлиса'), }) self._test_queries('mmarco/zh/train', count=808731, items={ 0: GenericQuery('121352', '定义极端'), 9: GenericQuery('492875', '净化剂温度'), 808730: 
GenericQuery('50393', '煮柠檬和喝果汁的好处。'), }) self._test_queries('mmarco/zh/dev', count=101093, items={ 0: GenericQuery('1048578', '無止境的池塘/游泳垃圾邮件的成本'), 1: GenericQuery('1048579', ''), 9: GenericQuery('1048587', '什么是赞助人?'), 101092: GenericQuery('524285', '踢踏机嵌入的含義Name'), }) self._test_queries('mmarco/zh/dev/small', count=6980, items={ 0: GenericQuery('1048585', '{\\fn黑体\\fs22\\bord1\\shad0\\3aHBE\\4aH00\\fscx67\\fscy66\\2cHFFFFFF\\3cH808080}Paula Deen的弟弟是什么 {\\fn黑体\\fs22\\bord1\\shad0\\3aHBE\\4aH00\\fscx67\\fscy66\\2cHFFFFFF\\3cH808080}'), 9: GenericQuery('524699', '三角护理服务数量@ info: whatsthis'), 6979: GenericQuery('1048565', '扮演塞巴斯蒂安·米切利斯'), }) self._test_queries('mmarco/zh/dev/v1.1', count=101093, items={ 0: GenericQuery('1048578', '無止境的池塘/游泳垃圾邮件的成本'), 1: GenericQuery('1048579', ' . '), 9: GenericQuery('1048587', '什么是赞助人?'), 101092: GenericQuery('524285', '踢踏机嵌入的含義Name'), }) self._test_queries('mmarco/zh/dev/small/v1.1', count=6980, items={ 0: GenericQuery('1048585', '{\\fn黑体\\fs22\\bord1\\shad0\\3aHBE\\4aH00\\fscx67\\fscy66\\2cHFFFFFF\\3cH808080}Paula Deen的弟弟是什么 {\\fn黑体\\fs22\\bord1\\shad0\\3aHBE\\4aH00\\fscx67\\fscy66\\2cHFFFFFF\\3cH808080}'), 9: GenericQuery('524699', '三角护理服务数量@ info: whatsthis'), 6979: GenericQuery('1048565', '扮演塞巴斯蒂安·米切利斯'), }) self._test_queries('mmarco/v2/ar/train', count=808731, items={ 0: GenericQuery('1', 'يعتبر بوتلاتش مثالاً على'), 9: GenericQuery('14', ' كانت سامانثا وجوزفين يستعدان لبدء تجارة الملابس الخاصة بهما ؛ اتصلوا'), 808730: GenericQuery('1185869', ') ما هو الأثر المباشر لنجاح مشروع مانهاتن؟'), }) self._test_queries('mmarco/v2/ar/dev', count=101093, items={ 0: GenericQuery('2', 'تحديد مستقبلات الاندروجين'), 9: GenericQuery('106', 'تعريف تقسيم "كوم متعدد الوحدات"'), 101092: GenericQuery('1102432', '. 
ما هي الشركة؟'), }) self._test_queries('mmarco/v2/ar/dev/small', count=6980, items={ 0: GenericQuery('2', 'تحديد مستقبلات الاندروجين'), 9: GenericQuery('5925', 'سوني PS-LX300USB كيفية الاتصال بجهاز الكمبيوتر'), 6979: GenericQuery('1102400', 'لماذا الدببة السبات'), }) self._test_queries('mmarco/v2/zh/train', count=808731, items={ 0: GenericQuery('1', '一个potlatch被认为是一个例子'), 9: GenericQuery('14', ' 萨曼莎和约瑟芬正准备开始自己的服装生意;他们联系了'), 808730: GenericQuery('1185869', ') 曼哈顿项目成功的直接影响是什么?'), }) self._test_queries('mmarco/v2/zh/dev', count=101093, items={ 0: GenericQuery('2', '雄激素受体定义'), 9: GenericQuery('106', '“com 多单元”分区定义'), 101092: GenericQuery('1102432', '.什么是公司?'), }) self._test_queries('mmarco/v2/zh/dev/small', count=6980, items={ 0: GenericQuery('2', '雄激素受体定义'), 9: GenericQuery('5925', '索尼 PS-LX300USB 如何连接电脑'), 6979: GenericQuery('1102400', '熊为什么要冬眠'), }) self._test_queries('mmarco/v2/dt/train', count=808731, items={ 0: GenericQuery('1', 'Een potlatch wordt beschouwd als een voorbeeld van:'), 9: GenericQuery('14', ' Samantha en Josephine waren zich aan het voorbereiden om hun eigen kledingzaak te beginnen; ze namen contact op'), 808730: GenericQuery('1185869', ')wat was de onmiddellijke impact van het succes van het Manhattan-project?'), }) self._test_queries('mmarco/v2/dt/dev', count=101093, items={ 0: GenericQuery('2', 'Androgeenreceptor definiëren'), 9: GenericQuery('106', "'com multi unit' bestemmingsdefinitie"), 101092: GenericQuery('1102432', '. 
wat is een corporatie?'), }) self._test_queries('mmarco/v2/dt/dev/small', count=6980, items={ 0: GenericQuery('2', 'Androgeenreceptor definiëren'), 9: GenericQuery('5925', 'Sony PS-LX300USB hoe te verbinden met pc'), 6979: GenericQuery('1102400', 'waarom overwinteren beren?'), }) self._test_queries('mmarco/v2/fr/train', count=808731, items={ 0: GenericQuery('121352', "définir l'extrême"), 9: GenericQuery('492875', 'température du désinfectant'), 808730: GenericQuery('50393', 'avantages de faire bouillir des citrons et de boire du jus.'), }) self._test_queries('mmarco/v2/fr/dev', count=101093, items={ 0: GenericQuery('1048578', 'coût des piscines sans fin / spa de nage'), 9: GenericQuery('1048587', "qu'est-ce que le patron"), 101092: GenericQuery('524285', "signification de l'inclinaison du tapis roulant"), }) self._test_queries('mmarco/v2/fr/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'quel est le frère de paula deen'), 9: GenericQuery('524699', 'numéro de service tricare'), 6979: GenericQuery('1048565', 'qui joue sebastian michaelis'), }) self._test_queries('mmarco/v2/de/train', count=808731, items={ 0: GenericQuery('121352', 'Extrem definieren'), 9: GenericQuery('492875', 'Desinfektionsmitteltemperatur'), 808730: GenericQuery('50393', 'Vorteile von Zitronen kochen und Saft trinken.'), }) self._test_queries('mmarco/v2/de/dev', count=101093, items={ 0: GenericQuery('1048578', 'Kosten für endlose Pools/Swimming Spa'), 9: GenericQuery('1048587', 'was ist Patron'), 101092: GenericQuery('524285', 'Laufbandneigung bedeutung'), }) self._test_queries('mmarco/v2/de/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'was ist paula deens bruder'), 9: GenericQuery('524699', 'Tricare-Servicenummer'), 6979: GenericQuery('1048565', 'wer spielt sebastian michaelis'), }) self._test_queries('mmarco/v2/hi/train', count=808731, items={ 0: GenericQuery('1', 'पॉटलैच को का एक उदाहरण माना जाता है'), 9: GenericQuery('14', ' सामंथा और जोसफीन अपना खुद का कपड़ों का 
व्यवसाय शुरू करने की तैयारी कर रहे थे; उन्होंने संपर्क किया'), 808730: GenericQuery('1185869', ') मैनहट्टन परियोजना की सफलता का तत्काल प्रभाव क्या था?'), }) self._test_queries('mmarco/v2/hi/dev', count=101093, items={ 0: GenericQuery('2', 'एण्ड्रोजन रिसेप्टर परिभाषित'), 9: GenericQuery('106', "'कॉम मल्टी यूनिट' ज़ोनिंग परिभाषा"), 101092: GenericQuery('1102432', '. एक निगम क्या है?'), }) self._test_queries('mmarco/v2/hi/dev/small', count=6980, items={ 0: GenericQuery('2', 'एण्ड्रोजन रिसेप्टर परिभाषित'), 9: GenericQuery('5925', 'Sony PS-LX300USB पीसी से कैसे कनेक्ट करें'), 6979: GenericQuery('1102400', 'भालू हाइबरनेट क्यों करते हैं?'), }) self._test_queries('mmarco/v2/id/train', count=808731, items={ 0: GenericQuery('1', 'Potlatch dianggap sebagai contoh dari'), 9: GenericQuery('14', ' Samantha dan Josephine sedang bersiap untuk memulai bisnis pakaian mereka sendiri; mereka menghubungi'), 808730: GenericQuery('1185869', ') apa dampak langsung dari keberhasilan proyek manhattan?'), }) self._test_queries('mmarco/v2/id/dev', count=101093, items={ 0: GenericQuery('2', 'Reseptor androgen menentukan'), 9: GenericQuery('106', "Definisi zonasi 'com multi unit'"), 101092: GenericQuery('1102432', '. 
apa itu korporasi?'), }) self._test_queries('mmarco/v2/id/dev/small', count=6980, items={ 0: GenericQuery('2', 'Reseptor androgen menentukan'), 9: GenericQuery('5925', 'Sony PS-LX300USB cara menghubungkan ke pc'), 6979: GenericQuery('1102400', 'mengapa beruang hibernasi'), }) self._test_queries('mmarco/v2/it/train', count=808731, items={ 0: GenericQuery('121352', 'definire estremo'), 9: GenericQuery('492875', 'temperatura del disinfettante'), 808730: GenericQuery('50393', 'benefici di bollire i limoni e bere succo.'), }) self._test_queries('mmarco/v2/it/dev', count=101093, items={ 0: GenericQuery('1048578', 'costo di piscine infinite/spa'), 9: GenericQuery('1048587', "cos'è il patrono?"), 101092: GenericQuery('524285', 'significato di inclinazione del tapis roulant'), }) self._test_queries('mmarco/v2/it/dev/small', count=6980, items={ 0: GenericQuery('1048585', "cos'è il fratello di paula deen?"), 9: GenericQuery('524699', 'numero di servizio tricare'), 6979: GenericQuery('1048565', 'chi interpreta sebastian michaelis'), }) self._test_queries('mmarco/v2/ja/train', count=808731, items={ 0: GenericQuery('1', 'ポトラッチはの例と見なされます'), 9: GenericQuery('14', ' サマンサとジョセフィンは、独自の衣料品事業を始める準備をしていました。彼らは連絡した'), 808730: GenericQuery('1185869', ')マンハッタン計画の成功の直接の影響は何でしたか?'), }) self._test_queries('mmarco/v2/ja/dev', count=101093, items={ 0: GenericQuery('2', 'アンドロゲン受容体の定義'), 9: GenericQuery('106', '「commultiunit」ゾーニング定義'), 101092: GenericQuery('1102432', '。法人とは?'), }) self._test_queries('mmarco/v2/ja/dev/small', count=6980, items={ 0: GenericQuery('2', 'アンドロゲン受容体の定義'), 9: GenericQuery('5925', 'ソニーPS-LX300USBPCへの接続方法'), 6979: GenericQuery('1102400', 'なぜクマは冬眠するのですか'), }) self._test_queries('mmarco/v2/pt/train', count=808731, items={ 0: GenericQuery('121352', 'definir extremo'), 9: GenericQuery('492875', 'temperatura do desinfetante'), 808730: GenericQuery('50393', 'benefícios de ferver limões e beber suco.'), }) self._test_queries('mmarco/v2/pt/dev', count=101093, items={ 0: 
GenericQuery('1048578', 'custo de piscinas infinitas / spa de natação'), 9: GenericQuery('1048587', 'o que é patrono'), 101092: GenericQuery('524285', 'significado de inclinação da esteira'), }) self._test_queries('mmarco/v2/pt/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'o que é irmão de paula deen'), 9: GenericQuery('524699', 'número do serviço tricare'), 6979: GenericQuery('1048565', 'quem joga sebastian michaelis'), }) self._test_queries('mmarco/v2/ru/train', count=808731, items={ 0: GenericQuery('1', 'Потлач считается примером'), 9: GenericQuery('14', ' Саманта и Жозефина готовились открыть собственный бизнес по производству одежды; они связались'), 808730: GenericQuery('1185869', ') каково было непосредственное влияние успеха манхэттенского проекта?'), }) self._test_queries('mmarco/v2/ru/dev', count=101093, items={ 0: GenericQuery('2', 'Рецепторы андрогенов определяют'), 9: GenericQuery('106', 'Определение зонирования com multi unit'), 101092: GenericQuery('1102432', '. 
что такое корпорация?'), }) self._test_queries('mmarco/v2/ru/dev/small', count=6980, items={ 0: GenericQuery('2', 'Рецепторы андрогенов определяют'), 9: GenericQuery('5925', 'Sony PS-LX300USB как подключить к ПК'), 6979: GenericQuery('1102400', 'почему медведи впадают в спячку'), }) self._test_queries('mmarco/v2/es/train', count=808731, items={ 0: GenericQuery('121352', 'definir extremo'), 9: GenericQuery('492875', 'temperatura del desinfectante'), 808730: GenericQuery('50393', 'Beneficios de hervir limones y beber jugo.'), }) self._test_queries('mmarco/v2/es/dev', count=101093, items={ 0: GenericQuery('1048578', 'costo de piscinas infinitas / spa de natación'), 9: GenericQuery('1048587', 'que es patron'), 101092: GenericQuery('524285', 'significado de la inclinación de la cinta de correr'), }) self._test_queries('mmarco/v2/es/dev/small', count=6980, items={ 0: GenericQuery('1048585', 'que es el hermano de paula deen'), 9: GenericQuery('524699', 'número de servicio tricare'), 6979: GenericQuery('1048565', 'quien interpreta a sebastian michaelis'), }) self._test_queries('mmarco/v2/vi/train', count=808731, items={ 0: GenericQuery('1', 'Một potlatch được coi là một ví dụ của'), 9: GenericQuery('14', ' Samantha và Josephine đang chuẩn bị bắt đầu kinh doanh quần áo của riêng họ; họ đã liên hệ'), 808730: GenericQuery('1185869', ') tác động tức thời đến sự thành công của dự án manhattan là gì?'), }) self._test_queries('mmarco/v2/vi/dev', count=101093, items={ 0: GenericQuery('2', 'Xác định thụ thể androgen'), 9: GenericQuery('106', "định nghĩa phân vùng 'com multi unit'"), 101092: GenericQuery('1102432', '. 
một công ty là gì?'), }) self._test_queries('mmarco/v2/vi/dev/small', count=6980, items={ 0: GenericQuery('2', 'Xác định thụ thể androgen'), 9: GenericQuery('5925', 'Cách kết nối Sony PS-LX300USB với PC'), 6979: GenericQuery('1102400', 'tại sao gấu ngủ đông'), }) def test_qrels(self): self._test_qrels('mmarco/es/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/es/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/es/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/fr/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/fr/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/fr/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/it/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/it/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/it/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) 
self._test_qrels('mmarco/id/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/id/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/id/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/pt/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/pt/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/pt/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/pt/train/v1.1', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/pt/dev/v1.1', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/pt/dev/small/v1.1', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/de/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/de/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: 
TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/de/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/ru/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/ru/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/ru/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/zh/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels('mmarco/zh/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/zh/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels('mmarco/zh/dev/v1.1', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels('mmarco/zh/dev/small/v1.1', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) for ds in ['mmarco/v2/ar', 'mmarco/v2/zh', 'mmarco/v2/dt', 'mmarco/v2/fr', 'mmarco/v2/de', 'mmarco/v2/hi', 'mmarco/v2/id', 'mmarco/v2/it', 'mmarco/v2/ja', 'mmarco/v2/pt', 'mmarco/v2/ru', 'mmarco/v2/es', 
'mmarco/v2/vi']: self._test_qrels(f'{ds}/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0'), }) self._test_qrels(f'{ds}/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0'), }) self._test_qrels(f'{ds}/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) def test_scoreddocs(self): self._test_scoreddocs('mmarco/es/dev/small', count=6786720, items={ 0: GenericScoredDoc('2', '1782337', 12.7035), 9: GenericScoredDoc('2', '4339073', 11.2667), 6786719: GenericScoredDoc('1102400', '533560', 5.598184), }) self._test_scoreddocs('mmarco/fr/dev/small', count=6785763, items={ 0: GenericScoredDoc('2', '1001873', 12.766), 9: GenericScoredDoc('2', '6285819', 11.793), 6785762: GenericScoredDoc('1102400', '1123059', 5.5288), }) self._test_scoreddocs('mmarco/pt/dev/small/v1.1', count=6976324, items={ 0: GenericScoredDoc('2', '1782337', 13.6133), 9: GenericScoredDoc('2', '3996546', 11.7405), 6976323: GenericScoredDoc('1102400', '8613556', 6.592), }) self._test_scoreddocs('mmarco/it/dev/small', count=6966491, items={ 0: GenericScoredDoc('2', '1782337', 13.0295), 9: GenericScoredDoc('2', '2022784', 11.7611), 6966490: GenericScoredDoc('1102400', '4411180', 5.498484), }) self._test_scoreddocs('mmarco/id/dev/small', count=6841990, items={ 0: GenericScoredDoc('2', '1782337', 13.2769), 9: GenericScoredDoc('2', '7496504', 11.857), 6841989: GenericScoredDoc('1102400', '8018446', 4.710481), }) self._test_scoreddocs('mmarco/de/dev/small', count=6594126, items={ 0: GenericScoredDoc('2', '2022779', 9.7118), 9: GenericScoredDoc('2', '2022785', 7.9781), 6594125: GenericScoredDoc('1102400', '1954600', 5.524396), }) self._test_scoreddocs('mmarco/ru/dev/small', count=6958739, items={ 0: 
GenericScoredDoc('2', '6285817', 12.2203), 9: GenericScoredDoc('2', '513654', 10.9697), 6958738: GenericScoredDoc('1102400', '3863095', 5.499489), }) self._test_scoreddocs('mmarco/zh/dev/small/v1.1', count=1034597, items={ 0: GenericScoredDoc('1215', '6593209', 2.2469), 9: GenericScoredDoc('1215', '2815463', 2.237397), 1034596: GenericScoredDoc('1102393', '3789102', 5.4537), }) self._test_scoreddocs('mmarco/v2/ar/dev/small', count=6848687, items={ 0: GenericScoredDoc('2', '1001873', -1.0), 9: GenericScoredDoc('2', '2022779', -10.0), 6848686: GenericScoredDoc('1102400', '8183767', -1000.0), }) self._test_scoreddocs('mmarco/v2/zh/dev/small', count=6979520, items={ 0: GenericScoredDoc('2', '6285817', -1.0), 9: GenericScoredDoc('2', '1782337', -10.0), 6979519: GenericScoredDoc('1102400', '8662465', -1000.0), }) self._test_scoreddocs('mmarco/v2/dt/dev/small', count=6608183, items={ 0: GenericScoredDoc('2', '2022779', -1.0), 9: GenericScoredDoc('2', '6285817', -10.0), 6608182: GenericScoredDoc('1102400', '424880', -1000.0), }) self._test_scoreddocs('mmarco/v2/fr/dev/small', count=6831783, items={ 0: GenericScoredDoc('2', '1782337', -1.0), 9: GenericScoredDoc('2', '6285819', -10.0), 6831782: GenericScoredDoc('1102400', '3586980', -1000.0), }) self._test_scoreddocs('mmarco/v2/de/dev/small', count=6586918, items={ 0: GenericScoredDoc('2', '2022779', -1.0), 9: GenericScoredDoc('2', '5414414', -10.0), 6586917: GenericScoredDoc('1102400', '2607361', -1000.0), }) self._test_scoreddocs('mmarco/v2/hi/dev/small', count=6961912, items={ 0: GenericScoredDoc('2', '1782337', -1.0), 9: GenericScoredDoc('2', '5762719', -10.0), 6961911: GenericScoredDoc('1102400', '5426691', -1000.0), }) self._test_scoreddocs('mmarco/v2/id/dev/small', count=6791487, items={ 0: GenericScoredDoc('2', '1782337', -1.0), 9: GenericScoredDoc('2', '2022782', -10.0), 6791486: GenericScoredDoc('1102400', '2744256', -1000.0), }) self._test_scoreddocs('mmarco/v2/it/dev/small', count=6952771, items={ 0: 
GenericScoredDoc('2', '1782337', -1.0), 9: GenericScoredDoc('2', '2022782', -10.0), 6952770: GenericScoredDoc('1102400', '7352536', -1000.0), }) self._test_scoreddocs('mmarco/v2/ja/dev/small', count=6817446, items={ 0: GenericScoredDoc('2', '3214931', -1.0), 9: GenericScoredDoc('2', '3634076', -10.0), 6817445: GenericScoredDoc('1102400', '1776192', -1000.0), }) self._test_scoreddocs('mmarco/v2/pt/dev/small', count=6975268, items={ 0: GenericScoredDoc('2', '1782337', -1.0), 9: GenericScoredDoc('2', '6285819', -10.0), 6975267: GenericScoredDoc('1102400', '3981631', -1000.0), }) self._test_scoreddocs('mmarco/v2/ru/dev/small', count=6931773, items={ 0: GenericScoredDoc('2', '1782337', -1.0), 9: GenericScoredDoc('2', '112127', -10.0), 6931772: GenericScoredDoc('1102400', '6819581', -1000.0), }) self._test_scoreddocs('mmarco/v2/es/dev/small', count=6777044, items={ 0: GenericScoredDoc('2', '3214931', -1.0), 9: GenericScoredDoc('2', '4339072', -10.0), 6777043: GenericScoredDoc('1102400', '2729917', -1000.0), }) self._test_scoreddocs('mmarco/v2/vi/dev/small', count=6976219, items={ 0: GenericScoredDoc('2', '2022779', -1.0), 9: GenericScoredDoc('2', '3634076', -10.0), 6976218: GenericScoredDoc('1102400', '7485193', -1000.0), }) def test_docpairs(self): self._test_docpairs('mmarco/es/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/fr/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/pt/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/it/train', count=39780811, 
items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/id/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/de/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/ru/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) self._test_docpairs('mmarco/zh/train', count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) for ds in ['mmarco/v2/ar/train', 'mmarco/v2/zh/train', 'mmarco/v2/dt/train', 'mmarco/v2/fr/train', 'mmarco/v2/de/train', 'mmarco/v2/hi/train', 'mmarco/v2/id/train', 'mmarco/v2/it/train', 'mmarco/v2/ja/train', 'mmarco/v2/pt/train', 'mmarco/v2/ru/train', 'mmarco/v2/es/train', 'mmarco/v2/vi/train']: self._test_docpairs(ds, count=39780811, items={ 0: GenericDocPair('400296', '1540783', '3518497'), 9: GenericDocPair('189845', '1051356', '4238671'), 39780810: GenericDocPair('749547', '394235', '7655192'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/mr_tydi.py ================================================ import re import unittest import ir_datasets from ir_datasets.formats import GenericQuery, GenericDoc, TrecQrel from .base import DatasetIntegrationTest class TestMrTydi(DatasetIntegrationTest): def test_docs(self): self._test_docs('mr-tydi/ar', 
count=2106586, items={ 0: GenericDoc('7#0', re.compile('^الماء مادةٌ شفافةٌ عديمة اللون والرائحة، وهو المكوّن الأساسي للجداول والبحيرات والبحار والمحيطات وكذ.{231} يكون الماء سائلاً، ولكنّ حالاته الأخرى شائعة الوجود أيضاً؛ وهي حالة الجليد الصلبة والبخار الغازيّة\\.$', flags=48)), 9: GenericDoc('7#9', re.compile('^يصنّف الماء كيميائيّاً على أنّه أكسيد للهيدروجين، وهو يتشكّل عندما يحترق الهيدروجين أو أيّ مركّب حاو.{1410} كما يمكن أن يكون على الشكل ماء فائق الثقل عندما يحلّ التريتيوم مكان الهيدروجين في جزيء الماء \\(T2O\\)\\.$', flags=48)), 2106585: GenericDoc('5272574#0', re.compile('^الشاهيه او الشوهيه هو مرادف للكافيه وهو مكان لتقديم الشاي بدلاً من القهوة \\. الشاهيه غير منتشرة عالمي.{208}ث العرش و والي الممالك السبعة ايمن بن فهد اول من وضع لبنة لهذا النوع من المحلات و من مهد الطريق لهم\\.$', flags=48)), }) self._test_docs('mr-tydi/bn', count=304059, items={ 0: GenericDoc('608#0', re.compile('^বাংলা ভাষা \\(/bɑːŋlɑː/; pronunciation\\) দক্ষিণ এশিয়ার বঙ্গ অঞ্চলের মানুষের স্থানীয় ভাষা, এই অঞ্চলটি .{444}, এবং ভারতের জাতীয় স্তোত্র এই ভাষাতেই রচিত এবং তা থেকেই দক্ষিণ এশিয়ায় এই ভাষার গুরুত্ব বোঝা যায়।$', flags=48)), 9: GenericDoc('608#9', re.compile('^১৯৫১–৫২ সালে পূর্ব পাকিস্তানে বাঙালি জনগণের প্রবল ভাষা সচেতনতার ফলস্বরূপ বাংলা ভাষা আন্দোলন নামক একট.{264}। ১৯৯৯ খ্রিস্টাব্দের ১৭ই নভেম্বর ইউনেস্কো এই দিনটিকে আন্তর্জাতিক মাতৃভাষা দিবসের মর্যাদা প্রদান করে।$', flags=48)), 304058: GenericDoc('719190#0', re.compile('^কোনরাড এলস্ট \\(জন্ম 7 অগাস্ট 1959\\) একজন বেলজীয় প্রাচ্যবিদ এবং ভারতবিদ যিনি তুলনামূলক ধর্মতত্ত্ব, হিন.{454}ভূত। এলস্ট হিন্দু জাতীয়তাবাদের বিষয়ে ডক্টরেট করেছেন, এবং হিন্দু জাতীয়তাবাদ আন্দোলনের সমর্থন করেন।$', flags=48)), }) self._test_docs('mr-tydi/en', count=32907100, items={ 0: GenericDoc('12#0', re.compile('^Anarchism is a political philosophy that advocates self\\-governed societies based on voluntary, coope.{285}lds capitalism, the state, and representative democracy to be undesirable, unnecessary, and harmful\\.$', flags=48)), 9: 
GenericDoc('12#9', re.compile('^The French Pierre\\-Joseph Proudhon is regarded as the first self\\-proclaimed anarchist, a label he ado.{1030}ractices inspired subsequent anarchists and made him one of the leading social thinkers of his time\\.$', flags=48)), 32907099: GenericDoc('59828278#1', 'Casalegno died on 23 January 2019 at the age of 93.'), }) self._test_docs('mr-tydi/fi', count=1908757, items={ 0: GenericDoc('1#0', re.compile('^Amsterdam on Alankomaiden pääkaupunki\\. Amsterdam on väkiluvultaan Alankomaiden suurin kaupunki, huht.{399} pääkaupunki, sijaitsevat niin kuningashuone, hallitus, parlamentti kuin korkein oikeuskin Haagissa\\.$', flags=48)), 9: GenericDoc('1#9', re.compile('^Amsterdamia johtaa muiden Alankomaiden kuntien tapaan valtuusto\\. Amsterdamin kaupunginvaltuustoon va.{281}ginvaltuuston että raatimieskollegion puheenjohtaja, mutta hänellä ei ole äänioikeutta valtuustossa\\.$', flags=48)), 1908756: GenericDoc('1494441#5', 'Vuoden valmentajaksi valittiin Päivi Alafrantin valmentaja Eino Maksimainen.'), }) self._test_docs('mr-tydi/id', count=1469399, items={ 0: GenericDoc('1#0', re.compile('^Asam deoksiribonukleat, lebih dikenal dengan singkatan DNA \\(bahasa Inggris: deoxyribo ^^ 243: GenericDoc(doc_id='243', text="John Maynard Keynes, 1st Baron Keynes, CB, FBA (/ˈkeɪnz/ KAYNZ; 5 June 1883 – 21 April 1946), was a British economist whose ideas fundamentally changed the theory and practice of modern macroeconomics and the economic policies of governments."), # Special characters ----------------------------------------------------------> ^^ 1004772: GenericDoc(doc_id='1004772', text='Jordan B Peterson added, Jason Belich 🇺🇸 @JasonBelich. Replying to @JasonBelich @jordanbpeterson. 
and it is /trivial/ for anybody with the authority to deploy code to slip a bit of code to enforce a grey list of sorts.'), # The above would be broken if 4-character utf8 codes were not handled -------> ^^ 1032614: GenericDoc(doc_id='1032614', text='The CLP Group (Chinese: 中電集團) and its holding company, CLP Holdings Ltd (SEHK: 0002) (Chinese: 中電控股有限公司), is a Hong Kong electric company that has businesses in a number of Asian markets and Australia. It is one of the two main electric power generation companies in Hong Kong. The other is Hongkong Electric Company. Incorporated in 1901 as China Light & Power Company Syndicate, its core business remains the generation, transmission, and retailing of electricity.'), # The above would be broken if using codecs.getreader -----------------------------------------------------------------------------------------> ^ 1038932: GenericDoc(doc_id='1038932', text='Insulin-naïve with type 1 diabetes: Initially ⅓–½ of total daily insulin dose. Give remainder of the total dose as short-acting insulin divided between each daily meal. Insulin-naïve with type 2 diabetes: Initially 0.2 Units/kg once daily. May need to adjust dose of other co-administered antidiabetic drugs.'), 8841822: GenericDoc(doc_id='8841822', text='View full size image. Behind the scenes of the dazzling light shows that spectators ooh and ahh at on the Fourth of July, are carefully crafted fireworks. Whether red, white and blue fountains or purple sparklers, each firework is packed with just the right mix of chemicals to create these colorful lights. 
Inside each handmade firework are small packets filled with special chemicals, mainly metal salts and metal oxides, which react to produce an array of colors.')
        })

    def test_msmarco_passage_queries(self):
        # Spot-checks each msmarco-passage / msmarco-document query subset:
        # asserts the exact query count and a few sampled queries (typically the
        # first, tenth, and last) against hard-coded expected values.
        self._test_queries('msmarco-passage/train', count=808731, items={
            0: GenericQuery(query_id='121352', text='define extreme'),
            9: GenericQuery(query_id='492875', text='sanitizer temperature'),
            808730: GenericQuery(query_id='50393', text='benefits of boiling lemons and drinking juice.')
        })
        self._test_queries('msmarco-passage/train/judged', count=502939, items={
            0: GenericQuery(query_id='121352', text='define extreme'),
            9: GenericQuery(query_id='54528', text='blood clots in urine after menopause'),
            502938: GenericQuery(query_id='50393', text='benefits of boiling lemons and drinking juice.')
        })
        self._test_queries('msmarco-passage/train/split200-train', count=808531, items={
            0: GenericQuery(query_id='121352', text='define extreme'),
            9: GenericQuery(query_id='492875', text='sanitizer temperature'),
            808530: GenericQuery(query_id='50393', text='benefits of boiling lemons and drinking juice.')
        })
        self._test_queries('msmarco-passage/train/split200-valid', count=200, items={
            0: GenericQuery(query_id='93927', text='coastal processes are located on what vertebrae'),
            9: GenericQuery(query_id='503706', text='steroid prednisone possible risks'),
            199: GenericQuery(query_id='44209', text='average spousal ss benefit')
        })
        self._test_queries('msmarco-passage/train/medical', count=78895, items={
            0: GenericQuery(query_id='54528', text='blood clots in urine after menopause'),
            9: GenericQuery(query_id='445408', text='marijuana for weight loss'),
            78894: GenericQuery(query_id='945443', text='when do you start going to the doctor every other week during pregnancy')
        })
        self._test_queries('msmarco-passage/dev', count=101093, items={
            0: GenericQuery(query_id='1048578', text='cost of endless pools/swim spa'),
            9: GenericQuery(query_id='1048587', text='what is patron'),
            101092: GenericQuery(query_id='524285', text='treadmill incline meaning')
        })
        self._test_queries('msmarco-passage/dev/small', count=6980, items={
            0: GenericQuery('1048585', "what is paula deen's brother"),
            9: GenericQuery('524699', 'tricare service number'),
            6979: GenericQuery('1048565', 'who plays sebastian michaelis'),
        })
        self._test_queries('msmarco-passage/dev/2', count=4281, items={
            0: GenericQuery('1048579', 'what is pcnt'),
            9: GenericQuery('1048779', 'what is ott media'),
            4280: GenericQuery('1092262', ';liter chemistry definition'),
        })
        self._test_queries('msmarco-passage/dev/judged', count=55578, items={
            0: GenericQuery(query_id='1048578', text='cost of endless pools/swim spa'),
            9: GenericQuery(query_id='1048601', text='what is pastoral medicine'),
            55577: GenericQuery(query_id='1048570', text='what is pearls before swine?')
        })
        self._test_queries('msmarco-passage/eval', count=101092, items={
            0: GenericQuery(query_id='786436', text='what is prescribed to treat thyroid storm'),
            9: GenericQuery(query_id='1048619', text='who plays stitch'),
            101091: GenericQuery(query_id='786430', text='what is prescribed for pelvic inflammatory disease?')
        })
        self._test_queries('msmarco-passage/eval/small', count=6837, items={
            # NOTE: the leading space in the first query text is part of the
            # expected data, not a typo.
            0: GenericQuery('57', ' term service agreement definition'),
            9: GenericQuery('262636', 'how long is a moment'),
            6836: GenericQuery('567976', 'what are the causes of unemployment'),
        })
        self._test_queries('msmarco-passage/trec-dl-2019', count=200, items={
            0: GenericQuery(query_id='1108939', text='what slows down the flow of blood'),
            9: GenericQuery(query_id='885490', text='what party is paul ryan in'),
            199: GenericQuery(query_id='532603', text='university of dubuque enrollment')
        })
        self._test_queries('msmarco-passage/trec-dl-2019/judged', count=43, items={
            0: GenericQuery(query_id='156493', text='do goldfish grow'),
            9: GenericQuery(query_id='1037798', text='who is robert gray'),
            42: GenericQuery(query_id='146187', text='difference between a mcdouble and a double cheeseburger')
        })
        self._test_queries('msmarco-passage/trec-dl-2020', count=200, items={
            0: GenericQuery(query_id='1030303', text='who is aziz hashim'),
            9: GenericQuery(query_id='1071750', text='why is pete rose banned from hall of fame'),
            199: GenericQuery(query_id='132622', text='definition of attempted arson')
        })
        self._test_queries('msmarco-document/trec-dl-2020', count=200, items={
            0: GenericQuery('1030303', 'who is aziz hashim'),
            9: GenericQuery('1071750', 'why is pete rose banned from hall of fame'),
            199: GenericQuery('132622', 'definition of attempted arson'),
        })
        self._test_queries('msmarco-document/trec-dl-2020/judged', count=45, items={
            0: GenericQuery('1030303', 'who is aziz hashim'),
            9: GenericQuery('1105792', 'define: geon'),
            44: GenericQuery('997622', 'where is the show shameless filmed'),
        })
        self._test_queries('msmarco-passage/trec-dl-hard', count=50, items={
            0: GenericQuery('1108939', 'what slows down the flow of blood'),
            9: GenericQuery('451602', "medicare's definition of mechanical ventilation"),
            49: GenericQuery('88495', 'causes of stroke?'),
        })
        self._test_queries('msmarco-passage/trec-dl-hard/fold1', count=10, items={
            0: GenericQuery('966413', 'where are the benefits of cinnamon as a supplement?'),
            9: GenericQuery('883915', 'what other brain proteins can cause dementia'),
        })
        self._test_queries('msmarco-passage/trec-dl-hard/fold2', count=10, items={
            0: GenericQuery('588587', 'what causes heavy metal toxins in your body'),
            9: GenericQuery('794429', 'what is sculpture shape space'),
        })
        self._test_queries('msmarco-passage/trec-dl-hard/fold3', count=10, items={
            0: GenericQuery('1108939', 'what slows down the flow of blood'),
            9: GenericQuery('86606', 'causes of gas in large intestine'),
        })
        self._test_queries('msmarco-passage/trec-dl-hard/fold4', count=10, items={
            0: GenericQuery('1108100', 'what type of movement do bacteria exhibit?'),
            9: GenericQuery('88495', 'causes of stroke?'),
        })
        self._test_queries('msmarco-passage/trec-dl-hard/fold5', count=10,
items={
            0: GenericQuery('190044', 'foods to detox liver naturally'),
            9: GenericQuery('877809', 'what metal are hip replacements made of'),
        })

    def test_msmarco_passage_qrels(self):
        # Spot-checks each msmarco-passage / msmarco-document qrels subset:
        # asserts the exact judgment count and a few sampled TrecQrel records.
        # Sparse (train/dev) qrels use iteration '0'; TREC-judged pools use 'Q0'.
        self._test_qrels('msmarco-passage/train', count=532761, items={
            0: TrecQrel(query_id='1185869', doc_id='0', relevance=1, iteration='0'),
            9: TrecQrel(query_id='186154', doc_id='1160', relevance=1, iteration='0'),
            532760: TrecQrel(query_id='405466', doc_id='8841735', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-passage/train/judged', count=532761, items={
            0: TrecQrel(query_id='1185869', doc_id='0', relevance=1, iteration='0'),
            9: TrecQrel(query_id='186154', doc_id='1160', relevance=1, iteration='0'),
            532760: TrecQrel(query_id='405466', doc_id='8841735', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-passage/train/medical', count=54627, items={
            0: TrecQrel(query_id='403613', doc_id='60', relevance=1, iteration='0'),
            9: TrecQrel(query_id='685235', doc_id='12191', relevance=1, iteration='0'),
            54626: TrecQrel(query_id='496447', doc_id='8839368', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-passage/dev', count=59273, items={
            0: TrecQrel(query_id='1102432', doc_id='2026790', relevance=1, iteration='0'),
            9: TrecQrel(query_id='300674', doc_id='7067032', relevance=1, iteration='0'),
            59272: TrecQrel(query_id='371455', doc_id='8009476', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-passage/dev/small', count=7437, items={
            0: TrecQrel('300674', '7067032', 1, '0'),
            9: TrecQrel('54544', '7068203', 1, '0'),
            7436: TrecQrel('195199', '8009377', 1, '0'),
        })
        self._test_qrels('msmarco-passage/dev/2', count=4655, items={
            0: TrecQrel('1090266', '7068220', 1, '0'),
            9: TrecQrel('30178', '7071029', 1, '0'),
            4654: TrecQrel('1090285', '8009191', 1, '0'),
        })
        self._test_qrels('msmarco-passage/dev/judged', count=59273, items={
            0: TrecQrel(query_id='1102432', doc_id='2026790', relevance=1, iteration='0'),
            9: TrecQrel(query_id='300674', doc_id='7067032', relevance=1, iteration='0'),
            59272: TrecQrel(query_id='371455', doc_id='8009476', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-passage/trec-dl-2019', count=9260, items={
            0: TrecQrel(query_id='19335', doc_id='1017759', relevance=0, iteration='Q0'),
            9: TrecQrel(query_id='19335', doc_id='1274615', relevance=0, iteration='Q0'),
            9259: TrecQrel(query_id='1133167', doc_id='977421', relevance=0, iteration='Q0')
        })
        self._test_qrels('msmarco-passage/trec-dl-2019/judged', count=9260, items={
            0: TrecQrel(query_id='19335', doc_id='1017759', relevance=0, iteration='Q0'),
            9: TrecQrel(query_id='19335', doc_id='1274615', relevance=0, iteration='Q0'),
            9259: TrecQrel(query_id='1133167', doc_id='977421', relevance=0, iteration='Q0')
        })
        self._test_qrels('msmarco-passage/train/split200-train', count=532630, items={
            0: TrecQrel(query_id='1185869', doc_id='0', relevance=1, iteration='0'),
            9: TrecQrel(query_id='186154', doc_id='1160', relevance=1, iteration='0'),
            532629: TrecQrel(query_id='405466', doc_id='8841735', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-passage/train/split200-valid', count=131, items={
            0: TrecQrel(query_id='318166', doc_id='179254', relevance=1, iteration='0'),
            9: TrecQrel(query_id='1158250', doc_id='791721', relevance=1, iteration='0'),
            130: TrecQrel(query_id='302427', doc_id='512871', relevance=1, iteration='0')
        })
        self._test_qrels('msmarco-document/trec-dl-2020', count=9098, items={
            0: TrecQrel('42255', 'D1006124', 0, '0'),
            9: TrecQrel('42255', 'D1168483', 0, '0'),
            9097: TrecQrel('1136962', 'D96742', 0, '0'),
        })
        self._test_qrels('msmarco-document/trec-dl-2020/judged', count=9098, items={
            0: TrecQrel('42255', 'D1006124', 0, '0'),
            9: TrecQrel('42255', 'D1168483', 0, '0'),
            9097: TrecQrel('1136962', 'D96742', 0, '0'),
        })
        self._test_qrels('msmarco-passage/trec-dl-hard', count=4256, items={
            0: TrecQrel('915593', '1396701', 0, 'Q0'),
            9: TrecQrel('915593', '1772932', 0, 'Q0'),
            4255: TrecQrel('1056416', '8739207', 0, 'Q0'),
        })
self._test_qrels('msmarco-passage/trec-dl-hard/fold1', count=1072, items={
            0: TrecQrel('915593', '1396701', 0, 'Q0'),
            9: TrecQrel('915593', '1772932', 0, 'Q0'),
            1071: TrecQrel('174463', '8770954', 1, '0'),
        })
        self._test_qrels('msmarco-passage/trec-dl-hard/fold2', count=898, items={
            0: TrecQrel('794429', '8663241', 3, 'Q0'),
            9: TrecQrel('588587', '8548223', 1, 'Q0'),
            897: TrecQrel('19335', '901329', 0, 'Q0'),
        })
        self._test_qrels('msmarco-passage/trec-dl-hard/fold3', count=444, items={
            0: TrecQrel('177604', '8451987', 0, 'Q0'),
            9: TrecQrel('177604', '8451996', 2, 'Q0'),
            443: TrecQrel('1105792', '996676', 0, '0'),
        })
        self._test_qrels('msmarco-passage/trec-dl-hard/fold4', count=716, items={
            0: TrecQrel('801118', '8708701', 3, 'Q0'),
            9: TrecQrel('507445', '8407104', 1, 'Q0'),
            715: TrecQrel('1056416', '8739207', 0, 'Q0'),
        })
        self._test_qrels('msmarco-passage/trec-dl-hard/fold5', count=1126, items={
            0: TrecQrel('190044', '1353072', 3, 'Q0'),
            9: TrecQrel('190044', '886798', 1, 'Q0'),
            1125: TrecQrel('1103153', '8226445', 0, 'Q0'),
        })

    def test_msmarco_passage_docpairs(self):
        # Spot-checks the training triples (query, positive doc, negative doc)
        # for each msmarco-passage docpairs subset: exact count plus sampled rows.
        self._test_docpairs('msmarco-passage/train', count=269919004, items={
            0: GenericDocPair(query_id='662731', doc_id_a='193249', doc_id_b='2975302'),
            9: GenericDocPair(query_id='411362', doc_id_a='31018', doc_id_b='4238671'),
            269919003: GenericDocPair(query_id='88228', doc_id_a='5117891', doc_id_b='7075853')
        })
        self._test_docpairs('msmarco-passage/train/judged', count=269919004, items={
            0: GenericDocPair(query_id='662731', doc_id_a='193249', doc_id_b='2975302'),
            9: GenericDocPair(query_id='411362', doc_id_a='31018', doc_id_b='4238671'),
            269919003: GenericDocPair(query_id='88228', doc_id_a='5117891', doc_id_b='7075853')
        })
        self._test_docpairs('msmarco-passage/train/triples-v2', count=397768673, items={
            0: GenericDocPair('1000094', '5399011', '4239068'),
            9: GenericDocPair('1000094', '5399011', '6686526'),
            397768672: GenericDocPair('999511', '1108465', '2605718'),
        })
        self._test_docpairs('msmarco-passage/train/triples-small', count=39780811, items={
            0: GenericDocPair('400296', '1540783', '3518497'),
            9: GenericDocPair('189845', '1051356', '4238671'),
            39780810: GenericDocPair('749547', '394235', '7655192'),
        })

    def test_msmarco_passage_scoreddocs(self):
        # Spot-checks the scored-docs (run) files for each msmarco-passage
        # subset: exact count plus sampled (query_id, doc_id, score) rows.
        self._test_scoreddocs('msmarco-passage/train', count=478002393, items={
            0: GenericScoredDoc(query_id='965162', doc_id='1000930', score=0.0),
            9: GenericScoredDoc(query_id='817636', doc_id='1000930', score=0.0),
            478002392: GenericScoredDoc(query_id='824165', doc_id='999540', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/train/judged', count=478002393, items={
            0: GenericScoredDoc(query_id='965162', doc_id='1000930', score=0.0),
            9: GenericScoredDoc(query_id='817636', doc_id='1000930', score=0.0),
            478002392: GenericScoredDoc(query_id='824165', doc_id='999540', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/train/medical', count=48852277, items={
            0: GenericScoredDoc(query_id='15613', doc_id='1000930', score=0.0),
            9: GenericScoredDoc(query_id='85825', doc_id='1008454', score=0.0),
            48852276: GenericScoredDoc(query_id='466728', doc_id='993343', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/train/split200-train', count=477883382, items={
            0: GenericScoredDoc(query_id='965162', doc_id='1000930', score=0.0),
            9: GenericScoredDoc(query_id='817636', doc_id='1000930', score=0.0),
            477883381: GenericScoredDoc(query_id='824165', doc_id='999540', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/train/split200-valid', count=119011, items={
            0: GenericScoredDoc(query_id='867810', doc_id='1158056', score=0.0),
            9: GenericScoredDoc(query_id='540814', doc_id='1609172', score=0.0),
            119010: GenericScoredDoc(query_id='908661', doc_id='8839164', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/dev/small', count=6668967, items={
            0: GenericScoredDoc(query_id='188714', doc_id='1000052', score=0.0),
            9: GenericScoredDoc(query_id='345453', doc_id='1000327', score=0.0),
            6668966:
GenericScoredDoc(query_id='36473', doc_id='999956', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/eval/small', count=6515736, items={
            0: GenericScoredDoc(query_id='992904', doc_id='1000038', score=0.0),
            9: GenericScoredDoc(query_id='1114402', doc_id='1000236', score=0.0),
            6515735: GenericScoredDoc(query_id='30677', doc_id='999956', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/trec-dl-2019', count=189877, items={
            0: GenericScoredDoc(query_id='494835', doc_id='7130104', score=0.0),
            9: GenericScoredDoc(query_id='1014126', doc_id='8001869', score=0.0),
            189876: GenericScoredDoc(query_id='1124145', doc_id='7998901', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/trec-dl-2019/judged', count=41042, items={
            0: GenericScoredDoc(query_id='131843', doc_id='7130104', score=0.0),
            9: GenericScoredDoc(query_id='1117099', doc_id='7135553', score=0.0),
            41041: GenericScoredDoc(query_id='1115776', doc_id='7997171', score=0.0)
        })
        self._test_scoreddocs('msmarco-passage/trec-dl-2020', count=190699, items={
            0: GenericScoredDoc(query_id='1104501', doc_id='5138533', score=0.0),
            9: GenericScoredDoc(query_id='1129081', doc_id='5140109', score=0.0),
            190698: GenericScoredDoc(query_id='197312', doc_id='8001747', score=0.0)
        })


if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/integration/msmarco_passage_v2.py
================================================
import re
import unittest
import ir_datasets
from ir_datasets.datasets.msmarco_passage_v2 import MsMarcoV2Passage
from ir_datasets.formats import GenericQuery, TrecQrel, GenericScoredDoc
from .base import DatasetIntegrationTest


class TestMsMarcoPassageV2(DatasetIntegrationTest):
    def test_docs(self):
        # Spot-checks the MS MARCO v2 passage corpus (and its de-duplicated
        # variant): exact doc count plus sampled MsMarcoV2Passage records.
        # Long passage texts are matched via re.compile patterns (flags=48,
        # i.e. DOTALL|VERBOSE as emitted by the test-data generator).
        self._test_docs('msmarco-passage-v2', count=138364198, items={
            0: MsMarcoV2Passage('msmarco_passage_00_0', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews.', ((0, 75),), 'msmarco_doc_00_0'),
            9: MsMarcoV2Passage('msmarco_passage_00_3346', re.compile('^Engineers and designers work tirelessly to provide better and better numbers with each progressive m.{69} or muscle car enthusiast can determine the 0\\-60 times of their cars and make moves to improve them\\.$', flags=48), ((1653, 1789), (1790, 1922)), 'msmarco_doc_00_0'),
            138364197: MsMarcoV2Passage('msmarco_passage_69_159748475', re.compile('^When it asks "What item would you like to create a shortcut for\\?", paste in the URL you want to use .{100} like to name the shortcut\\?", type the name of the meeting \\(i\\.e\\. "Standup Meeting"\\)\\. Click "Finish"\\.$', flags=48), ((1794, 1951), (1952, 1965), (1966, 2078), (2079, 2094)), 'msmarco_doc_59_1043776256'),
        })
        self._test_docs('msmarco-passage-v2/dedup', count=119582876, items={
            0: MsMarcoV2Passage('msmarco_passage_00_0', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews.', ((0, 75),), 'msmarco_doc_00_0'),
            9: MsMarcoV2Passage('msmarco_passage_00_3346', re.compile('^Engineers and designers work tirelessly to provide better and better numbers with each progressive m.{69} or muscle car enthusiast can determine the 0\\-60 times of their cars and make moves to improve them\\.$', flags=48), ((1653, 1789), (1790, 1922)), 'msmarco_doc_00_0'),
            119582875: MsMarcoV2Passage('msmarco_passage_69_159748475', re.compile('^When it asks "What item would you like to create a shortcut for\\?", paste in the URL you want to use .{100} like to name the shortcut\\?", type the name of the meeting \\(i\\.e\\. "Standup Meeting"\\)\\. Click "Finish"\\.$', flags=48), ((1794, 1951), (1952, 1965), (1966, 2078), (2079, 2094)), 'msmarco_doc_59_1043776256'),
        })
        # the following doc_id is a duplicate and shouldn't be returned in the dedup version
        self.assertRaises(KeyError, lambda: ir_datasets.load('msmarco-passage-v2/dedup').docs.lookup('msmarco_passage_00_9218'))
        self.assertEqual({}, ir_datasets.load('msmarco-passage-v2/dedup').docs.lookup(['msmarco_passage_00_9218']))
        # ...but the same doc_id is present in the full (non-dedup) corpus.
        self.assertNotEqual(None, ir_datasets.load('msmarco-passage-v2').docs.lookup('msmarco_passage_00_9218'))
        self.assertEqual(1, len(ir_datasets.load('msmarco-passage-v2').docs.lookup(['msmarco_passage_00_9218'])))

    def test_queries(self):
        # Spot-checks each msmarco-passage-v2 query subset: exact count plus
        # sampled queries (first, tenth, last).
        self._test_queries('msmarco-passage-v2/train', count=277144, items={
            0: GenericQuery('121352', 'define extreme'),
            9: GenericQuery('80926', 'can you use wallapop on your computer'),
            277143: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.'),
        })
        self._test_queries('msmarco-passage-v2/dev1', count=3903, items={
            0: GenericQuery('2', ' Androgen receptor define'),
            9: GenericQuery('1049200', 'who recorded loving you'),
            3902: GenericQuery('1048565', 'who plays sebastian michaelis'),
        })
        self._test_queries('msmarco-passage-v2/dev2', count=4281, items={
            0: GenericQuery('1048579', 'what is pcnt'),
            9: GenericQuery('1048779', 'what is ott media'),
            4280: GenericQuery('1092262', ';liter chemistry definition'),
        })
        self._test_queries('msmarco-passage-v2/trec-dl-2021', count=477, items={
            0: GenericQuery('787021', 'what is produced by muscle'),
            9: GenericQuery('1052368', 'who stabbed dr. martin luther king'),
            476: GenericQuery('855410', 'what is theraderm used for'),
        })
        self._test_queries('msmarco-passage-v2/trec-dl-2021/judged', count=53, items={
            0: GenericQuery('2082', 'At about what age do adults normally begin to lose bone mass?'),
            9: GenericQuery('1107704', 'what was the main benefit of a single european currency?'),
            52: GenericQuery('1040198', 'who is the final arbiter of florida law in instances where there is no federal authority?'),
        })
        self._test_queries('msmarco-passage-v2/trec-dl-2022', count=500, items={
            0: GenericQuery('588', '1099 b cost basis i sell specific shares'),
            9: GenericQuery('77640', "can you get a master's degree in tefl"),
            499: GenericQuery('2056473', 'is a dairy farm considered as an agriculture'),
        })
        self._test_queries('msmarco-passage-v2/trec-dl-2022/judged', count=76, items={
            0: GenericQuery('2000511', 'average bahamas temperature at the end of october'),
            9: GenericQuery('2003157', 'how to cook frozen ham steak on nuwave oven'),
            75: GenericQuery('2056323', 'how does magic leap optics work'),
        })
        self._test_queries('msmarco-passage-v2/trec-dl-2023', count=700, items={
            0: GenericQuery('2000138', 'How does the process of digestion and metabolism of carbohydrates start'),
            9: GenericQuery('2001686', 'good food and bad food for high cholesterol'),
            699: GenericQuery('3100949', 'How do birth control and hormone levels affect menstrual cycle variations?'),
        })
        self._test_queries('msmarco-passage-v2/trec-dl-2023/judged', count=82, items={
            0: GenericQuery('2001010', 'cost comparison of funerals in australia'),
            9: GenericQuery('2003787', 'how to make linkedin private'),
            81: GenericQuery('3100922', 'What is the meaning and origin of the name Corrin?'),
        })

    def test_qrels(self):
        # Spot-checks each msmarco-passage-v2 qrels subset: exact judgment
        # count plus sampled TrecQrel records.
        self._test_qrels('msmarco-passage-v2/train', count=284212, items={
            0: TrecQrel('1185869', 'msmarco_passage_08_840101254', 1, '0'),
            9: TrecQrel('186154', 'msmarco_passage_02_556351008', 1, '0'),
            284211: TrecQrel('697642', 'msmarco_passage_05_512118117', 1, '0'),
})
        self._test_qrels('msmarco-passage-v2/dev1', count=4009, items={
            0: TrecQrel('763878', 'msmarco_passage_33_459057644', 1, '0'),
            9: TrecQrel('290779', 'msmarco_passage_10_301562908', 1, '0'),
            4008: TrecQrel('1091692', 'msmarco_passage_23_330102695', 1, '0'),
        })
        self._test_qrels('msmarco-passage-v2/dev2', count=4411, items={
            0: TrecQrel('419507', 'msmarco_passage_04_254301507', 1, '0'),
            9: TrecQrel('1087630', 'msmarco_passage_18_685926585', 1, '0'),
            4410: TrecQrel('961297', 'msmarco_passage_18_858458289', 1, '0'),
        })
        self._test_qrels('msmarco-passage-v2/trec-dl-2021', count=10828, items={
            0: TrecQrel('2082', 'msmarco_passage_01_552803451', 0, '0'),
            9: TrecQrel('2082', 'msmarco_passage_02_437070914', 3, '0'),
            10827: TrecQrel('1129560', 'msmarco_passage_68_639912287', 0, '0'),
        })
        self._test_qrels('msmarco-passage-v2/trec-dl-2021/judged', count=10828, items={
            0: TrecQrel('2082', 'msmarco_passage_01_552803451', 0, '0'),
            9: TrecQrel('2082', 'msmarco_passage_02_437070914', 3, '0'),
            10827: TrecQrel('1129560', 'msmarco_passage_68_639912287', 0, '0'),
        })
        self._test_qrels('msmarco-passage-v2/trec-dl-2022', count=386416, items={
            0: TrecQrel('2000511', 'msmarco_passage_00_491550793', 0, '0'),
            9: TrecQrel('2000511', 'msmarco_passage_00_491585086', 0, '0'),
            386415: TrecQrel('2056323', 'msmarco_passage_68_715747739', 1, '0'),
        })
        self._test_qrels('msmarco-passage-v2/trec-dl-2022/judged', count=386416, items={
            0: TrecQrel('2000511', 'msmarco_passage_00_491550793', 0, '0'),
            9: TrecQrel('2000511', 'msmarco_passage_00_491585086', 0, '0'),
            386415: TrecQrel('2056323', 'msmarco_passage_68_715747739', 1, '0'),
        })
        self._test_qrels('msmarco-passage-v2/trec-dl-2023', count=22327, items={
            0: TrecQrel('2001010', 'msmarco_passage_00_257661787', 0, '0'),
            9: TrecQrel('2001010', 'msmarco_passage_01_221183941', 0, '0'),
            22326: TrecQrel('3100922', 'msmarco_passage_68_194985280', 0, '0'),
        })
        self._test_qrels('msmarco-passage-v2/trec-dl-2023/judged', count=22327, items={
            0: TrecQrel('2001010', 'msmarco_passage_00_257661787', 0, '0'),
            9: TrecQrel('2001010', 'msmarco_passage_01_221183941', 0, '0'),
            22326: TrecQrel('3100922', 'msmarco_passage_68_194985280', 0, '0'),
        })

    def test_scoreddocs(self):
        # Spot-checks each msmarco-passage-v2 scored-docs (run) subset: exact
        # count plus sampled (query_id, doc_id, score) rows.
        self._test_scoreddocs('msmarco-passage-v2/train', count=27713673, items={
            0: GenericScoredDoc('5', 'msmarco_passage_49_25899182', 12.1278),
            9: GenericScoredDoc('5', 'msmarco_passage_53_503988399', 11.2986),
            27713672: GenericScoredDoc('1185869', 'msmarco_passage_41_540702769', 9.739399),
        })
        self._test_scoreddocs('msmarco-passage-v2/dev1', count=390300, items={
            0: GenericScoredDoc('2', 'msmarco_passage_30_389397788', 14.5301),
            9: GenericScoredDoc('2', 'msmarco_passage_05_830539414', 13.1325),
            390299: GenericScoredDoc('1102390', 'msmarco_passage_29_705182521', 9.148),
        })
        self._test_scoreddocs('msmarco-passage-v2/dev2', count=428100, items={
            0: GenericScoredDoc('1325', 'msmarco_passage_35_295199374', 20.979799),
            9: GenericScoredDoc('1325', 'msmarco_passage_68_757687820', 18.208799),
            428099: GenericScoredDoc('1102413', 'msmarco_passage_07_48510484', 10.8093),
        })
        self._test_scoreddocs('msmarco-passage-v2/trec-dl-2021', count=47700, items={
            0: GenericScoredDoc('2082', 'msmarco_passage_45_623131157', 19.8207),
            9: GenericScoredDoc('2082', 'msmarco_passage_30_709623997', 17.350901),
            47699: GenericScoredDoc('1136769', 'msmarco_passage_06_68704200', 14.8941),
        })
        self._test_scoreddocs('msmarco-passage-v2/trec-dl-2022', count=50000, items={
            0: GenericScoredDoc('588', 'msmarco_passage_30_337959223', 18.762699),
            9: GenericScoredDoc('588', 'msmarco_passage_10_355039123', 17.7628),
            49999: GenericScoredDoc('2056473', 'msmarco_passage_17_225374709', 12.147499),
        })
        self._test_scoreddocs('msmarco-passage-v2/trec-dl-2022/judged', count=7600, items={
            0: GenericScoredDoc('2000511', 'msmarco_passage_05_149863652', 17.3363),
            9: GenericScoredDoc('2000511', 'msmarco_passage_10_615816159', 13.9977),
            7599: GenericScoredDoc('2056323', 'msmarco_passage_42_417977141', 9.592),
        })
self._test_scoreddocs('msmarco-passage-v2/trec-dl-2023', count=70000, items={
            0: GenericScoredDoc('2000138', 'msmarco_passage_04_207262207', 17.8992),
            9: GenericScoredDoc('2000138', 'msmarco_passage_35_358067216', 15.2805),
            69999: GenericScoredDoc('3100949', 'msmarco_passage_30_84437641', 18.801701),
        })
        self._test_scoreddocs('msmarco-passage-v2/trec-dl-2023/judged', count=8200, items={
            0: GenericScoredDoc('2001010', 'msmarco_passage_39_224640845', 11.9941),
            9: GenericScoredDoc('2001010', 'msmarco_passage_60_353412311', 10.992799),
            8199: GenericScoredDoc('3100922', 'msmarco_passage_38_636536351', 10.032599),
        })


if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/integration/msmarco_qna.py
================================================
import re
import unittest
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel, GenericDocPair, GenericScoredDoc
from ir_datasets.datasets.msmarco_qna import MsMarcoQnADoc, MsMarcoQnAQuery, MsMarcoQnAEvalQuery
from .base import DatasetIntegrationTest


class TestMsMarcoQnA(DatasetIntegrationTest):
    def test_docs(self):
        # Spot-checks the msmarco-qna corpus: exact doc count plus sampled
        # MsMarcoQnADoc records (text matched via re.compile, flags=48).
        self._test_docs('msmarco-qna', count=9048606, items={
            0: MsMarcoQnADoc('0-0', re.compile('^The presence of communication amid scientific minds was equally important to the success of the Manh.{125}nd engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated\\.$', flags=48), 'http://www.pitt.edu/~sdb14/atombomb.html', '0', 'D59219'),
            9: MsMarcoQnADoc('9-0', re.compile("^One of the main reasons Hanford was selected as a site for the Manhattan Project's B Reactor was its.{13} the Columbia River, the largest river flowing into the Pacific Ocean from the North American coast\\.$", flags=48), 'https://www.atomicheritage.org/history/environmental-consequences', '9', 'D59228'),
            9048605: MsMarcoQnADoc('120010-0', re.compile('^Considering the cost of tuition at a place like UNT, this logic would read that the total cost – inc.{36}er year at UNT would be around \\$9,000\\. For Indiana, the total cost should be around \\$18,000\\-\\$19,000\\.$', flags=48), 'http://musicschoolcentral.com/real-cost-dollars-getting-college-music-education/', '120010', 'D59214'),
        })

    def test_queries(self):
        # Spot-checks msmarco-qna query subsets; train/dev queries carry a type
        # and answer tuple, eval queries (MsMarcoQnAEvalQuery) have no answers.
        self._test_queries('msmarco-qna/train', count=808731, items={
            0: MsMarcoQnAQuery('1185869', ')what was the immediate impact of the success of the manhattan project?', 'DESCRIPTION', ('The immediate impact of the success of the manhattan project was the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',)),
            9: MsMarcoQnAQuery('410717', 'is funner a word?', 'DESCRIPTION', ('Yes, funner is a word.',)),
            808730: MsMarcoQnAQuery('461916', 'name some organisms that might live in a marine biome', 'ENTITY', ('Walrus, star fish, eel, crabs, jelly fish, and fresh and salt water fish',)),
        })
        self._test_queries('msmarco-qna/dev', count=101093, items={
            0: MsMarcoQnAQuery('1102432', '. what is a corporation?', 'DESCRIPTION', ('A corporation is a company or group of people authorized to act as a single entity and recognized as such in law.',)),
            9: MsMarcoQnAQuery('36558', 'average force of a raindrop', 'NUMERIC', ()),
            101092: MsMarcoQnAQuery('371455', 'how to offer health insurance to employees', 'DESCRIPTION', ('List the elements you want to include in a health insurance package for your employees. Investigate the financial impact of various deductibles, family coverage vs. individual coverage and other factors.',)),
        })
        self._test_queries('msmarco-qna/eval', count=101092, items={
            0: MsMarcoQnAEvalQuery('1136966', '#ffffff color code', 'ENTITY'),
            9: MsMarcoQnAEvalQuery('80665', 'can you use horse trailer for hay', 'DESCRIPTION'),
            101091: MsMarcoQnAEvalQuery('315646', 'how much does it cost to go to college online', 'NUMERIC'),
        })

    def test_qrels(self):
        # Spot-checks msmarco-qna qrels: exact count plus sampled judgments.
        self._test_qrels('msmarco-qna/train', count=8069749, items={
            0: TrecQrel('1185869', '0-0', 1, '0'),
            9: TrecQrel('1185869', '9-0', 0, '0'),
            8069748: TrecQrel('461916', '7066857-0', 0, '0'),
        })
        self._test_qrels('msmarco-qna/dev', count=1008985, items={
            0: TrecQrel('1102432', '7066858-0', 0, '0'),
            9: TrecQrel('1102432', '7066861-0', 0, '0'),
            1008984: TrecQrel('371455', '8009483-0', 0, '0'),
        })

    def test_scoreddocs(self):
        # Spot-checks msmarco-qna scored-docs: exact count plus sampled rows.
        self._test_scoreddocs('msmarco-qna/train', count=8069749, items={
            0: GenericScoredDoc('1185869', '0-0', 0.0),
            9: GenericScoredDoc('1185869', '9-0', -9.0),
            8069748: GenericScoredDoc('461916', '7066857-0', -9.0),
        })
        self._test_scoreddocs('msmarco-qna/dev', count=1008985, items={
            0: GenericScoredDoc('1102432', '7066858-0', 0.0),
            9: GenericScoredDoc('1102432', '7066861-0', -9.0),
            1008984: GenericScoredDoc('371455', '8009483-0', -9.0),
        })
        self._test_scoreddocs('msmarco-qna/eval', count=1008943, items={
            0: GenericScoredDoc('1136966', '7164732-0', 0.0),
            9: GenericScoredDoc('1136966', '8009488-0', -9.0),
            1008942: GenericScoredDoc('315646', '120010-0', -9.0),
        })


if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/integration/nano_beir.py
================================================
import unittest
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
from .base import DatasetIntegrationTest


class TestBeir(DatasetIntegrationTest):
    def test_docs(self):
        self._test_docs("nano-beir/arguana", count=3635, items={ 0: GenericDoc('test-environment-aeghhgwpe-pro01a', "It is immoral to kill animals As evolved human beings it is 
our moral duty to inflict as little pain as possible for our survival. So if we do not need to inflict pain to animals in order to survive, we should not do it. Farm animals such as chickens, pigs, sheep, and cows are sentient living beings like us - they are our evolutionary cousins and like us they can feel pleasure and pain. The 18th century utilitarian philosopher Jeremy Bentham even believed that animal suffering was just as serious as human suffering and likened the idea of human superiority to racism. It is wrong to farm and kill these animals for food when we do not need to do so. The methods of farming and slaughter of these animals are often barbaric and cruel - even on supposedly 'free range' farms. [1] Ten billion animals were slaughtered for human consumption each year, stated PETA. And unlike the farms long time ago, where animals roamed freely, today, most animals are factory farmed: \x97crammed into cages where they can barely move and fed a diet adulterated with pesticides and antibiotics. These animals spend their entire lives in their “prisoner cells” so small that they can't even turn around. Many suffer serious health problems and even death because they are selectively bred to grow or produce milk or eggs at a far greater rate than their bodies are capable of coping with. At the slaughterhouse, there were millions of others who are killed every year for food. Further on Tom Regan explains that all duties regarding animals are indirect duties to one another from a philosophical point of view. He illustrates it with an analogy regarding children: “Children, for example, are unable to sign contracts and lack rights. But they are protected by the moral contract nonetheless because of the sentimental interests of others. So we have, then, duties involving these children, duties regarding them, but no duties to them. 
Our duties in their case are indirect duties to other human beings, usually their parents.” [2] With this he supports the theory that animals must be protected from suffering, as it is moral to protect any living being from suffering, not because we have a moral contract with them, but mainly due to respect of life and recognition of suffering itself. [1] Claire Suddath, A brief history of Veganism, Time, 30 October 2008 [2] Tom Regan, The case for animal rights, 1989"), 9: GenericDoc('test-environment-assgbatj-con03b', 'When a drug is first tested on human volunteers, they are only given a tiny fraction of the amount shown safe to give to primates showing there is another way, to start with very low doses. Animal research isn’t a reliable indicator of how a drug will work in people – even with animal testing, some drugs trials go very wrong [15].'), 3634: GenericDoc('validation-society-fyhwscdcj-con03a', 'Sponsorship is often more about the intentions of the donors rather than the needs of poor children. Some schemes have a clear cultural and religious motive – a desire to give aid in such a way that it will affect and even impose (force) foreign ideas onto a vulnerable (weaker) society. Any organisation that has such a clear overlap between their own ideas of faith [19] and the practical side of helping people is ultimately imposing its ideas onto people without giving them any choice in the matter. Families may even come to think that they have to show belief in order to keep receiving sponsorship. For example, sponsored children may be encouraged to send cards at Christmas, even if they are not Christians. 
At the end of the day this comes down to a very serious question of choice – many would argue that by offering aid with the intention of turning children into adult Christians [20], organisations like “Compassion” are effectively manipulating charity into part of a conversion campaign.'), }) self._test_docs("nano-beir/climate-fever", count=3408, items={ 0: GenericDoc('1993_Storm_of_the_Century', "The 1993 Storm of the Century ( also known as the 93 Super Storm or the Great Blizzard of 1993 ) was a large cyclonic storm that formed over the Gulf of Mexico on March 12 , 1993 . The storm eventually dissipated in the North Atlantic Ocean on March 15 , 1993 . It was unique for its intensity , massive size , and wide-reaching effects . At its height , the storm stretched from Canada to the Gulf of Mexico . The cyclone moved through the Gulf of Mexico and then through the eastern United States before moving onto Canada . Heavy snow was first reported in highland areas as far south as Alabama and northern Georgia , with Union County , Georgia reporting up to 35 inches of snow in the north Georgia mountains . Birmingham , Alabama , reported a rare 13 in of snow . The Florida Panhandle reported up to 4 in , with hurricane-force wind gusts and record low barometric pressures . Between Louisiana and Cuba , the hurricane-force winds produced high storm surges across Northwestern Florida which , in combination with scattered tornadoes , killed dozens of people . Record cold temperatures were seen across portions of the south and east of the US in the wake of this storm . In the United States , the storm was responsible for the loss of electric power to more than 10 million households . An estimated 40 percent of the country 's population experienced the effects of the storm with a total of 208 fatalities ."), 9: GenericDoc('1990', "For the album by Enigma see MCMXC a.D. 
Important events of 1990 include the Reunification of Germany and the unification of Yemen , the formal beginning of the Human Genome Project ( finished in 2003 ) , the launch of the Hubble Space Telescope , the separation of Namibia from South Africa , and the Baltic states declaring independence from the Soviet Union amidst Perestroika . Yugoslavia 's communist regime collapses amidst increasing internal tensions and multiparty elections held within its constituent republics result in separatist governments being elected in most of the republics marking the beginning of the breakup of Yugoslavia . Also in this year began the crisis that would lead to the Gulf War in 1991 following the Iraq invasion and the largely internationally unrecognized annexation of Kuwait resulting in a crisis in the Persian Gulf involving the issue of the sovereignty of Kuwait and fears by Saudi Arabia over Iraqi aggression against their oil fields near Kuwait , this resulted in Operation Desert Shield being enacted with an international coalition of military forces being built up on the Kuwaiti-Saudi border with demands for Iraq to peacefully withdraw from Kuwait . Also in this year , Nelson Mandela was released from prison , and Margaret Thatcher resigned as Prime Minister of the United Kingdom after over 11 years . 1990 was an important year in the Internet 's early history . In the fall of 1990 , Tim Berners-Lee created the first web server and the foundation for the World Wide Web . Test operations began around December 20 and it was released outside of CERN the following year . 1990 also saw the official decommissioning of the ARPANET , a forerunner of the Internet system and the introduction of the first content search engine , Archie on September 10 . September 14 , 1990 saw the first case of successful somatic gene therapy on a patient . 
Due to the early 1990s recession that began that year and uncertainty due to the collapse of the socialist governments in Eastern Europe , birth rates in many countries stopped rising or fell steeply in 1990 . In most western countries the Echo Boom peaked in 1990 ; fertility rates declined thereafter . Encyclopædia Britannica , which ceased printing in 2012 , saw its highest all time sales in 1990 ; 120,000 volumes were sold that year . The number of librarians in the United States also peaked around 1990 ."), 3407: GenericDoc('Climate_change_(general_concept)', 'Climate variability includes all the variations in the climate that last longer than individual weather events, whereas the term climate change only refers to those variations that persist for a longer period of time, typically decades or more. In the time since the industrial revolution the climate has increasingly been affected by human activities that are causing global warming and climate change.The climate system receives nearly all of its energy from the sun. The climate system also radiates energy to outer space. The balance of incoming and outgoing energy, and the passage of the energy through the climate system, determines Earth\\\'s energy budget. When the incoming energy is greater than the outgoing energy, earth\\\'s energy budget is positive and the climate system is warming. If more energy goes out, the energy budget is negative and earth experiences cooling.\\nThe energy moving through Earth\\\'s climate system finds expression in weather, varying on geographic scales and time. Long-term averages and variability of weather in a region constitute the region\\\'s climate. Such changes can be the result of "internal variability", when natural processes inherent to the various parts of the climate system alter the distribution of energy. Examples include variability in ocean basins such as the Pacific decadal oscillation and Atlantic multidecadal oscillation. 
Climate variability can also result from external forcing, when events outside of the climate system\\\'s components nonetheless produce changes within the system. Examples include changes in solar output and volcanism.\\nClimate variability has consequences for sea level changes, plant life, and mass extinctions; it also affects human societies.'), }) self._test_docs("nano-beir/dbpedia-entity", count=6045, items={ 0: GenericDoc('', "The Academy Awards are the oldest awards ceremony for achievements in motion pictures. The Academy Award for Best Production Design recognizes achievement in art direction on a film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy being renamed the Designer's branch."), 9: GenericDoc('', "The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F."), 6044: GenericDoc('', "Frankfurt am Main (German pronunciation: [ˈfʁaŋkfʊɐ̯t am ˈmaɪ̯n] ) is the largest city in the German state of Hesse (Hessia) and the fifth-largest city in Germany, with a 2015 population of 731,095 within its administrative boundaries. The urban area called Frankfurt Rhein-Main has a population of 2,221,910. The city is at the centre of the larger Frankfurt Rhine-Main Metropolitan Region which has a population of 5,500,000 and is Germany's second-largest metropolitan region. 
Since the enlargement of the European Union in 2013, the geographic centre of the EU is about 40 km (25 mi) to the east."), }) self._test_docs("nano-beir/fever", count=4996, items={ 0: GenericDoc('1999_Pulitzer_Prize', 'The Pulitzer Prizes for 1999 were announced on April 12 , 1999 .'), 9: GenericDoc('1951_NBA_Playoffs', "The 1951 NBA Playoffs was the postseason tournament of the National Basketball Association 1950 -- 51 season . The tournament concluded with the Western Division champion Rochester Royals defeating the Eastern Division champion New York Knicks 4 games to 3 in the NBA Finals . The eight qualified teams began tournament play on Tuesday and Wednesday , March 20 and 21 , and the Finals concluded on Saturday , April 21 . Rochester and New York played 14 games in a span of 33 days ; their seven final games in fifteen days . The Rochester Royals ( now the Sacramento Kings ) were `` royalty '' in their first nine seasons , from 1945 -- 46 to 1954 -- 54 always one of the strong teams in their league . Rochester had played three seasons in the National Basketball League , winning the 1946 NBL championship and losing the Finals in 1947 and 1948 . In one BAA and one NBA season , the team had won 75 % of its games before losing in the second round , then first round , of the 1949 and 1950 playoffs . The 1950 -- 51 team won more than 60 % of its games , as the Royals would do for three more seasons , and participated in the club 's only NBA Finals . That remains true more than 60 years later , covering stints in Rochester , Cincinnati , Kansas City , and Sacramento . The New York Knicks were an original Basketball Association of America franchise , now in its sixth season and participating in the BAA or NBA Finals for the first time . It would be the first three consecutive years as losing finalist . Another six-year-old , original BAA team , the Boston Celtics had qualified only for the 1948 BAA Playoffs . 
Now the second-place Eastern Division team , Boston had earned home-court advantage for a first-round series with third-place New York . It was the first playoff meeting in the Celtics -- Knicks rivalry and it would be the first of 19 consecutive years in the playoffs ."), 4995: GenericDoc('XSM-74', 'The Convair XSM-74 was a sub-sonic , jet-powered , ground-launched decoy cruise missile .'), }) self._test_docs("nano-beir/fiqa", count=4598, items={ 0: GenericDoc('277', "My super fund and I would say many other funds give you one free switch of strategies per year. Some suggest you should change from high growth option to a more balance option once you are say about 10 to 15 years from retirement, and then change to a more capital guaranteed option a few years from retirement. This is a more passive approach and has benefits as well as disadvantages. The benefit is that there is not much work involved, you just change your investment option based on your life stage, 2 to 3 times during your lifetime. This allows you to take more risk when you are young to aim for higher returns, take a balanced approach with moderate risk and returns during the middle part of your working life, and take less risk with lower returns (above inflation) during the latter part of your working life. A possible disadvantage of this strategy is you may be in the higher risk/ higher growth option during a market correction and then change to a more balanced option just when the market starts to pick up again. So your funds will be hit with large losses whilst the market is in retreat and just when things look to be getting better you change to a more balanced portfolio and miss out on the big gains. A second more active approach would be to track the market and change investment option as the market changes. 
One approach which shouldn't take much time is to track the index such as the ASX200 (if you investment option is mainly invested in the Australian stock market) with a 200 day Simple Moving Average (SMA). The concept is that if the index crosses above the 200 day SMA the market is bullish and if it crosses below it is bearish. See the chart below: This strategy will work well when the market is trending up or down but not very well when the market is going sideways, as you will be changing from aggressive to balanced and back too often. Possibly a more appropriate option would be a combination of the two. Use the first passive approach to change investment option from aggressive to balanced to capital guaranteed with your life stages, however use the second active approach to time the change. For example, if you were say in your late 40s now and were looking to change from aggressive to balanced in the near future, you could wait until the ASX200 crosses below the 200 day SMA before making the change. This way you could capture the majority of the uptrend (which could go on for years) before changing from the high growth/aggressive option to the balanced option. If you where after more control over your superannuation assets another option open to you is to start a SMSF, however I would recommend having at least $300K to $400K in assets before starting a SMSF, or else the annual costs would be too high as a percentage of your total super assets."), 9: GenericDoc('1699', '"The TWRR calculation will work even with negative values: TWRR = (1 + 0.10) x (1 + (-0.191) ) x (1 + 0.29) ^ (1/3) = 1.047 which is a 4.7% return. Your second question concerns the -19% return calculated for the second quarter. You seem to think this return is ""way-off"". Not really. The TWRR calculates a return by accounting for cash that was added or deducted to/from the account. 
So if I started with $100,000, added $10,000 to the account, and ended up with $110,000, what should be the return on my investment? My answer would be 0% since the only reason my account balance went up was due to me adding cash to it. Therefore, if I started with $100,000, added $10,000 in cash to the account, and ended up with $100,000 in my account, then my return would be a negative value since I lost the $10,000 that I deposited in the account. In the second quarter you started with $15,000, deposited $4,000, and ended with $15,750. You essentially lost almost all of the $4,000 you deposited. That is a significant loss."'), 4597: GenericDoc('599925', "Investopedia has a good explanation of the term shorting which is what this is. In the simplest of terms, someone is borrowing the bond and selling it with the intent to replace the security and any dividends or coupons in the end. The idea is that if a bond is overvalued, one may be able to buy it back later for a cheaper price and pocket the difference. There are various rules about this including margin requirements to maintain since there is the risk of the security going up in price enough that someone may be forced into a buy to cover in the form of a margin call. If one can sell the bond at $960 now and then buy it back later for $952.38 then one could pocket the difference. Part of what you aren't seeing is what are other bonds doing in terms of their prices over time here. The key point here is that brokers may lend out securities and accrue interest on loaned securities for another point here."), }) self._test_docs("nano-beir/hotpotqa", count=5090, items={ 0: GenericDoc('974', 'Augusta Ada King-Noel, Countess of Lovelace ("née" Byron; 10 December 1815\xa0– 27 November 1852) was an English mathematician and writer, chiefly known for her work on Charles Babbage\'s proposed mechanical general-purpose computer, the Analytical Engine. 
She was the first to recognise that the machine had applications beyond pure calculation, and created the first algorithm intended to be carried out by such a machine. As a result, she is often regarded as the first to recognise the full potential of a "computing machine" and the first computer programmer.'), 9: GenericDoc('11242', 'Final Fantasy: The Spirits Within is a 2001 American computer-animated science fiction film directed by Hironobu Sakaguchi, creator of the "Final Fantasy" series of role-playing video games. It was the first photorealistic computer-animated feature film and remains the most expensive video game-inspired film of all time. It features the voices of Ming-Na Wen, Alec Baldwin, Donald Sutherland, James Woods, Ving Rhames, Peri Gilpin and Steve Buscemi.'), 5089: GenericDoc('55344979', 'The augerino is a legendary creature present in the folk tales of lumberjack and ranching communities in the western United States. Tales of the augerino described it as a subterranean creature which inhabited the drier regions of Colorado. The augerino required a dry environment to survive and would bore holes in dams and irrigation ditches to let the water drain out. Some accounts described the augerino as a type of worm, though tales differ on the exact physical description of the creature. The name appears to derive from the diminutive of the common hand tool, the auger.'), }) self._test_docs("nano-beir/msmarco", count=5043, items={ 0: GenericDoc('2004', 'n (Brit. history) a local government committee composed of magistrates and representatives of the county borough council responsible for the efficiency of the local police force. 
English Collins Dictionary-English Definition & Thesaurus  .'), 9: GenericDoc('25901', 'understood within the framework of intersubjectivity (Trevarthen, C., 2001, Diamond, N., & Marrone, M., (2003), which has a central role in the healthy development of brain systems (Shore, 1994), social functioning, and interpersonal relationships.'), 5042: GenericDoc('8841335', 'On September 9, 1890, Colonel Harland Sanders was born on a farm outside Henryville, Indiana. More than 30 years after his death, the man in the trademark white suit and black string tie who pioneered Kentucky Fried Chicken’s “finger-lickin’ good” secret recipe remains the public face of the fast-food chain.'), }) self._test_docs("nano-beir/nfcorpus", count=2953, items={ 0: GenericDoc('MED-10', 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,619 (60.2%) was due to breast cancer. After adjustment for age, tumor characteristics, and treatment selection, both post-diagnostic and pre-diagnostic statin use were associated with lowered risk of breast cancer death (HR 0.46, 95% CI 0.38–0.55 and HR 0.54, 95% CI 0.44–0.67, respectively). 
The risk decrease by post-diagnostic statin use was likely affected by healthy adherer bias; that is, the greater likelihood of dying cancer patients to discontinue statin use as the association was not clearly dose-dependent and observed already at low-dose/short-term use. The dose- and time-dependence of the survival benefit among pre-diagnostic statin users suggests a possible causal effect that should be evaluated further in a clinical trial testing statins’ effect on survival in breast cancer patients.'), 9: GenericDoc('MED-666', 'Breast pain is a common condition affecting most women at some stage in their reproductive life. Mastalgia is resistant to treatment in 6% of cyclical and 26% non-cyclical patients. Surgery is not widely used to treat this condition and only considered in patients with severe mastalgia resistant to medication. The aims of this study were to audit the efficacy of surgery in severe treatment resistant mastalgia and to assess patient satisfaction following surgery. This is a retrospective review of the medical records of all patients seen in mastalgia clinic in the University Hospital of Wales, Cardiff since 1973. A postal questionnaire was distributed to all patients who had undergone surgery. Results showed that of the 1054 patients seen in mastalgia clinic, 12 (1.2%) had undergone surgery. Surgery included 8 subcutaneous mastectomies with implants (3 bilateral, 5 unilateral), 1 bilateral simple mastectomy and 3 quadrantectomies (1 having a further simple mastectomy). The median duration of symptoms was 6.5 years (range 2-16 years). Five patients (50%) were pain free following surgery, 3 developed capsular contractures and 2 wound infections with dehiscence. Pain persisted in both patients undergoing quadrantectomy. We conclude that surgery for mastalgia should only be considered in a minority of patients. 
Patients should be informed of possible complications inherent of reconstructive surgery and warned that in 50% cases their pain will not be improved.'), 2952: GenericDoc('MED-942', 'Apple cider vinegar products are advertised in the popular press and over the Internet for treatment of a variety of conditions. After an adverse event was reported to the authors, eight apple cider vinegar tablet products were tested for pH, component acid content, and microbial growth. Considerable variability was found between the brands in tablet size, pH, component acid content, and label claims. Doubt remains as to whether apple cider vinegar was in fact an ingredient in the evaluated products. The inconsistency and inaccuracy in labeling, recommended dosages, and unsubstantiated health claims make it easy to question the quality of the products.'), }) self._test_docs("nano-beir/nq", count=5035, items={ 0: GenericDoc('doc806', "The team arrives at an underground antipurge hideout run by Dante Bishop. Barnes discovers that Bishop's group intends to assassinate Owens, in an effort to end the purge. A large group of paramilitary forces arrives at the hideout looking for Bishop. Barnes and Roan escape back to the streets and meet Joe, Marcos, and Laney, who had left the hideout earlier to return to Joe's store."), 9: GenericDoc('doc2836', "Belle befriends the castle's servants, who invite her to a spectacular dinner. When she wanders into the forbidden west wing and finds the rose, the Beast, enraged, scares her into the woods. She is ambushed by a pack of wolves, but the Beast rescues her, and is injured in the process. As Belle nurses his wounds, a friendship develops between them. The Beast shows Belle a gift from the enchantress, a book that transports readers wherever they want. 
Belle uses it to visit her childhood home in Paris, where she discovers a plague doctor mask and realizes that she and her father were forced to leave her mother's deathbed when her mother succumbed to the plague."), 5034: GenericDoc('doc2680676', "The film was primarily shot in County Donegal, Ireland. Some of the orphanage scenes were shot in an abandoned hospital 'touched-up' by the design crew. Over 500 locals/extras were seen for casting, over three days, including many children. 'Smudge' was an animatronic, and its scenes were shot first, due to concerns about bad weather on the beach, which never occurred. In fact, for the 'rain scene' (jumping in puddles), they had to produce it, as it failed to rain during production.[2]"), }) self._test_docs("nano-beir/quora", count=5046, items={ 0: GenericDoc('58', 'What is best way to ask for money online?'), 9: GenericDoc('575', 'Who will win if a war starts between India and Pakistan?'), 5045: GenericDoc('537834', 'How good is the Sony MDR ZX 100 as a headphone?'), }) self._test_docs("nano-beir/scidocs", count=2210, items={ 0: GenericDoc('1b2a0e8af5c1f18e47e71244973ce4ace4ac6034', 'Hierarchical Pitman-Yor Process priors are compelling methods for learning language models, outperforming point-estimate based methods. However, these models remain unpopular due to computational and statistical inference issues, such as memory and time usage, as well as poor mixing of sampler. In this work we propose a novel framework which represents the HPYP model compactly using compressed suffix trees. Then, we develop an efficient approximate inference scheme in this framework that has a much lower memory footprint compared to full HPYP and is fast in the inference time. 
The experimental results illustrate that our model can be built on significantly larger datasets compared to previous HPYP models, while being several orders of magnitudes smaller, fast for training and inference, and outperforming the perplexity of the state-of-the-art Modified Kneser-Ney countbased LM smoothing by up to 15%.'), 9: GenericDoc('922b5eaa5ca03b12d9842b7b84e0e420ccd2feee', 'AN IMPORTANT class of theoretical and practical problems in communication and control is of a statistical nature. Such problems are: (i) Prediction of random signals; (ii) separation of random signals from random noise; (iii) detection of signals of known form (pulses, sinusoids) in the presence of random noise. In his pioneering work, Wiener [1]3 showed that problems (i) and (ii) lead to the so-called Wiener-Hopf integral equation; he also gave a method (spectral factorization) for the solution of this integral equation in the practically important special case of stationary statistics and rational spectra. Many extensions and generalizations followed Wiener’s basic work. Zadeh and Ragazzini solved the finite-memory case [2]. Concurrently and independently of Bode and Shannon [3], they also gave a simplified method [2) of solution. Booton discussed the nonstationary Wiener-Hopf equation [4]. These results are now in standard texts [5-6]. A somewhat different approach along these main lines has been given recently by Darlington [7]. For extensions to sampled signals, see, e.g., Franklin [8], Lees [9]. Another approach based on the eigenfunctions of the WienerHopf equation (which applies also to nonstationary problems whereas the preceding methods in general don’t), has been pioneered by Davis [10] and applied by many others, e.g., Shinbrot [11], Blum [12], Pugachev [13], Solodovnikov [14]. 
In all these works, the objective is to obtain the specification of a linear dynamic system (Wiener filter) which accomplishes the prediction, separation, or detection of a random signal.4 ——— 1 This research was supported in part by the U. S. Air Force Office of Scientific Research under Contract AF 49 (638)-382. 2 7212 Bellona Ave. 3 Numbers in brackets designate References at end of paper. 4 Of course, in general these tasks may be done better by nonlinear filters. At present, however, little or nothing is known about how to obtain (both theoretically and practically) these nonlinear filters. Contributed by the Instruments and Regulators Division and presented at the Instruments and Regulators Conference, March 29– Apri1 2, 1959, of THE AMERICAN SOCIETY OF MECHANICAL ENGINEERS. NOTE: Statements and opinions advanced in papers are to be understood as individual expressions of their authors and not those of the Society. Manuscript received at ASME Headquarters, February 24, 1959. Paper No. 59-IRD—11. A New Approach to Linear Filtering and Prediction Problems'), 2209: GenericDoc('dec997b20ebe2b867f68cc5c123d9cb9eafad6bb', 'Training deep neural networks generally requires massive amounts of data and is very computation intensive. We show here that it may be possible to circumvent the expensive gradient descent procedure and derive the parameters of a neural network directly from properties of the training data. We show that, near convergence, the gradient descent equations for layers close to the input can be linearized and become stochastic equations with noise related to the covariance of data for each class. 
We derive the distribution of solutions to these equations and discover that it is related to a “supervised principal component analysis.” We implement these results on image datasets MNIST, CIFAR10 and CIFAR100 and find that, indeed, pretrained layers using our findings performs comparable or superior to neural networks of the same size and architecture trained with gradient descent. Moreover, our pretrained layers can often be calculated using a fraction of the training data, owing to the quick convergence of the covariance matrix. Thus, our findings indicate that we can cut the training time both by requiring only a fraction of the data used for gradient descent, and by eliminating layers in the costly backpropagation step of the training. Additionally, these findings partially elucidate the inner workings of deep neural networks and allow us to mathematically calculate optimal solutions for some stages of classification problems, thus significantly boosting our ability to solve such problems efficiently.'), }) self._test_docs("nano-beir/scifact", count=2919, items={ 0: GenericDoc('5836', 'Myelodysplastic syndromes (MDS) are age-dependent stem cell malignancies that share biological features of activated adaptive immune response and ineffective hematopoiesis. Here we report that myeloid-derived suppressor cells (MDSC), which are classically linked to immunosuppression, inflammation, and cancer, were markedly expanded in the bone marrow of MDS patients and played a pathogenetic role in the development of ineffective hematopoiesis. These clonally distinct MDSC overproduce hematopoietic suppressive cytokines and function as potent apoptotic effectors targeting autologous hematopoietic progenitors. Using multiple transfected cell models, we found that MDSC expansion is driven by the interaction of the proinflammatory molecule S100A9 with CD33. 
These 2 proteins formed a functional ligand/receptor pair that recruited components to CD33’s immunoreceptor tyrosine-based inhibition motif (ITIM), inducing secretion of the suppressive cytokines IL-10 and TGF-β by immature myeloid cells. S100A9 transgenic mice displayed bone marrow accumulation of MDSC accompanied by development of progressive multilineage cytopenias and cytological dysplasia. Importantly, early forced maturation of MDSC by either all-trans-retinoic acid treatment or active immunoreceptor tyrosine-based activation motif–bearing (ITAM-bearing) adapter protein (DAP12) interruption of CD33 signaling rescued the hematologic phenotype. These findings indicate that primary bone marrow expansion of MDSC driven by the S100A9/CD33 pathway perturbs hematopoiesis and contributes to the development of MDS.'), 9: GenericDoc('104130', 'Bone tissue undergoes constant turnover supported by stem cells. Recent studies showed that perivascular mesenchymal stem cells (MSCs) contribute to the turnover of long bones. Craniofacial bones are flat bones derived from a different embryonic origin than the long bones. The identity and regulating niche for craniofacial-bone MSCs remain unknown. Here, we identify Gli1+ cells within the suture mesenchyme as the main MSC population for craniofacial bones. They are not associated with vasculature, give rise to all craniofacial bones in the adult and are activated during injury repair. Gli1+ cells are typical MSCs in vitro. Ablation of Gli1+ cells leads to craniosynostosis and arrest of skull growth, indicating that these cells are an indispensable stem cell population. Twist1(+/-) mice with craniosynostosis show reduced Gli1+ MSCs in sutures, suggesting that craniosynostosis may result from diminished suture stem cells. 
Our study indicates that craniofacial sutures provide a unique niche for MSCs for craniofacial bone homeostasis and repair.'), 2918: GenericDoc('196664003', 'A signaling pathway transmits information from an upstream system to downstream systems, ideally in a unidirectional fashion. A key obstacle to unidirectional transmission is retroactivity, the additional reaction flux that affects a system once its species interact with those of downstream systems. This raises the fundamental question of whether signaling pathways have developed specialized architectures that overcome retroactivity and transmit unidirectional signals. Here, we propose a general procedure based on mathematical analysis that provides an answer to this question. Using this procedure, we analyze the ability of a variety of signaling architectures to transmit one-way (from upstream to downstream) signals, as key biological parameters are tuned. We find that single stage phosphorylation and phosphotransfer systems that transmit signals from a kinase show a stringent design trade-off that hampers their ability to overcome retroactivity. Interestingly, cascades of these architectures, which are highly represented in nature, can overcome this trade-off and thus enable unidirectional transmission. By contrast, phosphotransfer systems, and single and double phosphorylation cycles that transmit signals from a substrate are unable to mitigate retroactivity effects, even when cascaded, and hence are not well suited for unidirectional information transmission. Our results identify signaling architectures that, allowing unidirectional transmission of signals, embody modular processes that conserve their input/output behavior across multiple contexts. 
These findings can be used to decompose natural signal transduction networks into modules, and, at the same time, they establish a library of devices that can be used in synthetic biology to facilitate modular circuit design.'), }) self._test_docs("nano-beir/webis-touche2020", count=5745, items={ 0: GenericDoc('ffd45b01-2019-04-18T18:54:19Z-00004-000', "Well, we meet again. If we were to prevent MLB players from entering the Hall of Fame because they used steroids; nearly every All-Star from 1980 to early 2000's would be ineligible. Take a look at the Mitchell Report, and you will find a list of incredible players which is far too long to list on here, who would all be banned from baseball's greatest honor. Steroids were just as part of the game during the 80's and 90's as Peanuts and Cracker Jacks. It was an era of steroids. If you weren't using them, then you were considered abnormal. You cannot fault an entire generation of players for just being a product of the times. I agree that records, such as the HR record broken by Bonds, should have an asterisk with them, but this should not be the case with the Hall of Fame. If we were to do what you propose, then from 1980 to 2000 there would be about 5 people in the Hall."), 9: GenericDoc('934989d9-2019-04-18T11:38:17Z-00000-000', 'Enacting more gun laws in the United States would not stop crimes or dangerous situations from occurring. In fact, according to the National Academy of Sciences, Justice Department, there is no apparent link between restrictions on gun ownership and lower rates of crime, firearms violence, or even accidents with guns. Creating such laws would not stop criminals from committing crimes. As John R Lott, the author of "More Guns, Less Crime: Understanding Crime and Gun Control Laws", stated in 1998, "States with the largest increases in gun ownership also have the largest drops in violent crimes". 
In other words, increasing the number of guns did not increase the rate of violent crimes but instead decreased. With this, it is clear that people should be able to own guns because doing so prevents more crimes from occurring than actual gun laws. University of Chicago Press. (1998). Interview with John R. Lott, Jr. Retrieved March 28, 2018, from http://press.uchicago.edu... WND. (2004, December 30). Gun control doesn\'t reduce crime, violence, say studies. Retrieved March 28, 2018, from http://mobile.wnd.com...'), 5744: GenericDoc('671509c8-2019-04-17T11:47:34Z-00051-000', 'Charter school administrators can hand-pick the best teachers'), }) def test_queries(self): self._test_queries("nano-beir/arguana", count=50, items={ 0: GenericQuery('test-science-wsihwclscaaw-con01a', 'Cyber attacks are often carried out by non-state actors Cyber attacks are often carried out by non-state actors, such as cyberterrorists or hacktivists (social activists who hack), without any involvement of the actual state. For instance, in 2007 a massive cyber attack launched on Estonia was blamed on Russia due to the then on-going tensions between these two states [17]. However, the attacks on Estonia were generated from all over the world; and even those from Russia could not have been linked to the Russian authorities, who denied involvement. Similarly, a huge wave of cyber attacks dubbed GhostNet that compromised computers in 103 countries in 2009 was blamed on China, not the least for hacking computers of Tibetan authorities. However, it could not be conclusively proven that this was an attack perpetrated by the Chinese authorities [18]. Any retaliation against a state for a cyber attack can never be certain to be against the right target – the state should not be blamed for the actions of its individual citizens.'), 9: GenericQuery('test-politics-gvhbhlsbr-con04a', "The public is apathetic to reform. 
Whether or not reform of the House of Lords should be a top priority in the current economic climate is debateable, let alone whether or not a coalition government would be able to initiate and drive through such measures. Attempts to reform the House of Lords have been delayed time and time again, demonstrating the House of Commons’ reservations on change. [1] A feeling that is no doubt echoed in popular British opinion – as demonstrated by the recent outcome of the Alternative Vote – the public are either adverse to the idea of change or apathetic to it. [2] [1] Summers, Deborah, ‘Labour's attempts to reform the House of Lords’, The Guardian (27 January, 2009), viewed on 1 June 2011 [2] BBC News, ‘Vote 2011: UK rejects alternative vote, 7 May 2011,"), 49: GenericQuery('test-politics-lghwdecm-con02a', 'Directly elected mayors provide opportunities for populists. The position of elected mayor is likely to attract populist and maverick candidates, who will seek to capitalise on the unpopularity of party politics with “single issue sloganising, glib promises and headline grabbing” (Ken Walker, Labour leader of Middlesbrough council). [1] A good example is Paul Massey, who has had 25 convictions in the past and yet is running to be Mayor of Salford and could even have a chance of winning. [2] In office such candidates are likely to alienate elected councillors and other crucial local partners, to disappoint voters as their promises run up against the actual limitations of their power, and to neglect many aspects of local government in favour of their own pet issue. This danger is even greater if a far-right candidate were to exploit local concerns about immigration and asylum-seekers to inflame racial tensions. Again Lutfur Rahman of Tower Hamlets is a good example of how this could happen, he has links to a Muslim extremist group, and only needed a mere 23,000 votes, 13% of the electorate because there was such low turnout. 
[3] [1] Hetherington, Peter, ‘Vote for US-style mayors exposes deep Labour rifts’, The Guardian, 20 October 2001. [2] Gilligan, Andrew, ;The town hall dictator taking over near you’, The Telegraph, 22 April 2012. [3] ibid'), }) self._test_queries("nano-beir/climate-fever", count=50, items={ 0: GenericQuery('555', 'In Alaska, brown bears are changing their feeding habits to eat elderberries that ripen earlier.'), 9: GenericQuery('1390', '“Lyme Disease is much more common in northern, cooler regions of the United States than in southern, warmer regions.'), 49: GenericQuery('1157', '“In 2013 the level of U.S. farm output was about 2.7 times its 1948 level, and productivity was growing at an average annual rate of 1.52%.'), }) self._test_queries("nano-beir/dbpedia-entity", count=50, items={ 0: GenericQuery('INEX_LD-2012319', '1994 short story collection Alice Munro is Open'), 9: GenericQuery('INEX_XER-143', 'Hanseatic league in Germany in the Netherlands Circle'), 49: GenericQuery('INEX_LD-20120332', 'tango dance history'), }) self._test_queries("nano-beir/fever", count=50, items={ 0: GenericQuery('74951', 'Caesar is an original play by Orson Welles.'), 9: GenericQuery('179112', 'The Good German starred Tobey Maguire.'), 49: GenericQuery('85454', 'Nero is a person.'), }) self._test_queries("nano-beir/fiqa", count=50, items={ 0: GenericQuery('3357', 'Why big clients want the contractor to be incorporated before giving them work'), 9: GenericQuery('1736', 'How can people have such high credit card debts?'), 49: GenericQuery('8378', 'Should I wait a few days to sell ESPP Stock?'), }) self._test_queries("nano-beir/hotpotqa", count=50, items={ 0: GenericQuery('5ae5669755429960a22e02ec', 'Which of the campaign that brought out the term Vichy Republican on social media was formally launched on June 16, 2015, at Trump Tower in New York City?'), 9: GenericQuery('5ac4e593554299076e296e37', 'OU812 was the second album to feature a vocalist that also played what instrument in 
"Finish What Ya Started"?'), 49: GenericQuery('5a81ed345542995ce29dcc85', 'The Florida Atlantic Owls team represents a school classified by the Carnegie Foundation as what?'), }) self._test_queries("nano-beir/msmarco", count=50, items={ 0: GenericQuery('994479', 'which health care system provides all citizens or residents with equal access to health care services'), 9: GenericQuery('1048359', 'what is php in mental health care'), 49: GenericQuery('1049867', 'who sang here i go again'), }) self._test_queries("nano-beir/nfcorpus", count=50, items={ 0: GenericQuery('PLAIN-2800', 'Prolonged Liver Function Enhancement From Broccoli'), 9: GenericQuery('PLAIN-2301', 'uterine health'), 49: GenericQuery('PLAIN-3271', 'Saturated Fat & Cancer Progression'), }) self._test_queries("nano-beir/nq", count=50, items={ 0: GenericQuery('test1618', 'was nightmare before christmas originally a disney movie'), 9: GenericQuery('test2845', 'how many paintings of sunflowers did van gogh paint'), 49: GenericQuery('test2247', 'here we go round the mulberry bush origin'), }) self._test_queries("nano-beir/quora", count=50, items={ 0: GenericQuery('127728', 'Why does Quora frequently suggest answers to my feed that put down Donald Trump?'), 9: GenericQuery('14755', 'How often is it good to masturbate?'), 49: GenericQuery('215681', 'What can I do to get more energy?'), }) self._test_queries("nano-beir/scidocs", count=50, items={ 0: GenericQuery('a352061134daa2c47861b8c4216ee5482a93be1d', 'BPM Governance: An Exploratory Study in Public Organizations'), 9: GenericQuery('88ee8b13451deac38384c9f31196227f2535aa65', 'Stochastic Geometric Analysis of User Mobility in Heterogeneous Wireless Networks'), 49: GenericQuery('458f1428273254fc5dd399f3c104f507680ddd54', 'Measuring discrimination in algorithmic decision making'), }) self._test_queries("nano-beir/scifact", count=50, items={ 0: GenericQuery('514', 'High dietary calcium intakes are unnecessary for prevention of secondary hyperparathyroidism in 
subjects with 25(OH)D levels above 75 nmol/liter.'), 9: GenericQuery('1202', 'The center of the granuloma in an immune cell induces a pro-inflammatory immune response.'), 49: GenericQuery('1278', 'The treatment of cancer patients with co-IR blockade does not cause any adverse autoimmune events.'), }) self._test_queries("nano-beir/webis-touche2020", count=49, items={ 0: GenericQuery('47', 'Is homework beneficial?'), 9: GenericQuery('14', 'Is sexual orientation determined at birth?'), 48: GenericQuery('33', 'Should people become vegetarian?'), }) def test_qrels(self): self._test_qrels("nano-beir/arguana", count=50, items={ 0: TrecQrel('test-science-wsihwclscaaw-con01a', 'test-science-wsihwclscaaw-con01b', 1, '0'), 9: TrecQrel('test-politics-gvhbhlsbr-con04a', 'test-politics-gvhbhlsbr-con04b', 1, '0'), 49: TrecQrel('test-politics-lghwdecm-con02a', 'test-politics-lghwdecm-con02b', 1, '0'), }) self._test_qrels("nano-beir/climate-fever", count=148, items={ 0: TrecQrel('555', 'Brown_bear', 1, '0'), 9: TrecQrel('971', 'Effects_of_global_warming', 1, '0'), 147: TrecQrel('1157', 'Reaganomics', 1, '0'), }) self._test_qrels("nano-beir/dbpedia-entity", count=1158, items={ 0: TrecQrel('INEX_LD-2012319', '', 1, '0'), 9: TrecQrel('INEX_XER-87', '', 1, '0'), 1157: TrecQrel('INEX_LD-20120332', '', 1, '0'), }) self._test_qrels("nano-beir/fever", count=57, items={ 0: TrecQrel('74951', 'Caesar_(Mercury_Theatre)', 1, '0'), 9: TrecQrel('109817', 'Louis_Malle', 1, '0'), 56: TrecQrel('85454', 'Nero', 1, '0'), }) self._test_qrels("nano-beir/fiqa", count=123, items={ 0: TrecQrel('3357', '209974', 1, '0'), 9: TrecQrel('6199', '414693', 1, '0'), 122: TrecQrel('8378', '125298', 1, '0'), }) self._test_qrels("nano-beir/hotpotqa", count=100, items={ 0: TrecQrel('5ae5669755429960a22e02ec', '49892372', 1, '0'), 9: TrecQrel('5ae0ae4555429945ae959419', '76592', 1, '0'), 99: TrecQrel('5a81ed345542995ce29dcc85', '880200', 1, '0'), }) self._test_qrels("nano-beir/msmarco", count=50, items={ 0: 
TrecQrel('994479', '7275120', 1, '0'), 9: TrecQrel('1048359', '7187663', 1, '0'), 49: TrecQrel('1049867', '7946528', 1, '0'), }) self._test_qrels("nano-beir/nfcorpus", count=2518, items={ 0: TrecQrel('PLAIN-2800', 'MED-4040', 1, '0'), 9: TrecQrel('PLAIN-2800', 'MED-4050', 1, '0'), 2517: TrecQrel('PLAIN-3271', 'MED-3790', 1, '0'), }) self._test_qrels("nano-beir/nq", count=57, items={ 0: TrecQrel('test1618', 'doc57226', 1, '0'), 9: TrecQrel('test2845', 'doc97421', 1, '0'), 56: TrecQrel('test2247', 'doc77971', 1, '0'), }) self._test_qrels("nano-beir/quora", count=70, items={ 0: TrecQrel('127728', '127729', 1, '0'), 9: TrecQrel('287245', '322650', 1, '0'), 69: TrecQrel('215681', '215680', 1, '0'), }) self._test_qrels("nano-beir/scidocs", count=244, items={ 0: TrecQrel('a352061134daa2c47861b8c4216ee5482a93be1d', '38d555bfe13b61e838364016219c7e42fb5dc919', 1, '0'), 9: TrecQrel('f4195462f27158c4afd86ca364347dacfd228bdd', '59fa9d40d129f18f1f3193f65935fcb9e2042afb', 1, '0'), 243: TrecQrel('458f1428273254fc5dd399f3c104f507680ddd54', '4556f3f9463166aa3e27b2bec798c0ca7316bd65', 1, '0'), }) self._test_qrels("nano-beir/scifact", count=56, items={ 0: TrecQrel('514', '16256507', 1, '0'), 9: TrecQrel('971', '28617573', 1, '0'), 55: TrecQrel('1278', '11335781', 1, '0'), }) self._test_qrels("nano-beir/webis-touche2020", count=932, items={ 0: TrecQrel('47', '48cd3dfc-2019-04-18T13:56:49Z-00004-000', 1, '0'), 9: TrecQrel('47', '1733befb-2019-04-18T15:36:46Z-00003-000', 1, '0'), 931: TrecQrel('33', '916864f7-2019-04-18T11:10:13Z-00002-000', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/natural_questions.py ================================================ import re import unittest from ir_datasets.formats import GenericQuery,GenericScoredDoc from ir_datasets.datasets.natural_questions import NqPassageDoc, NqQrel from .base import DatasetIntegrationTest class TestNq(DatasetIntegrationTest): def 
test_docs(self): self._test_docs('natural-questions', count=28390850, items={ 0: NqPassageDoc('0-0', re.compile('^The trade winds are the prevailing pattern of easterly surface winds found in the tropics , within t.{470}on into the Americas and trade routes to become established across the Atlantic and Pacific oceans \\.$', flags=48), re.compile('^

The trade winds are the prevailing pattern of easterly surface winds found in the tropics , with.{479}to the Americas and trade routes to become established across the Atlantic and Pacific oceans \\.

$', flags=48), 43178, 44666, 44, 161, 'Trade winds', 'https://en.wikipedia.org//w/index.php?title=Trade_winds&oldid=817251427', None), 9: NqPassageDoc('0-9', re.compile('^During mid\\-summer in the Northern Hemisphere \\( July \\) , the westward \\- moving trade winds south of t.{818} decline in the health of coral reefs across the Caribbean and Florida , primarily since the 1970s \\.$', flags=48), re.compile('^

During mid\\-summer in the Northern Hemisphere \\( July \\) , the westward \\- moving trade winds south .{827}ine in the health of coral reefs across the Caribbean and Florida , primarily since the 1970s \\.

$', flags=48), 58657, 60294, 1304, 1488, 'Trade winds', 'https://en.wikipedia.org//w/index.php?title=Trade_winds&oldid=817251427', None), 28390849: NqPassageDoc('231694-385', 'A ball hit high and hard close to the leading edge , causing a low flight and a slight vibratory feel .', '
A ball hit high and hard close to the leading edge , causing a low flight and a slight vibratory feel .
', 143007, 143117, 13559, 13583, 'Glossary of golf', 'https://en.wikipedia.org//w/index.php?title=Glossary_of_golf&oldid=830653780', '231694-384'), }) def test_queries(self): self._test_queries('natural-questions/train', count=307373, items={ 0: GenericQuery('4549465242785278785', 'when is the last episode of season 8 of the walking dead'), 9: GenericQuery('3542596469291219966', 'when was the first robot used in surgery'), 307372: GenericQuery('-9055447625982456209', 'why is the dark age called the dark age'), }) self._test_queries('natural-questions/dev', count=7830, items={ 0: GenericQuery('5225754983651766092', 'what purpose did seasonal monsoon winds have on trade'), 9: GenericQuery('8467542931261548456', 'global trade during the ming dynasty of china'), 7829: GenericQuery('6752717162503553157', 'how many goals have arsenal scored in the premier league'), }) def test_qrels(self): self._test_qrels('natural-questions/train', count=152148, items={ 0: NqQrel('4549465242785278785', '7369-92', 1, ['March 18 , 2018'], 'NONE'), 9: NqQrel('-3126006632503975915', '7383-23', 1, [], 'NONE'), 152147: NqQrel('-9055447625982456209', '29568-1', 1, [], 'NONE'), }) self._test_qrels('natural-questions/dev', count=7695, items={ 0: NqQrel('5225754983651766092', '0-0', 1, ['enabled European empire expansion into the Americas and trade routes to become established across the Atlantic and Pacific oceans'], 'NONE'), 9: NqQrel('8081436745274892553', '11-4', 1, [], 'NONE'), 7694: NqQrel('-430859680692445019', '7366-0', 1, ['1980s'], 'NONE'), }) def test_scoreddocs(self): self._test_scoreddocs('natural-questions/train', count=40374730, items={ 0: GenericScoredDoc('4549465242785278785', '7369-0', 0.0), 9: GenericScoredDoc('4549465242785278785', '7369-9', 0.0), 40374729: GenericScoredDoc('-9055447625982456209', '29568-34', 0.0), }) self._test_scoreddocs('natural-questions/dev', count=973480, items={ 0: GenericScoredDoc('5225754983651766092', '0-0', 0.0), 9: 
GenericScoredDoc('5225754983651766092', '0-9', 0.0), 973479: GenericScoredDoc('6752717162503553157', '7368-391', 0.0), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/neuclir.py ================================================ import re import unittest from ir_datasets.formats import ExctractedCCDoc, ExctractedCCQuery, TrecQrel, ExctractedCCNoReportQuery, ExctractedCCNoReportNoHtNarQuery, ExctractedCCMultiMtQuery from .base import DatasetIntegrationTest class TestNeuCLIR22(DatasetIntegrationTest): def test_docs(self): self._test_docs('neuclir/1/zh', count=3179209, items={ 0: ExctractedCCDoc(doc_id='95a18cdd-da7a-4815-a07d-7365d8c2e268', title='欧科云链链上大师重磅上线,一起来用“链上Bloomberg”听听行业脉搏跳动', text='Wednesday, 7 July 2021, 18:25 HKT/SGT Share: 欧科云链链上大师重磅上线,一起来用“链上Bloomberg”听听行业脉搏跳动\n\n香港, 2021年7月7日 - (亚太商讯) - 近日,欧科云链集团正式推出国内首款全景式链上数据分析工具——链上大师(ChainHub)。基于OKLink原有的九大公链全节点链上数据,欧科云链链上大师提供10000+数据指标、4大专题数据、以周、月、年为单位的专业研报,具备专业API接口,为广大C端、B端和G端用户打造了一体化的数据解决方案。\n\n\n\n\n\n\n\n\n\n上线之初,欧科云链链上大师受到各大媒体的青睐,被称为“链上Bloomberg”,以六大数据优势享誉业内。\n\n\n\n2011到2021,发现链上数据的价值\n\n\n\n区块链的链上数据分析,始于2011 年的 Coin Days Destroyed 指标——这是第一个链上数据指标。\n\n\n\n到了2021年,欧科云链链上大师上线,已经推出了10000+精选链上数据指标。短短十年,链上数据指标实现从1到1万+的扩展,来自于区块链技术的迅猛发展、新基建政策的扶持、区块链落地场景的成熟、企业的耐心打磨、产业市值的飞涨。\n\n\n\n当前,区块链市场的协议项目超过1万个,总市值一度超过2.5万亿美元,DeFi、NFT、去中心化存储、物联网、支付、人工智能等赛道涌现出优秀项目,特斯拉、微策略、灰度等机构也先后入场,持续推动区块链产业的繁荣。\n\n\n\n作为新基建之一,区块链技术具有透明、公开、不可篡改等特点。而链上数据,也成为了区块链的脉搏,收集、展示所有链上活动,直观地反映出区块链的链上整体生态,帮助机构、组织、长期投资者找到有意义的指标,借此来进行信息的研究和判断。\n\n\n\n如何收集链上数据呢?一般有两种方法:1、通过运行节点来获取数据;2、从数据供应商那里获取。欧科云链链上大师、Nansen、Dune 
Analytics等产品工具指明了方向——链上数据指标分析。\n\n\n\n在区块链行业,始终没有像彭博终端这样的超级工具,可以集中所有分析资源和实时价格指数,欧科云链链上大师的出现,填补了这一空白。\n\n\n\n三大产品特色,把握市场动向\n\n\n\n欧科云链链上大师是国内首款全景式的链上数据分析工具,上线之初,便受到各大媒体青睐,被称为“链上Bloomberg”。\n\n\n\nBloomberg(彭博)是全球最大的财经资讯服务供应商,为财经与商界专业人士提供数据、新闻和分析工具,当前业务遍及全球185个国家与地区。随着区块链产业的崛起,数字资产市场的成熟,机构、交易员等专业人士对于链上数据的需求日渐强烈。\n\n\n\n在此背景下,欧科云链推出全景式链上数据分析工具——欧科云链链上大师。结合OKLink蜂巢架构的核心底层技术,基于插拔式架构的扩展,欧科云链链上大师利用OLAP数据库做离线+实时分析,使用预计算方案,满足链上数据全方位的多维度指标输出,在产品层面上具备“三大特色”!\n\n\n\n特色一:链上大师依托9大公链,推出了10000+数据指标。链上大师完整详实的链上数据,既包含了多生态角色(矿工、机构等),又覆盖了多维度(地址、市场指标、网络)的数据指标。\n\n\n\n特色二:专业标准的API。欧科云链链上大师的专业API接口,为用户提供一个更为灵活、轻量的数据获取方式,可支持用户进行数据分析及策略挖掘。\n\n\n\n特色三:链上大师一体化的数据解决方案。欧科云链链上大师标准化的数据采集与整理,将多样化的指标及时更新入库呈现给用户。\n\n\n\n具体来说,链上大师提供了“热门专题”和“精选指标”两大类数据,满足链上用户多样化的数据需求。\n\n\n\n在“热门专题”中,目前推出了“DeFi”、“机构数据”、“NFT排行榜”、“波卡生态“4大板块数据,且保持持续更新。近期,欧科云链链上大师在“DeFi板块”嵌入“挖矿收益排行榜”、“24H借贷利率”等数据,为用户节省大量时间成本。\n\n\n\n在“精选指标”中,基于OKLink九大公链全节点链上数据,欧科云链链上大师生成10000+数据指标,用户可以查到主流数字资产的各类指标,并根据矿工、机构、市场指标、全网概览等进行分类,方便用户进行查询。借助链上大师,用户还可以使用收藏功能,将特定指标收藏进“我的收藏”。\n\n\n\n同时,为了方便机构用户、普通用户的专业交流,链上大师支持“主题分享”,可将“矿工”、“市场指标”等同一主题下的多个数据一键分享。\n\n\n\n全景式链上数据分析,打造行业“听诊器”\n\n\n\n欧科云链集团作为区块链行业原生企业,此次推出链上大师,为区块链行业在链上数据赛道再次筑下一座里程碑。\n\n\n\n相比于同类产品,欧科云链链上大师具有三大优势,做到了“三全齐美”,即:\n\n\n\n全数据:全节点链上数据,蜂巢架构设计,数据丰富专业\n\n全天候:全天候实时更新,贴合市场需求,更新精准及时\n\n全方位:全方位统计指标,紧跟市场热点,指标全面多样\n\n\n\n在数据研报方面,欧科云链链上大师拥有专业研究团队,依托OKLink九大公链浏览器、链上天眼、链上大师等产品,定期推出周报、月报、年报等专业研报,借指标解剖行业,用数据洞悉未来。\n\n\n\n链上大师上线后,欧科云链成为集区块链浏览器与数据服务为一体的复合型区块链大数据服务商,同时也为区块链行业提供了一只“听诊器”。只需要打开欧科云链链上大师,就可以“一键听诊”,洞悉链上数据,感知行业冷暖,把握市场脉搏。\n\n\n\n如何响应新基建政策,促进区块链行业健康发展?拿好手上的这只“听诊器”,一起来听听区块链市场的脉搏跳动。\n\n\n\n\n\n\n\n话题 Press release summary\n\n\n\n部门 Cloud & Enterprise, FinTech & Blockchain\n\nhttp://www.acnnewswire.com\n\nFrom the Asia Corporate News Network\n\n\n\nCopyright © 2021 ACN Newswire. All rights reserved. 
A division of Asia Corporate News Network\n\n', url='https://ch.acnnewswire.com/article.asp?art_id=67895', time=None, cc_file='crawl-data/CC-NEWS/2021/07/CC-NEWS-20210707164449-00551.warc.gz'), 12345: ExctractedCCDoc(doc_id='9ee19273-f5b9-4deb-87fe-f90d6b4bcac4', title='週三起有劇烈降雨 溫度有機會下降', text='週三起有劇烈降雨、溫度略降。氣象局表示,12日、13日各地大多為多雲到晴,午後西半部地區及東北部、東部山區有局部短暫雷陣雨。氣象專家吳德榮指出,14日至18日午後易有強對流發展,伴隨劇烈降雨。台灣整合防災工程公司總監賈新興則表示,14日之後因水氣相對增加,各地高溫會降至30度至32度左右。\n\n週一、二高溫炎熱\n\n氣象局表示,12日、13日各地大多為多雲到晴,東南部及恆春半島有局部短暫陣雨,午後西半部地區及東北部、東部山區有局部短暫雷陣雨,並有局部較大雨勢發生的機率。中央大學大氣科學系兼任副教授吳德榮指出,最新歐洲模式模擬顯示,11日至13日各地大多晴朗炎熱、紫外線強,高溫在36度左右。\n\n吳德榮表示,最新模式模擬顯示,14日至18日太平洋高壓減弱東退,東風波移入,並逐漸發展成季風低壓的型態。季風低壓為大尺度的低壓環流,水氣多、大氣不穩定度高,午後易有強對流發展,伴隨劇烈降雨,其他時間亦偶有局部短暫降雨的機率,各地區白天高溫略降,但因模式模擬將持續調整,應密切觀察最新資料。\n\n週三起降雨增\n\n賈新興表示,12日、13日受太平洋高壓影響,午後有局部短暫雨或短暫雷雨的天氣,14日之後因太平洋高壓北抬,「本來是太平洋高壓主宰的天氣型態,轉為低壓帶接手的環境」,台灣東半部和恆春半島水氣會相對增多,14日之後降雨條件會比較明顯,局部地區會有較大雨勢。\n\n氣溫方面,12日、13日全台各地高溫約在33度至34度,局部地區可能達35度,14日之後因水氣相對較多,午後降雨機率較大,與前一陣子較炎熱的天氣型態相比,各地高溫會略降,降至30度至32度左右。\n\n【更多精采內容,詳見 】', url='https://udn.com/news/story/7266/5594185', time='2021-07-11T17:43:41+00:00', cc_file='crawl-data/CC-NEWS/2021/07/CC-NEWS-20210711102218-00616.warc.gz'), 3179208: ExctractedCCDoc(doc_id='f8fd8814-4658-4982-b262-aae10756d1ea', title='氣立第3季業績回穩 可望衝今年單季高點', text='(中央社記者鍾榮峰台北2日電)氣動元件廠台灣氣立下半年業績仍須觀察美中貿易戰影響,不過第3季業績回穩,法人預估氣立第3季業績獲利可站上今年單季高點。\n\n展望第3季營運表現,法人指出,儘管仍受到美中貿易戰影響,氣立8月到9月業績可望回穩,預估落在新台幣1.2億元到1.3億元區間,第3季整體業績可到3.7億元到3.8億元,較第2季成長7%到8%左右,來到今年迄今單季高點。\n\n在單季獲利部分受惠業績上揚,加上費用支出刪減效益,預估氣立第3季毛利率可較第2季小幅增加,若無業外匯損因素,第3季獲利可望大幅季增180%到190%,來到今年迄今單季高點。不過整體下半年業績還需觀察美中貿易戰影響。\n\n在產能布局,法人指出氣立在中國浙江平湖園區持續動工,預估今年第4季完工,明年第1季開始進駐設備。\n\n氣立先前表示,擴廠項目的廠區主要規劃包括2/3用於集團上游模具設計製造、原物料鋁壓鑄件、氣動元件製造及車用零組件生產等;另外1/3廠區規劃下游金屬表面處理製程,包含精密研磨、陽極處理及表面烤漆等。\n\n氣立今年7月自結合併營收新台幣1.23億元,較去年同期1.56億元減少21.11%,累計今年前7月氣立自結合併營收8.15億元,較去年同期10.26億元減少20.56%。\n\n從產品營收比重來看,今年上半年執行元件占比約6成多,控制元件占比約9%,輔助元件占比約11%到12%,氣源處理元件占比約5%。銷售地區仍以中國大陸為主要市場,占比約8成,台灣市場占比約15%。從客戶別來看,經銷商、電子產業和氣動產業三大應用占比合計超過75%。(編輯:楊凱翔)1080902', 
url='https://www.cna.com.tw/news/afe/201909020081.aspx', time=None, cc_file='crawl-data/CC-NEWS/2019/09/CC-NEWS-20190902014743-01074.warc.gz') }) self._test_docs('neuclir/1/fa', count=2232016, items={ 0: ExctractedCCDoc(doc_id='93e291b3-0216-4a48-941e-01476e703311', title='انفجار در خطوط انتقال نفت و گاز در شوش تعدادی کشته و زخمی برجای گذارد', text='انفجار در خطوط انتقال نفت و گاز در شوش تعدادی کشته و زخمی برجای گذارد.\n\nانفجار در ایستگاه خط لوله انتقال نفت و گاز در شوش در استان خوزستان دست\u200cکم سه کشته و ۴ زخمی برجای گذارد. به گفته فرماندار شوش، این حادثه عصر روز سه\u200cشنبه ۶ ژوئیه، در اثر انفجار ناشی از نشت گاز در ایستگاه خط لوله انتقال نفت و گاز رخ داد. انفجار این خط لوله، یک روز پس از آتش\u200cسوزی در یک انبار بزرگ در غرب تهران به وقوع پیوست. دلیل این آتش\u200c\u200cسوزی قطعی برق اعلام شد.\n\nتبلیغ بازرگانی دنباله\n\nمجید عسگری، فرماندار شهرستان دهلران، روز گذشته (سه\u200cشنبه) و در پی انفجار در خطوط نفت و گاز گفت: انفجار در ایستگاه خط لوله انتقال نفت و گاز در بخش فتح\u200cالمبین شوش اتفاق افتاده است که منجر به مرگ سه نفر و مجروح شدن چهار نفر دیگر شده است.\n\nبه گزارش خبرگزاری ایرنا، محل دقیق انفجار در فاصله روستای بیت عجم و مجاهدین در بخش فتح\u200cالمبین شهرستان شوش عنوان شده است.\n\nعدنان غزی، فرماندار شوش، نیز درباره مجروحان این حادثه گفت:\u200c چهار نفر که در اتاق جنب محل انفجار در حال استراحت بودند، به شدت زخمی شدند که توسط اورژانس ۱۱۵ به بیمارستان نظام مافی شوش اعزام شدند.\n\nبه گفته فرماندار شهرستان دهلران، قربانیان این انفجار از کارشناسان شرکت بهره\u200cبرداری نفت و گاز غرب ایران بودند.\n\nایستگاه خط لوله انتقال نفت و گاز در بخش فتح المبین شوش زیر نظر شرکت بهره برداری نفت و گاز غرب کشور است.\n\nاین خط انتقال از اهواز تا ایلام امتداد دارد و محل حادثه در حدود ۳۵ کیلومتری شوش گزارش شده است.\n\nدر \u200cهفته\u200cهای اخیر موارد متعددی از آتش\u200cسوزی و انفجار در ایران گزارش شده است. 
حدود یک ماه پیش،\u200c پالایشگاه تندگویان در جنوب تهران دچار حریق شد.\n\nدریافت رایگان خبرنامهبا خبر-پیامک های ما اخبار را بصورت زنده دریافت کنید آبونه شوید', url='https://www.rfi.fr/fa/%D8%A7%DB%8C%D8%B1%D8%A7%D9%86/20210707-%D8%A7%D9%86%D9%81%D8%AC%D8%A7%D8%B1-%D8%AF%D8%B1-%D8%AE%D8%B7%D9%88%D8%B7-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84-%D9%86%D9%81%D8%AA-%D9%88-%DA%AF%D8%A7%D8%B2-%D8%AF%D8%B1-%D8%B4%D9%88%D8%B4-%D8%AA%D8%B9%D8%AF%D8%A7%D8%AF%DB%8C-%DA%A9%D8%B4%D8%AA%D9%87-%D9%88-%D8%B2%D8%AE%D9%85%DB%8C-%D8%A8%D8%B1%D8%AC%D8%A7%DB%8C-%DA%AF%D8%B0%D8%A7%D8%B1%D8%AF', time='2021-07-07T13:05:05+00:00', cc_file='crawl-data/CC-NEWS/2021/07/CC-NEWS-20210707140801-00548.warc.gz'), 12345: ExctractedCCDoc(doc_id='f73a0ff9-f69c-49cb-9abf-8b8b31296390', title='رشد 40 درصدی پرداخت صدقه سمنانی\u200cها در بهار امسال', text='ایسنا/سمنان مردم نیکوکار استان سمنان در سه\u200cماهه نخست امسال یک میلیارد و 533 میلیون تومان صدقه پرداخت کردند که این رقم نسبت به مدت مشابه سال گذشته، 40درصد رشد داشته است.\n\nحمیدرضا رجب\u200cزاده، معاون توسعه مشارکت\u200cهای مردمی کمیته امداد استان سمنان با اشاره به اینکه مشارکت خیران، زمینه خدمت اثربخش به نیازمندان و توانمندسازی خانواده\u200cهای تحت حمایت را فراهم می\u200cکنند، گفت: مردم نیکوکار استان سمنان در سه ماهه نخست سال جاری از طریق صندوق\u200cهای صدقات، یک میلیارد و 533 میلیون تومان، صدقه پرداخت کردند.\n\nوی با اشاره به رشد 40 درصدی پرداخت صدقه در سه\u200cماهه نخست امسال نسبت به مدت مشابه گذشته، افزود: مردم این استان در سه ماهه نخست سال 99،نیز یک میلیارد و 98 میلیون تومان، در پرداخت صدقه مشارکت داشتند.\n\nرجب زاده با تأکید بر اینکه رشد کمک\u200cهای مردمی به نیازمندان از طریق این نهاد، نشانه اعتماد آنان به کمیته امداد است، افزود: تمامی مبالغ جمع\u200cآوری\u200cشده برای رفع نیازهای خانواده\u200cهای تحت حمایت در بخش\u200cهای مختلف ازجمله تأمین مسکن، جهیزیه، درمان، تحصیل و اعطای وام هزینه می\u200cشود.\n\nوی بر پرداخت صدقه به روش الکترونیکی با هدف کاهش هزینه\u200cهای جمع آوری صدقات، رعایت شیوه نامه\u200cهای بهداشتی با توجه به شیوع کرونا و افزایش سرعت 
در رسیدن صدقات به دست نیازمندان تاکید کرد.\n\nمعاون توسعه مشارکت\u200cهای مردمی کمیته امداد استان سمنان بابیان اینکه بر اساس میزان جمعیت 702 هزار 360 نفر استان سمنان و صدقات جمع\u200cآوری\u200cشده در سه ماهه نخست سال جاری، هر شهروند استان سمنان به\u200cطور میانگین روزانه 24تومان صدقه پرداخت می\u200cکند.\n\nانتهای پیام', url='https://www.isna.ir/news/1400042014309/%D8%B1%D8%B4%D8%AF-40-%D8%AF%D8%B1%D8%B5%D8%AF%DB%8C-%D9%BE%D8%B1%D8%AF%D8%A7%D8%AE%D8%AA-%D8%B5%D8%AF%D9%82%D9%87-%D8%B3%D9%85%D9%86%D8%A7%D9%86%DB%8C-%D9%87%D8%A7-%D8%AF%D8%B1-%D8%A8%D9%87%D8%A7%D8%B1-%D8%A7%D9%85%D8%B3%D8%A7%D9%84', time='2021-07-11T10:19:16+00:00', cc_file='crawl-data/CC-NEWS/2021/07/CC-NEWS-20210711102218-00616.warc.gz'), 2232015: ExctractedCCDoc(doc_id='085e47da-94b0-4da6-a0a6-2865b85cf10c', title='رضیان: روی لایحه تجارت کار کارشناسی لازم انجام شده است', text='نماینده مردم قائمشهر در مجلس شورای اسلامی گفت: کمیسیون حقوقی و قضایی به همراه کارشناسان کار و مطالعه دقیقی بر روی لایحه تجارت انجام دادند و اینگونه نیست که مجلس شورای اسلامی بی\u200cتفاوت از کنار این لایحه گذشته باشد.\n\nبه گزارش ایسنا، عبدالله رضیان در جلسه علنی امروز در تذکری با استناد به ماده ۷۵ آیین\u200cنامه داخلی مجلس شورای اسلامی و با اشاره به اظهارات روز گذشته علیرضا سلیمی در خصوص نحوه بررسی لایحه تجارت در مجلس بیان کرد: در روز گذشته آقای سلیمی مطلبی را در خصوص نحوه بررسی لایحه تجارت بیان کردند که صحیح نیست. 
ایشان گفتند که همکاران در مجلس حواس\u200cشان نیست که چه تصویب می\u200cشود، اما بنده باید به ایشان بگویم که اینگونه نیست و کار کارشناسی روی این لایحه انجام شده است.\n\nوی در ادامه اظهار کرد: بر روی لایحه تجارت در کمیسیون قضایی و حقوقی به همراه کارشناسان این حوزه کار و مطالعه طولانی\u200cمدتی انجام شده است و همه همکاران حقوقی بر این لایحه اشراف دارند.\n\nنماینده مردم قائمشهر در مجلس شورای اسلامی در ادامه تاکید کرد: صحبت\u200cهای روز گذشته آقای سلیمی بازتابی را در رسانه\u200cها و صداوسیما داشت، در حالی که اینگونه نیست و مجلس شورای اسلامی بر روی این لایحه کار کارشناسی لازم را انجام داده است.\n\nانتهای پیام', url='https://www.isna.ir/news/98061105453/%D8%B1%D8%B6%DB%8C%D8%A7%D9%86-%D8%B1%D9%88%DB%8C-%D9%84%D8%A7%DB%8C%D8%AD%D9%87-%D8%AA%D8%AC%D8%A7%D8%B1%D8%AA-%DA%A9%D8%A7%D8%B1-%DA%A9%D8%A7%D8%B1%D8%B4%D9%86%D8%A7%D8%B3%DB%8C-%D9%84%D8%A7%D8%B2%D9%85-%D8%A7%D9%86%D8%AC%D8%A7%D9%85-%D8%B4%D8%AF%D9%87-%D8%A7%D8%B3%D8%AA', time='2019-09-02T04:48:00+00:00', cc_file='crawl-data/CC-NEWS/2019/09/CC-NEWS-20190902014743-01074.warc.gz') }) self._test_docs('neuclir/1/ru', count=4627543, items={ 0: ExctractedCCDoc(doc_id='ed4af92b-0039-453e-8f25-f359225da8e0', title='Сергей Аксёнов сообщил о регистрации в Крыму 247 новых случаев COVID-19', text='Глава Республики Крым Сергей Аксёнов сообщил на своей официальной странице в социальной сети «ВКонтакте» о регистрации 247 новых случаев коронавирусной инфекции в Крыму.', url='https://mirtesen.ru/dispute/43025555825/Sergey-Aksyonov-soobschil-o-registratsii-v-Kryimu-247-novyih-slu', time=None, cc_file='crawl-data/CC-NEWS/2021/07/CC-NEWS-20210711213016-00623.warc.gz'), 12345: ExctractedCCDoc(doc_id='3f82d6dd-23d4-4274-ac07-dce8745d369f', title='Количество зараженных коронавирусом в Беларуси на 9 июля 2021 года: число новых случаев за сутки растет несколько дней подряд, 11 смертей', text='За сутки в республике выявили 1 тыс. 
250 новых случаев COVID-19 Фото: Валерий ЗВОНАРЕВ\n\nСвежая статистика по коронавирусу в Беларуси 9 июля опубликована в Telegram-канале Минздрава. Так, за сутки в республике выявили 1 тыс. 250 новых случаев COVID-19, выписали из больниц 1 тыс. 335 пациентов.\n\nВ предыдущие два дня за сутки регистрировали около тысячи и чуть более тысячи новых случаев коронавируса, таким образом, уже третий день подряд этот показатель растет.\n\n«Всего с начала пандемии в Беларуси были зарегистрированы 425 тыс. 804 человека с положительным тестом на COVID-19. Выздоровели 418 тыс. 920 пациентов, у которых ранее был подтвержден диагноз COVID-19», – добавили в Минздраве.\n\nТакже за прошедшие сутки у мерли 11 пациентов с выявленным коронавирусом. За все время распространения инфекции на территории Беларуси скончались 3 тыс. 236 пациентов с выявленной коронавирусной инфекцией.\n\nТакже специалисты продолжают делать тесты. За минувшие сутки выполнено 17 тыс. 733 теста, а всего в стране проведено 6 млн. 941 тыс. 505 таких анализов.\n\nКроме того, в Беларуси продолжается вакцинация от коронавируса. Более 1,06 млн белорусов получили первую дозу вакцины против COVID-19, из них более 694,2 тыс. человек прошли полный курс вакцинации.\n\nЧитайте также:\n\nКак защититься от заразного индийского штамма корнавируса «Дельта», рассказал доктор Комаровский\n\nПо его словам, не стоит терять бдительность даже жителям тех стран, где мало случаев заражения штаммом «Дельта» (подробности).\n\nВакцины vs СOVID-19: в ВОЗ сказали, кто побеждает. Вам это не понравится\n\nВ ВОЗ прокомментировали ситуацию с новыми штаммами коронавируса (подробности).\n\nБез фастфуда и алкоголя. 
Диетолог рассказала, как питаться перед вакцинацией от ковида, чтобы прививка лучше подействовала\n\nСпециалист рассказала, что если человек всю жизнь питался фастфудом, порция брокколи перед прививкой вряд ли что-то изменит (подробности).', url='https://www.kp.by/online/news/4358793/', time='2021-07-09T14:10:25+03:00', cc_file='crawl-data/CC-NEWS/2021/07/CC-NEWS-20210709143545-00589.warc.gz'), 4627542: ExctractedCCDoc(doc_id='d79316fa-5986-404b-bd29-4eef59c153ea', title='Фанаты «Барселоны» хотят сделать Пике новым президентом клуба, а Хави — тренером', text='Защитник «Барселоны» Жерар Пике является фаворитом каталонских болельщиков на пост следующего руководителя клуба, сообщает AS.\n\n\n\nПолномочия нынешнего президента Хосепа Марии Бартомеу истекут в 2021 году. По сведениям источника, в команду 32-летнего Пике могут войти бывшие партнёры по команде — Хави и Карлес Пуйоль. Последний может занять пост спортивного директора, а Хави фанаты сине-гранатовых видят главным тренером команды.\n\n\n\nВместе с тем сообщается, что Пике пока не думает о завершении игровой карьеры и набирается управленческого опыта в других клубах — «Андорре» и «Химнастик Манресе», являясь их владельцем.\n\nНапомним, что 32-летний испанский футболист приобрёл контрольный пакет акций клуба «Химнастик Манреса» в конце июля текущего года. При этом бывший игрок сборной Испании взял на себя полную ответственность за выплату долгов клуба. Пике планирует сделать «Химнастик Манреса» филиалом приобретённого им в январе клуба «Андорра», который был основан в 1942 году и выступает в Примере Каталонии — пятом дивизионе чемпионата Испании.', url='https://www.championat.com/football/news-3839327-fanaty-barselony-hotjat-sdelat-pike-novym-prezidentom-kluba-a-havi---trenerom.html', time=None, cc_file='crawl-data/CC-NEWS/2019/09/CC-NEWS-20190902014743-01074.warc.gz') }) self._test_docs('neuclir/1/zh/hc4-filtered', count=519945, items={ 0: ExctractedCCDoc('cef2e118-8da8-4695-8854-1f5cd802f95c', '【圖輯】動物才需要皮毛! 
西班牙人「浴血裸躺」抗議皮草製品|香港01|世界說', '請使用下列任何一種瀏覽器瀏覽以達至最佳的用戶體驗:Google Chrome、Mozilla Firefox、Internet Explorer、Microsoft Edge 或Safari。為避免使用網頁時發生問題,請確保你的網頁瀏覽器已更新至最新版本。\n\n繼續瀏覽', re.compile('^https://www\\.hk01\\.com/%E4%B8%96%E7%95%8C%E8%AA%AA/272186/%E5%9C%96%E8%BC%AF\\-%E5%8B%95%E7%89%A9%E6%89%.{67}E4%BA%BA\\-%E6%B5%B4%E8%A1%80%E8%A3%B8%E8%BA%BA\\-%E6%8A%97%E8%AD%B0%E7%9A%AE%E8%8D%89%E8%A3%BD%E5%93%81$', flags=48), '2018-12-17T19:13:00+08:00', 'crawl-data/CC-NEWS/2018/12/CC-NEWS-20181218012339-00004.warc.gz'), 9: ExctractedCCDoc('058f053d-6a7e-41be-9ab7-402eaea1943e', '[表一6]上市上櫃股東會一覽表', re.compile('^代碼 股票名稱 日期 地 點 \\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\-\\- \\*3430 奇鈦科 20.{1331}\\) \\*3557 嘉威 20191015 台北市信義路五段5號 \\*1732 毛寶 20191017 新竹縣湖口鄉中華路22號2樓\\(新竹工業區管理中心\\) 註:有 \\* 號者為「臨時會」,無註記者為「常會」。$', flags=48), 'https://news.cnyes.com/news/id/4367796', '2019-08-16T07:58:22+08:00', 'crawl-data/CC-NEWS/2019/08/CC-NEWS-20190815235008-00897.warc.gz'), 519944: ExctractedCCDoc('5eaf1d5b-1591-4b42-9e6f-2c878b721d5e', '六福村徵鬼!當鬼王陪玩笑傲飛鷹 讓你選擇笑死還是嚇死', re.compile('^▲應徵上鬼王還能陪遊客一起搭乘笑傲飛鷹。(圖/六福村提供,下同)\n\n\n\n記者陳凱力/新竹報導\n\n中國人怕鬼,西洋人也怕鬼,全世界的人都怕鬼!鬼門要關了,但六福村主題遊樂園不關門,迎接即將到來的萬聖節,.{637}本演繹等測試,擁有其他特殊專長如體操、特技更是加分條件。你,準備好要挑戰當鬼王了嗎?我們不見不散!\n\n更多活動詳情及優惠,歡迎至六福村官網 、六福村官方線上購票 、FB粉絲專頁或猛鬼人力銀行網站查詢。$', flags=48), 'https://www.ettoday.net/news/20190828/1523501.htm', '2019-08-28T19:38:00+08:00', 'crawl-data/CC-NEWS/2019/08/CC-NEWS-20190828113707-01025.warc.gz'), }) self._test_docs('neuclir/1/fa/hc4-filtered', count=391703, items={ 0: ExctractedCCDoc('f5c30695-2fea-4a08-b87c-49a7e39c7945', 'کاهش نرخ رسمی ۲۵ ارز', re.compile('^امروز که همزمان با روز تعطیل رسمی در بازارهای جهانی است، نرخ دلار آمریکا همچنان بدون تغییر، ۴۲ هزار .{1518}۳۰ ریال، بولیوار جدید ونزوئلا چهار هزار و ۲۰۶ ریال و منات جدید ترکمنستان ۱۲ هزار ریال ارزش گذاری شد\\.$', flags=48), 
'https://www.irna.ir/news/83444978/%DA%A9%D8%A7%D9%87%D8%B4-%D9%86%D8%B1%D8%AE-%D8%B1%D8%B3%D9%85%DB%8C-%DB%B2%DB%B5-%D8%A7%D8%B1%D8%B2', '2019-08-21T05:56:07+00:00', 'crawl-data/CC-NEWS/2019/08/CC-NEWS-20190821054226-00948.warc.gz'), 9: ExctractedCCDoc('7eee1f43-a5a1-4777-89d6-3ff572b9245b', 'بورس آذربایجان\u200cغربی هفته گذشته هم مثبت بود', re.compile('^وحید جوادی روز شنبه در گفت و گو با خبرنگار ایرنا با بیان اینکه طی هفته گذشته شاخص کل بازار فرابورس ن.{840} ۲۲ اسفند سال ۱۳۸۳ شمسی با همت سازمان بورس و اوراق بهادار و استانداری این استان تاسیس شد\\.\n\n۳۲۱۴/۲۰۹۳$', flags=48), re.compile('^https://www\\.irna\\.ir/news/83422351/%D8%A8%D9%88%D8%B1%D8%B3\\-%D8%A2%D8%B0%D8%B1%D8%A8%D8%A7%DB%8C%D8%A.{51}%D8%AA%D9%87\\-%DA%AF%D8%B0%D8%B4%D8%AA%D9%87\\-%D9%87%D9%85\\-%D9%85%D8%AB%D8%A8%D8%AA\\-%D8%A8%D9%88%D8%AF$', flags=48), '2019-08-03T06:14:18+00:00', 'crawl-data/CC-NEWS/2019/08/CC-NEWS-20190803054534-00767.warc.gz'), 391702: ExctractedCCDoc('40ab1e8e-a937-4c9e-89a9-9495a021e47e', 'بهره\u200cبرداری از طرح آبرسانی تهران به سد تنظیمی کرج تا خرداد ۹۹', re.compile('^به گزارش خبرگزاری مهر به نقل از وزارت نیرو، سید حسن رضوی در حاشیه بازدید از تصفیه\u200cخانه ششم تهران از .{942}ایت می\u200cکند\\. بهره\u200cبرداری از تونل انتقال آب کرج – تهران و تصفیه\u200cخانه ششم، منوط به تکمیل این کانال است\\.$', flags=48), re.compile('^https://www\\.mehrnews\\.com/news/4704991/%D8%A8%D9%87%D8%B1%D9%87\\-%D8%A8%D8%B1%D8%AF%D8%A7%D8%B1%DB%8C\\-.{144}%D8%B8%DB%8C%D9%85%DB%8C\\-%DA%A9%D8%B1%D8%AC\\-%D8%AA%D8%A7\\-%D8%AE%D8%B1%D8%AF%D8%A7%D8%AF\\-%DB%B9%DB%B9$', flags=48), '2019-08-28T10:56:28+00:00', 'crawl-data/CC-NEWS/2019/08/CC-NEWS-20190828113707-01025.warc.gz'), }) self._test_docs('neuclir/1/ru/hc4-filtered', count=964719, items={ 0: ExctractedCCDoc('ecd810c8-4b67-4a53-a0bb-20e0214becde', 'Рафаэль Надаль – в четвертьфинале Открытого чемпионата Франции', re.compile('^Двое друзей встретились в парке, гуляя с собаками\\. 
Один из них предложил зайти позавтракать в ближай.{419}ки осведомился хозяин\\. \\- Чихуахуа\\? \\- удивился мужчина\\. \\- Так вот что они мне подсунули! еще анекдот!$', flags=48), 'https://www.33live.ru/novosti/02-06-2019-rafael-nadal-v-chetvertfinale-otkrytogo-chempionata-francii.html', None, 'crawl-data/CC-NEWS/2019/06/CC-NEWS-20190602191517-00127.warc.gz'), 9: ExctractedCCDoc('f70cc5be-3ff8-4f84-a6d4-c7515231ffbf', 'Парламент Франции одобрил закон об ограничении прав демонстрантов', re.compile('^Нижняя палата парламента Франции во вторник, 5 февраля, одобрила законопроект, расширяющий полномочи.{4654}го пострадали магазины на Елисейских полях, некоторые из которых разграбили\\. Автор: Никита Баталов\n\n$', flags=48), re.compile('^https://www\\.dw\\.com/ru/%D0%BF%D0%B0%D1%80%D0%BB%D0%B0%D0%BC%D0%B5%D0%BD%D1%82\\-%D1%84%D1%80%D0%B0%D0%B.{188}0%B0%D0%B2\\-%D0%B4%D0%B5%D0%BC%D0%BE%D0%BD%D1%81%D1%82%D1%80%D0%B0%D0%BD%D1%82%D0%BE%D0%B2/a\\-47375953$', flags=48), None, 'crawl-data/CC-NEWS/2019/02/CC-NEWS-20190205195434-00321.warc.gz'), 964718: ExctractedCCDoc('c89674e1-fbaf-48f9-9ce2-2517968dc020', 'Пугачева посмеялась над Лободой и назначила своей преемницей Вайкуле', re.compile('^Народная артистка СССР Алла Пугачева и певица Лайма Вайкуле посмеялись над видеороликом в инстаграме.{3749} тем, кто неуважительно писал о звезде, поближе познакомиться с ее творчеством и почитать биографию\\.$', flags=48), 'https://www.gazeta.ru/culture/2019/08/28/a_12607039.shtml', '2019-08-28T14:38:12+03:00', 'crawl-data/CC-NEWS/2019/08/CC-NEWS-20190828113707-01025.warc.gz'), }) self._test_docs('neuclir/1/multi', count=10038768, items={ 0: ExctractedCCDoc('95a18cdd-da7a-4815-a07d-7365d8c2e268', '欧科云链链上大师重磅上线,一起来用“链上Bloomberg”听听行业脉搏跳动', re.compile('^Wednesday, 7 July 2021, 18:25 HKT/SGT Share: 欧科云链链上大师重磅上线,一起来用“链上Bloomberg”听听行业脉搏跳动\n\n香港, 2021年7月7日 \\-.{2347}k\n\n\n\nCopyright © 2021 ACN Newswire\\. All rights reserved\\. 
A division of Asia Corporate News Network\n\n$', flags=48), 'https://ch.acnnewswire.com/article.asp?art_id=67895', None, 'crawl-data/CC-NEWS/2021/07/CC-NEWS-20210707164449-00551.warc.gz'), 9: ExctractedCCDoc('1c1925e5-4fa4-4ea0-934a-d050dde6d78c', '加比爾捷西斯停賽2場 無緣美洲盃決賽|即時新聞|體波|on.cc東網', re.compile('^巴西殺入美洲國家盃決賽,不過效力曼城嘅前鋒加比爾捷西斯就冇份參加,南美足協周三\\(7日\\)宣布,加比爾捷西斯喺8強對智利施展嘅「天殘腳」紅牌,要停賽2場兼罰款5,000美元,意味4強對秘魯已經開始坐波監嘅.{52}要同隊友道歉,但呢支球隊再次展示出強大力量,我會嘗試從錯誤中繼續學習。」\n\n2年前加比爾捷西斯喺美洲國家盃決賽入波協助巴西捧盃,但同時亦食紅牌,加埋上場被逐,佢成為領隊迪迪領軍以來首個2次食紅牌球員。$', flags=48), 'https://hk.on.cc/hk/bkn/cnt/sport/20210707/bkn-20210707210135443-0707_00882_001.html', None, 'crawl-data/CC-NEWS/2021/07/CC-NEWS-20210707140801-00548.warc.gz'), 10038767: ExctractedCCDoc('d79316fa-5986-404b-bd29-4eef59c153ea', 'Фанаты «Барселоны» хотят сделать Пике новым президентом клуба, а Хави — тренером', re.compile('^Защитник «Барселоны» Жерар Пике является фаворитом каталонских болельщиков на пост следующего руково.{849}оторый был основан в 1942 году и выступает в Примере Каталонии — пятом дивизионе чемпионата Испании\\.$', flags=48), 'https://www.championat.com/football/news-3839327-fanaty-barselony-hotjat-sdelat-pike-novym-prezidentom-kluba-a-havi---trenerom.html', None, 'crawl-data/CC-NEWS/2019/09/CC-NEWS-20190902014743-01074.warc.gz'), }) def test_queries(self): self._test_queries('neuclir/1/zh/hc4-filtered', count=60, items={ 0: ExctractedCCQuery('1', 'Asteroids Endangering Earth', 'Articles related to asteroids that pose danger of impact to Earth.', '小行星危害地球', '与对地球构成撞击危害的小行星相关的文章。', 'Asteroids Endangering Earth', '与对地球产生影响的危险的天蝎座有关的文章。', {'very_valuable': 'Details about asteroids that currently or in the future may pose danger of impact to Earth (names, measurements, age, etc).\nInformation on the risk of an asteroid impacting Earth.', 'somewhat_valuable': 'N/A', 'not_that_valuable': 'Information on discussions about asteroids possibly impacting Earth at some point.', 'non_relevant': 'Details about asteroids that previously impacted 
Earth or Earth’s atmosphere.\nInformation about asteroids and the asteroid belt in general.\nGeneral research conducted on asteroids that is not focused on the danger of impact.\nInformation about asteroids impacting other planets.\nInformation on research about other planets.\n'}, '2011 UL21 briefly had about a 1 in a million chance of impacting in 2029. Its cumulative impact probability dropped to 1 in 71 million by 2 November 2011 when the observation arc reached 15 days. It was removed from the Sentry Risk Table on 4 November 2011 when all impact scenarios for the next 100 years or more were ruled out. During 2029, the closest approach to Earth is 1.6 AU. Palomar Observatory precovery images from 1989 and 1990 have extended the observation arc to 22 years. Its next notable close approach to the Earth will be on June 27, 2024 at a distance of 0.044 AU (6,600,000 km; 4,100,000 mi).\n\nWith an absolute magnitude of 15.8, it is one the brightest and therefor largest potentially hazardous asteroids (PHA) detected since (242450) 2004 QY2. The next largest PHA (based on absolute magnitude) discovered in 2011 is 2011 WO41 with an absolute magnitude of 16.8. 
', 'https://en.wikipedia.org/w/index.php?title=(415029)_2011_UL21&oldid=877055001', '2019-01-06', 'zh'), 9: ExctractedCCQuery('11', 'UK Defense Secretary leaking Huawei information', 'Find information about UK Prime Minister Theresa May firing Defense Secretary Gavin Williamson for leaking information about a Huawei deal from a National Security Council meeting.', '英国防大臣泄露华为情报', '查找与英国首相特蕾莎·梅将国防大臣加文·威廉姆森因泄露国家安全委员会会议有关华为交易情报而解职的相关信息。', '英国国防信息泄露华威', '寻找有关英国总理Theresa May firing Defense的信息,以了解国家安全委员会会议上关于Huawei交易的信息。', {'very_valuable': 'Specific reasons UK Defense Secretary Gavin Williamson was fired', 'somewhat_valuable': 'General information about the Huawei controversy', 'not_that_valuable': 'documents about the UK decision on Huawei 5G', 'non_relevant': 'documents about Huawei'}, 'In December 2018, Williamson expressed "grave" and "very deep concerns" about the Chinese telecommunications company Huawei providing technology to upgrade Britain\'s services to 5G. He accused China of acting "sometimes in a malign way". China\'s Defence Ministry spokesman Wu Qian criticized Williamson’s comments, saying "The remarks just reinforced the deep-rooted ignorance, prejudice and anxiety among some British people."\n\nOn 11 February 2019 Williamson delivered the speech "Defence in Global Britain" at the Royal United Services Institute outlining the future direction of the U.K. Armed Forces. The speech, which amongst a number of themes presented therein, proposes an increased role of the British military in the Indo-Pacific via the introduction of new and additional assets, including the F-35 carrying HMS Queen Elizabeth, was met with the consternation of the Chinese Government, leading Chinese Vice Premier Hu Chunhua to cancel trade talks with Chancellor of the Exchequer Philip Hammond. Hammond later stated that "no decisions have been made or even discussed about where [the aircraft carrier\'s] early deployments might be. 
And when those decisions are made, they will be made in the National Security Council."\n\nCritics of Williamson\'s approach to his Defence Secretary role have compared him to Private Pike, a hapless and immature character in the popular sitcom Dad\'s Army. ', 'https://en.wikipedia.org/w/index.php?title=Gavin_Williamson&oldid=891214674', '2019-04-06', 'zh'), 59: ExctractedCCQuery('228', '2016 Chicago Cubs accomplishments', 'How did the Chicago Cubs do during the 2016 MLB baseball season?', '2016年芝加哥小熊队成绩', '在 2016 年 MLB 棒球赛季期间,芝加哥小熊队个人和团队取得了哪些成就?', '2016年芝加哥Cubs accomplishment', '芝加哥的Cubs是如何在2016年的MLB基础上做的?', {'very_valuable': 'details about Cubs regular season, the playoffs or winning the World Series; individual accomplishments of Cubs players ', 'somewhat_valuable': 'documents focused on the 2017 season that mention the Cubs 2016 accomplishments ', 'not_that_valuable': 'passing mention of 2016 World Series', 'non_relevant': 'Other Cubs seasons '}, "The 2016 Chicago Cubs season is the 145th season of the Chicago Cubs franchise, the 141st in the National League and the Cubs 101st season at Wrigley Field. To celebrate their 100 years at Wrigley, the Cubs wore a patch on their home uniforms and wore 1916 throwback uniforms on July 6.\n\nThey began the season on the road against the Los Angeles Angels on April 4, 2016 and will finish the regular season on October 2, 2016 at the Cincinnati Reds.\n\nJune\nThe Cubs began June with the best run differential, plus 129, and a record of 35–15, also the best. Cubs starting pitchers began the month with a combined earned run average of 2.38.\nJune 1 – Major League Baseball announces results of early All Star Game voting. The Cubs have five players in starting positions: Kris Bryant, Addison Russell, Ben Zobrist, Anthony Rizzo, and Dexter Fowler.\nJune 1 – Jon Lester pitches a complete game four-hitter as the Cubs beat the Dodgers 2–1.\nJune 8 – Major League baseball releases update of All Star Game voting. 
The Cubs have received the four highest amount of votes among NL teams: Anthony Rizzo, Kris Bryant, Dexter Fowler, and Ben Zobrist. Addison Russell is also leading at SS.\nJune 11 – Jake Arrieta's 8–2 victory in Atlanta gives him a record of 7–0 in road starts and 10–1 overall.\nJune 13 – Jon Lester is named National League Player of the Week.\nJune 18 – Dexter Fowler leaves game after first inning with hamstring discomfort.\nJune 19 – Willson Contreras hits a home run on the first pitch of his first major-league at-bat becoming the 30th player in the Modern MLB Era to do so and the eighth player in Cubs history to homer in his first at-bat.\nJune 22 – Miguel Montero leaves game after play at the plate with apparent right knee injury. The Cubs are swept by the Cardinals at Wrigley Field for the first time since 1988.\nJune 27 – Kris Bryant becomes first player in MLB history to hit three home runs and two doubles in one game, going 5–5 in the game with six RBI's. His 16 total bases set a franchise record and he becomes the youngest player in club history to hit three home runs in the same game.\nJune 28 – A game at Cincinnati went into extra innings as a 2–2 tie. By the end of the 12th inning, both teams had used up all their position players. The Cubs used three pitchers to play left field. Travis Wood entered the game in left in the 14th with Spencer Patton beginning the inning at pitcher. He retired the first batter and was sent to left field and Wood replaced him on the mound. Following a groundout, the two players switched positions again. After the Cubs took the lead in the 15th inning, Pedro Strop replaced Patton and played left field with Wood returning to pitch. Wood finished the game with 1 1/3 innings pitched, allowing one hit and striking out three. It was the first time since 1961 that three pitchers moved from the mound to outfield and back in the same game. 
The last time two pitchers did so was in 1986, when the visiting Mets switched Jesse Orosco and Roger McDowell against Cincinnati. In the 15th, Javier Baez hit a grand slam, his first career grand slam, which is the latest grand slam, by inning, in Cubs history.\nJune 30 – Jon Lester is named National League Pitcher of the Month for June.\n\nJuly\nThe Cubs began July with the best record in baseball, 51–27, the best run differential at plus 169 and having a run of 10 consecutive months of at least .500 play. The pitching staff had the lowest ERA, the fewest hits allowed, runs allowed, earned runs allowed, and the lowest batting average by opponents in MLB. The Cubs offense when compared to all teams was second in on base percentage, second in base runners who eventually score, second in the number of RBI's and first in walks. Kris Bryant was tied with Todd Frazier of the Chicago White Sox and Mark Trumbo of the Baltimore Orioles for the Major League Home Run lead with 23 and was 4th with 61 RBI's. Anthony Rizzo had 60.\nThe July 12th All-Star game in San Diego will begin with the entire Cubs infield (Bryant, Rizzo, Russell, and Zobrist) as starters. Rizzo led all National League players in votes. Zobrist beat out Daniel Murphy by 88 votes. Fowler, though injured, was the top vote getter for National League outfielders. Lester and Arrieta were also named to the team.\nJuly 3 – The Cubs are swept by the New York Mets. Cubs begin July 0–3 and losers of 10 of their last 14 games.\nJuly 10 – The Cubs enter the All-Star break at 53–35 having lost 15 out of their last 21 games. However, the Cubs' lead of seven games over St. Louis in the division is the largest lead in baseball.\nJuly 20 – The Mets and Cubs wear throwback uniforms in the series finale ahead of the weekend’s Hall of Fame inductions. 
Six other teams are also wearing vintage uniforms.\nJuly 20 – Rizzo homers twice in Cubs 6–2 win over the Mets giving him 24 on the season, one behind the National League lead of 25 by teammate Bryant. Kyle Hendricks' ERA is 2.27 which is third best in MLB behind Clayton Kershaw and Madison Bumgarner.\nJuly 22 – Dexter Fowler returns from the disabled list and leads off the game with a home run and drives in three runs as the Cubs beat the Milwaukee Brewers 5–2.\nJuly 27 – Addison Russell hits his first career grand slam as part of a five-run eighth inning as the Cubs defeat the Chicago White Sox 8–1. Javier Baez and Kris Bryant also homer as the Cubs snap a two-game losing streak. Bryant's homer, his 26th, ties his total home runs from last season. Newly acquired Aroldis Chapman pitches a perfect ninth inning.\nJuly 31 – The Cubs beat the Seattle Mariners 7–6 in 12 innings. In a game started by Brian Matusz, who departed after giving up six runs in three innings, the Cubs bullpen pitches nine innings of scoreless relief. Trailing 3–6 in the bottom of the ninth, the Cubs rallied to tie the game and force extra innings. Travis Wood again played left field after pitching in the sixth inning. He made a catch up against the wall in the seventh and returned to pitch in the eighth inning. Having exhausted all position players and bullpen pitchers, the Cubs were forced to have Jon Lester pinch hit in the bottom of the 12th after Jason Heyward doubled and moved to third on a Willson Contreras sacrifice fly. With two strikes, Lester executed a safety squeeze bunt and Heyward scored to win the game.\nThe Cubs had a record of 12–14 for the month marking the first time in manager Joe Maddon's tenure that the Cubs had a record under .500 in a single month.\n\nAugust\nThe Cubs began August with the best MLB record of 63–41. 
The pitching staff once again had the lowest ERA, the fewest hits allowed, fewest runs allowed, fewest earned runs allowed, the lowest batting average by opponents in MLB and were among the leaders in fewest home runs allowed and in striking out opponent batters. The Cubs offense, when compared to all major league teams, was among the leaders in on-base percentage, base runners who eventually score, RBI's and drawing walks. Bryant and Rizzo were among the major league leaders in home runs and RBI's.\nAugust 1 – Kyle Hendricks throws a complete game shutout and lowers his ERA to 2.22, third best in the National League. His ERA at Wrigley Field this season is 1.19.\nAugust 3 – Cubs complete sweep of Miami Marlins with three-run ninth inning for the win.\nAugust 7 – Cubs beat Oakland A's 3–1 to sweep series and extend current winning streak to seven games. The win also puts the Cubs at a season-high 28 games over .500.\nAugust 8 – Kyle Hendricks is named National League Player of the Week for the first week of August.\nAugust 9 – Cubs defeat Angels 5–1, their eighth straight win, to become the first team to win 70 games. Bryant homers and drives in his 70th run becoming the first Cubs player ever to drive in 70 runs in each of his first two seasons.\nAugust 11 – Cubs defeat Cardinals 4–3 in 11 innings, their 10th straight win, on a walk-off walk. The win moved the Cubs to a season-high 13 game lead over the Cardinals in the NL Central.\nAugust 12 – Cubs defeat Cardinals 13–2, hitting five home runs and winning their 11th straight game. The lead in the NL Central increases to 14 games. The Cubs magic number to win the division stands at 34.\nAugust 18 – Cubs defeat the Brewers 9–6, completing a four-game sweep and move to a season-high 34 games over .500. Kris Bryant goes 5–5 with two home runs and five RBIs in the win.\nAugust 22 – Kyle Hendricks leads the Majors in ERA for pitchers with over 140 innings at 2.16. 
Jake Arrieta is fourth at 2.75 and Jon Lester is fifth at 2.81. Kris Bryant is the first player to score 100 runs this season.\nAugust 23 – With the Cubs 5–3 victory over the San Diego Padres, the Cubs moved to a season-high 35 games over .500 at 80–45. Their magic number to win the division moved 25. Jake Arrieta won his league-leading 16th game of the season and lowered his ERA to 2.62. Addison Russell hit his fifth home run in the last five games. The Cubs hit multiple home runs in their last seven games, the longest streak since the Cubs hit multiple home runs in an eight-game stretch from June 25 through July 2, 1961.\nAugust 26 – The Cubs defeated the Los Angeles Dodgers 6–3 to move to a season-high 37 games over .500 and moved to season-high 14-game lead in the division.\nAugust 29 – Following back-to-back road losses to the Los Angeles Dodgers, the Cubs returned home with a 14 game lead in the division and a 14.5 game lead over the Pittsburgh Pirates. The Cubs jumped out to a 3–0 lead early, but Jake Arrieta gave up the lead and trailed 6–3 in the eighth. Willson Contreras hit a two-run homer in the eighth and Jorge Soler tied with a homer in the ninth to send the game into extra innings. The Cubs won it in the 13th inning after giving up the go-ahead run in the top of 13th. They scored twice in the bottom of the 13th with a game-winning single by Miguel Montero. 
The Cubs magic number to win the division moved to 19 and their magic number to secure home-field advantage in the National League playoffs moved to 25.", 'https://en.wikipedia.org/w/index.php?title=2016_Chicago_Cubs_season&oldid=736895533', '2016-08-30', 'zh'), }) self._test_queries('neuclir/1/fa/hc4-filtered', count=60, items={ 0: ExctractedCCQuery('1', 'Asteroids Endangering Earth', 'Articles related to asteroids that pose danger of impact to Earth.', 'سیارک ها\xa0کره زمین را به خطر می اندازند', 'مقالات مربوط به سیارک ها که خطر ضربه به کره زمین را دارند', 'سیارات در معرض خطر زمین', 'مقالات مربوط به سیارک \u200c هایی که خطر برخورد با زمین را تهدید می \u200c کنند.', {'very_valuable': 'Mention of asteroids striking Earth with description', 'somewhat_valuable': 'Mention of asteroids striking Earth but with little information', 'not_that_valuable': 'May talk about asteroids without real mention of their danger', 'non_relevant': 'Describes meteorites, planets, stars with no relation to striking Earth'}, '2011 UL21 briefly had about a 1 in a million chance of impacting in 2029. Its cumulative impact probability dropped to 1 in 71 million by 2 November 2011 when the observation arc reached 15 days. It was removed from the Sentry Risk Table on 4 November 2011 when all impact scenarios for the next 100 years or more were ruled out. During 2029, the closest approach to Earth is 1.6 AU. Palomar Observatory precovery images from 1989 and 1990 have extended the observation arc to 22 years. Its next notable close approach to the Earth will be on June 27, 2024 at a distance of 0.044 AU (6,600,000 km; 4,100,000 mi).\n\nWith an absolute magnitude of 15.8, it is one the brightest and therefor largest potentially hazardous asteroids (PHA) detected since (242450) 2004 QY2. The next largest PHA (based on absolute magnitude) discovered in 2011 is 2011 WO41 with an absolute magnitude of 16.8. 
', 'https://en.wikipedia.org/w/index.php?title=(415029)_2011_UL21&oldid=877055001', '2019-01-06', 'fa'), 9: ExctractedCCQuery('23', 'Technical US-South Korean Military Exercise', 'Find information regarding technical or computer-related military exercises that took place after the original U.S. and North Korean summit.', 'رزمایش فنی آمریکا و کره جنوبی', 'اطلاعات مربوط به تمرینات نظامی فنی یا رایانه ای را که پس از اتحاد اصلی ایالات متحده و اجلاس کره شمالی انجام شد ، بیابید.', 'فنی ایالات متحده و کره جنوبی ورزش نظامی', 'اطلاعات مربوط به تمرینات نظامی فنی یا کامپیوتری را که پس از اجلاس اصلی ایالات متحده و کره شمالی برگزار شد ، پیدا کنید.', {'very_valuable': 'simulated computer and technical drills between the US and South Korea.', 'somewhat_valuable': 'na', 'not_that_valuable': 'na', 'non_relevant': 'Info about large exercises, tensions between the US and North Korea, missile tests, letters sent to Trump. Computer game addiction. US simulating battle against Iran. '}, "Foal Eagle (Korean: 독수리 연습) is a combined field training exercise (FTX) conducted annually by the Republic of Korea Armed Forces and the United States Armed Forces under the auspices of the Combined Forces Command. It is one of the largest military exercises conducted annually in the world. Foal Eagle has been a source of friction with the government of Democratic People's Republic of Korea (DPRK) and domestic ROK critics.\n\nFoal Eagle is an exercise conducted by the US and ROK armed forces, consisting of rear area security and stability operations, onward movement of critical assets to the forward area, special operations, ground maneuver, amphibious operations, combat air operations, maritime action group operations and counter special operations forces exercises (CSOFEX).\n\nThe United Nations Command informs the North Korean People's Army that South Korea and the United States will be conducting the exercise. 
The United Nations Command also reassured the Korean People's Army at general officer-level talks that these exercises, conducted annually in or around March, are purely defensive in nature[citation needed] and have no connection to ongoing or current events. The Neutral Nations Supervisory Commission monitors the exercise for violations of the Korean Armistice Agreement.\n\nSince 2001, Foal Eagle combined with the annual American-South Korean Reception, Staging, Onward movement, and Integration (RSOI) combined exercises, with RSOI being renamed Key Resolve in 2008. On June 12, 2018, US President Donald Trump announced that the US would abandon the joint military exercises with South Korea.", 'https://en.wikipedia.org/w/index.php?title=Foal_Eagle&oldid=845573161', '2018-06-12', 'fa'), 59: ExctractedCCQuery('208', 'Bird reintroduction to wild', 'Which species of birds of prey have been successfully reintroduced into the wild?', 'معرفی مجدد پرندگان به طبیعت', 'کدام گونه از پرندگان شکاری با موفقیت دوباره در طبیعت وارد شده اند؟', 'معرفی مجدد پرنده به وحشی', 'کدام گونه از پرندگان شکاری با موفقیت در حیات وحش معرفی شده \u200c اند ؟', {'very_valuable': 'Articles which specified specific species, such as the Golden Eagle, or the Horned Owl, and further specified them as birds of prey, and spoke about injuries they had suffered causing them to need rehab, and then being reintroduced to the wild. 
', 'somewhat_valuable': 'Articles which specified general species, such as eagles and owls and raptors, but did not specify specific species, then went on to talk about their rehab and reintroduction to the wild', 'not_that_valuable': 'Articles which spoke of birds of prey being reintroduced to the wild, but spoke of neither general nor specific species', 'non_relevant': 'Articles about Eagles, poachers catching birds, the reintroduction of foxes and squirrels into the wild and various other animal topics that were close but not central'}, 'North African ostrich in Morocco, Nigeria, Niger and Tunisia (ongoing)\n\nSarus cranes in Thailand (ongoing)\n\nShort-tailed albatross in Japan (successful)\n\nBlack grouse to Derbyshire, England – (ongoing)\n\nCommon crane to Somerset, England – (ongoing)\n\nCorncrake to Cambridgeshire, England – (ongoing)\n\nEuropean black vulture in the Massif Central in France - (successful)\n\nGolden eagle in Ireland (ongoing)\n\nGreat bustard to Salisbury Plain, England – (ongoing)', 'https://en.wikipedia.org/w/index.php?title=Species_reintroduction&oldid=869254157', '2018-11-17', 'fa'), }) self._test_queries('neuclir/1/ru/hc4-filtered', count=54, items={ 0: ExctractedCCQuery('3', 'British royal news impacts', 'What political and economic impacts does news about the British royal family have domestically and abroad?', 'Влияние британских королевских новостей', 'Какое политическое и экономическое влияние новости о британской королевской семье имеют внутри страны и за рубежом?', 'Британские королевские новости влияют', 'Какие политические и экономические последствия имеют новости о британской королевской семье как внутри страны, так и за рубежом?', {'very_valuable': 'Information regarding economic and political impacts of the British royal family in the UK and worldwide', 'somewhat_valuable': 'Information related to the British royal family and their interactions with politics or the economy', 'not_that_valuable': 'information about the 
royal family', 'non_relevant': 'information not about the royals'}, "Announcement of engagement\nPrincess Eugenie of York in 2013\n\nPrincess Eugenie of York is the second daughter of Prince Andrew, Duke of York and Sarah, Duchess of York. Jack Brooksbank is a British nightclub manager, a distant relation of the Brooksbank baronets,[notes 1] and a third cousin twice removed of Princess Eugenie through Thomas Coke, 2nd Earl of Leicester.[notes 2] The couple have been dating for seven years; they were first introduced by friends in a ski break in Verbier, Switzerland, where Brooksbank was working at the time.\n\nOn 22 January 2018, Buckingham Palace announced that Princess Eugenie of York would marry Jack Brooksbank in the autumn. They were engaged earlier the same month while in Nicaragua with Brooksbank giving the Princess an oval-cut Padparadscha sapphire surrounded by a halo of diamonds set on a gold band with two further diamonds on the shoulders. The ring bears a striking similarity to the engagement ring of Princess Eugenie's mother.\n\nEugenie was eighth in the line of succession to the British throne at the time the engagement was announced. As of May 2018, she is ninth in the line of succession. Although Eugenie is a member of the British royal family, she does not require the Queen's permission to marry.[notes 3] The Duke and Duchess of York expressed their delight at the news and the British Prime Minister Theresa May congratulated the couple on her Twitter account. After the announcement the couple gave an interview to Matt Baker of BBC One.\n\nThe official engagement photographs were taken in the Picture gallery at Buckingham Palace. 
", 'https://en.wikipedia.org/w/index.php?title=Wedding_of_Princess_Eugenie_and_Jack_Brooksbank&oldid=841040200', '2018-05-13', 'ru'), 9: ExctractedCCQuery('111', 'Chinese regulation of Fentanyl', "Has the addition of fentanyl to China's list of regulated narcotic drugs been successful in curbing its production and distribution?", 'Китайское регулирование Фентанила', 'Помогло ли добавление фентанила в список регулируемых наркотиков Китая ограничить его производство и распределение?', 'Китайское регулирование Фентанила', 'Успешно ли было включение фентанила в перечень регулируемых наркотических средств Китая с точки зрения ограничения его производства и распространения?', {'very_valuable': 'One article focused on the very question in the description. The assessment by the US think tank, Rand, China lacks the resources to effectively control opioid exporters. Over 40 years, Chinese manufacturers have reoriented themselves to export, especially through the internet. The industry has about five thousand companies producing about two million tons of product per year. Also, many operate without government oversight. Besides the US, countries like Russia who share a border with China are also suffering from the influx of fentanyl and other drugs into their countries. The Chinese government has so far failed to control the flow. ', 'somewhat_valuable': "Three articles focused on China putting fentanyl on the list of controlled substances for export, but also with the caveat that the China is not the sole source of this product. Also, one article focused on US sanctions against certain Chinese entities because of their exporting fentanyl to the US. US stated that the regulated Chinese list isn't working. ", 'not_that_valuable': 'One article is about Trump announcing new tariffs to China mostly because of the trade war, but one reason mentioned was China’s failure to limit sales of fentanyl to US. 
Another article was about Trump’s Tweets calling for the ban of fentanyl from China to US', 'non_relevant': 'Trump’s request to Mexico and China to cooperate against the opioid crisis; China’s black listing of certain suspect people and companies (no details); UN’s opioid report; Drug problems in Russia, Estonia, Ukraine and Kazakhstan; Heroin problem; DoJ going after pharmaceutical companies; innovative vaccine to battle opioid crisis; Trump signed legislation to counter the smuggling of opioids into the United States; US sanctions against Korea and China; The Chinese government intends to impose sanctions against American companies that are involved in the supply of arms to Taiwan; how the opioid crisis grew in the US; US sanctions against Russia; Russia’s sanctions against Ukraine'}, 'Several large quantities of illicitly produced fentanyl have been seized by U.S. law enforcement agencies. In November 2016, the DEA uncovered an operation making counterfeit oxycodone and Xanax from a home in Cottonwood Heights, Utah. They found about 70,000 pills in the appearance of oxycodone and more than 25,000 in the appearance of Xanax. The DEA reported that millions of pills could have been distributed from this location over the course of time. The accused owned a pill press and ordered fentanyl in powder form from China. A seizure of a record amount of fentanyl occurred on February 2, 2019 by U.S. Customs and Border Protection in Nogales, Arizona. The 254 pounds (115 kg) of fentanyl, which was estimated to be worth US$3.5M, was buried under a pile of cucumbers and stowed under a special floor compartment.\n\nThe "China White" form of fentanyl refers to any of a number of clandestinely produced analogues, especially α-methylfentanyl (AMF). This Department of Justice document lists "China White" as a synonym for a number of fentanyl analogues, including 3-methylfentanyl and α-methylfentanyl, which today are classified as Schedule I drugs in the United States. 
Part of the motivation for AMF is that, despite the extra difficulty from a synthetic standpoint, the resultant drug is relatively more resistant to metabolic degradation. This results in a drug with an increased duration.\n\nIn June 2013, the United States Centers for Disease Control and Prevention (CDC) issued a health advisory to emergency departments alerting to 14 overdose deaths among intravenous drug users in Rhode Island associated with acetylfentanyl, a synthetic opioid analog of fentanyl that has never been licensed for medical use. In a separate study conducted by the CDC, 82% of fentanyl overdose deaths involved illegally manufactured fentanyl, while only 4% were suspected to originate from a prescription.\n\nBeginning in 2015, Canada has seen a number of fentanyl overdoses. Authorities suspected that the drug was being imported from Asia to the western coast by organized crime groups in powder form and being pressed into pseudo-OxyContin tablets. Traces of the drug have also been found in other recreational drugs including cocaine, MDMA, and heroin. The drug has been implicated in multiple deaths from the homeless to young professionals, including multiple teens and young parents. Because of the rising deaths across the country, especially in British Columbia where the deaths for 2016 is 668 and deaths for 2017 (January to October) is 999,Health Canada is putting a rush on a review of the prescription-only status of naloxone in an effort to combat overdoses of the drug.\n\nFentanyl has been discovered for sale in illicit markets in Australia in 2017 and in New Zealand in 2018. In response, New Zealand experts called for wider availability of naloxone.\n\nFentanyl has started to make its way into heroin and oxycodone, and more recently, cocaine. A kilogram of heroin laced with fentanyl may sell for US$1.6 million, but the fentanyl itself may be produced far more cheaply for about US$6,000 per kilogram. 
Fentanyl is often produced in China and exported illegally to the U.S.', 'https://en.wikipedia.org/w/index.php?title=Fentanyl&oldid=890112151#Overdose', '2019-03-30', 'ru'), 53: ExctractedCCQuery('256', 'Causes of Mediterranean migrants drownings', 'What are the causes of migrants drowning while attempting to cross the Mediterranean Sea?', 'Причины утопления средиземноморских мигрантов', 'По каким причинам мигранты тонут при попытке пересечь Средиземное море?', 'Причины утопления средиземноморских мигрантов', 'Каковы причины утопления мигрантов при попытке пересечь Средиземное море?', {'very_valuable': 'N/A', 'somewhat_valuable': 'Articles that describe specifically the causes of drowning, such as smugglers overcrowding boats, as well as using rubber rafts or unseaworthy boats, also if a boat capsized due to weather. Another cause of drowning is whether rescuers, such a the Libyan Navy, have the resources to reach every immigrant. \n ', 'not_that_valuable': 'Articles that hint at the cause of boats capsizing, such as when a number is mentioned, or a rubber raft is mentioned. ', 'non_relevant': 'Articles on politicians’ views of the drownings; UN position or efforts on refugees; article on the aircraft carrier Admiral Kuznetsov; statistical information regarding refugees (country of origin, number, age); European countries’ positions or efforts regarding refugees; articles on successful rescues; immigrant situation impact on European domestic politics; reports of individual incidents without mentioning a cause; boat accidents and drownings not related to the Mediterranean crossing'}, 'Illegal immigration from Africa to Europe is significant. Many people from underdeveloped African countries embark on the dangerous journey for Europe, in hopes of a better life. In parts of Africa, particularly Mauritania and Morocco, trafficking of immigrants to Europe has become more lucrative than drug trafficking. Some migrants die during the journey. 
Most of those whose claim for asylum were unsuccessful are deported back to Africa. Libya is the major departure point for irregular migrants setting off for Europe.\n\nBetween October 2013 and October 2014, the Italian government ran Operation Mare Nostrum, a naval and air operation intended to reduce irregular immigration to Europe and the incidence of migratory ship wreckages off the coast of Lampedusa. The Italian government ceased the operation as it was judged to be unsustainable, involving a large proportion of the Italian navy. The operation was replaced by a more limited joint EU border protection operation, named Operation Triton managed by the EU border agency, Frontex. Some other European governments, including Britain\'s, argued that the operations such as Mare Nostrum and Triton serve to provide an "unintended pull factor" encouraging further migration.\n\nIn 2014, 170,100 irregular migrants were recorded arriving in Italy by sea (an increase from 42,925 arrivals recorded in 2013), 141,484 of them leaving from Libya. Most of them came from Syria, the Horn of Africa and West Africa.\n\nThe issue returned to international headlines with a series of migrant shipwrecks, part of the 2015 Mediterranean migration crisis. The International Organization for Migration (IOM) estimates suggest that between the start of 2015 and the middle of April, 21,000 migrants had reached the Italian coast and 900 migrants had died in the Mediterranean. 
Critics of European policy towards irregular migration in the Mediterranean argue that the cancellation of Operation Mare Nostrum failed to deter migrants and that its replacement with Triton "created the conditions for the higher death toll."', 'https://en.wikipedia.org/w/index.php?title=African_immigration_to_Europe&oldid=865282890#Illegal_immigration', '2018-10-22', 'ru'), }) self._test_queries('neuclir/1/zh/trec-2022', count=49, items={ 0: ExctractedCCNoReportQuery('0', 'Iranian female athletes refugees', 'I am looking for stories about Iranian female athletes who seek asylum in other countries.', 'Find articles that identify female athletes who refused to return to Iran after competition and slam government by seeking asylum from other countries. Relevant documents must identify the athlete by name, and spell out the reason for seeking asylum and if they joined Refugee Team in Olympic competition. "Official" or unofficial reasons are equally acceptable, as long as the document gives some reason that the athlete refused to return after the competition,name of the countries that accepted their request, and Iran government reaction.', '伊朗女运动员难民', '我在查找有关在其他国家寻求庇护的伊朗女运动员的故事。', '查找有关在比赛后通过寻求其他国家的庇护拒绝返回伊朗并抨击政府的女运动员的文章。相关文章必须注明运动员的姓名,并说明寻求庇护的原因以及他们在奥运会比赛中是否加入了难民队。 “官方”或非官方理由同样可以接受,只要文章给出运动员在比赛结束后拒绝返回的理由、接受其请求的国家名称以及伊朗政府的反应。', '伊朗女运动员难民', '我正在寻找有关在其他国家寻求庇护的伊朗女运动员的故事。', '查找那些在比赛后拒绝返回伊朗的女运动员的文章,并通过从其他国家寻求庇护来抨击政府。相关文件必须注明运动员的姓名,并说明寻求庇护的原因以及他们是否加入了难民队参加奥运会比赛。 “官方”或非官方理由同样可以接受,只要文件给出运动员在比赛结束后拒绝返回的理由、接受其请求的国家名称以及伊朗政府的反应。', 'zh'), 9: ExctractedCCNoReportQuery('24', 'Wind energy in Russia', 'I am looking for articles on the growth of wind energy in Russia.', 'Find articles on the development and growth of wind energy in Russia. Include information on joint ventures with foreign firms for the production of wind turbines and related equipment. Include information on the planned outputs of windmill parks. 
Information on wind energy in other countries not involving cooperation with Russian entities is not required. Neither is information on other renewable or non-renewable energy resources in Russia.', '俄罗斯的风能發电', '我要查找有关俄罗斯风能发展的文章。', '查找有关俄罗斯风能发展和增长的文章。 包括与外国公司合资生产风力涡轮机和相关设备的信息。 包括有关预计风电场发电量的信息。其他国家的风能信息,如果不涉及俄罗斯的机构合作,不需要。关于俄罗斯其他可再生或不可再生能源的信息也不需要。', '俄罗斯的风能', '我正在寻找有关俄罗斯风能发展的文章。', '查找有关俄罗斯风能发展和增长的文章。包括与外国公司合资生产风力涡轮机和相关设备的信息。包括有关风车公园计划产出的信息。不需要与俄罗斯实体合作的其他国家的风能信息。也没有关于俄罗斯其他可再生或不可再生能源的信息。', 'zh'), 48: ExctractedCCNoReportQuery('133', 'Whale stranding', "I'm looking for articles on whale stranding.", 'Find articles on whale stranding. Relevant articles should provide the number of whales stranded on the beach, death count if any, and the count of whales successfully rescued by helping them return to the water. Relevant articles should also provide possible cause of such phenomenon.', '鯨魚擱淺', '我在查找鯨魚擱淺的報導。', '查找鯨魚擱淺的報導。相關報導必須提供鯨魚擱淺的數字,死亡數字,以及成功助鯨魚回海的獲救數字。相關報導也須提供鯨魚擱淺的可能原因。', '鲸鱼搁浅', '我正在寻找关于鲸鱼搁浅的文章。', '查找有关鲸鱼搁浅的文章。相关文章应提供搁浅在海滩上的鲸鱼数量、死亡人数(如有)以及帮助鲸鱼返回水中成功获救的鲸鱼数量。相关文章还应提供此类现象的可能原因。', 'zh'), }) self._test_queries('neuclir/1/fa/trec-2022', count=46, items={ 0: ExctractedCCNoReportQuery('0', 'Iranian female athletes refugees', 'I am looking for stories about Iranian female athletes who seek asylum in other countries.', 'Find articles that identify female athletes who refused to return to Iran after competition and slam government by seeking asylum from other countries. Relevant documents must identify the athlete by name, and spell out the reason for seeking asylum and if they joined Refugee Team in Olympic competition. 
"Official" or unofficial reasons are equally acceptable, as long as the document gives some reason that the athlete refused to return after the competition,name of the countries that accepted their request, and Iran government reaction.', 'بانوان ورزشکار ایرانی پناهنده', 'دنبال مقالات مربوط به ورزشکاران زن ایرانی هستم که به کشورهای دیگر پناهنده شده اند.', 'مقالات مربوط به زنان ورزشکاری که از بازگشت به ایران پس از شرکت در مسابقات بین المللی خودداری کرده و با درخواست پناهندگی از کشورهای دیگربه دولت ایران ضربه زدند. اسناد مربوطه باید نام ورزشکار را مشخص کند و دلیل درخواست پناهندگی و پیوستن آنها به تیم پناهندگان در مسابقات المپیک را مشخص کند. دلایل "رسمی" یا غیررسمی به یک اندازه قابل قبول هستند، تا زمانی که در سند دلایلی وجود داشته باشد که ورزشکار پس از مسابقه از بازگشت خودداری کرده است، همچنین نام کشورهایی که درخواست آنها را پذیرفته اند و واکنش دولت ایران ذکر شده باشد.', 'ورزشکاران زن ایرانی پناهنده', 'من به دنبال داستان هایی در مورد زنان ورزشکار ایرانی هستم که به کشورهای دیگر پناهنده می شوند.', 'مقالاتی را بیابید که ورزشکاران زن را که پس از مسابقات و با درخواست پناهندگی از کشورهای دیگر از بازگشت به ایران امتناع کرده اند، بیابید. اسناد مربوطه باید نام ورزشکار را مشخص کند و دلیل درخواست پناهندگی و پیوستن آنها به تیم پناهندگان در مسابقات المپیک را مشخص کند. دلایل "رسمی" یا غیررسمی به یک اندازه قابل قبول است، تا زمانی که در سند دلیلی وجود داشته باشد که ورزشکار پس از مسابقه از بازگشت خودداری کرده است، نام کشورهایی که درخواست آنها را پذیرفته اند و واکنش دولت ایران ذکر شده باشد.', 'fa'), 9: ExctractedCCNoReportQuery('26', 'Ukrainian Presidential Candidate Zelenskiy', "I am looking for articles that reflect Russia's attitude towards Ukrainian Presidential Candidate Volodymyr Zelenskiy.", 'Find articles that reveal the Russian perspective of then presidential candidate Volodymyr Zelenskiy for the 2019 election. Include articles that highlight what Ukrainian officials say that Russian media circulates about Zelenskiy as that reveals what Russians want readers to be aware of. 
Russian perspective as to his ability to win, who might be supporting him, how seriously they should be taking him, and so forth. Did not include articles that discussed subsequent elections, local or parliamentary elections, or any actions Zelenskiy took once he became president. Did not include Zelenskiys recommendations on how elections in Donbass should be run. Articles purely about the polls and statistics were not included.', 'زلنسکی نامزد ریاست جمهوری اوکراین', 'دنبال مقالاتی هستم که نشان دهنده نگرش روسیه نسبت به نامزد ریاست جمهوری اوکراین ولودیمیر زلنسکی باشد.', 'مقاله\u200cهایی را بیابید که دیدگاه روسیه را نسبت به ولودیمیر زلنسکی، نامزد ریاست\u200cجمهوری اوکراین در زمان انتخابات ۲۰۱۹ نشان می\u200cدهد. مقالاتی که تاکید داشته باشد برنظر مقامات اوکراینی درباره مطالبی که رسانه های روسی منتشر می کنند درمورد زلنسکی و می خواهند که خوانندگان روسی از آن آگاه باشند. مقالاتی که دیدگاه روسیه در مورد توانایی و قابلیت زلنسکی برای پیروزی و اینکه چه کسی ممکن است از او حمایت کند، و روسیه چقدر باید او را جدی بگیرد، و غیره. مقالاتی که در مورد انتخابات بعدی، انتخابات محلی یا پارلمانی، یا هر اقدامی که زلنسکی پس از ریاست\u200cجمهوری انجام داد، لازم نیست. این مقالات توصیه های زلنسکی در مورد نحوه برگزاری انتخابات در «دونباس» را شامل نمی شود. ضمناً مقالاتی که صرفاً در مورد نظرسنجی ها و آمارباشد، گنجانده نشود.', 'زلنسکی کاندیدای ریاست جمهوری اوکراین', 'من به دنبال مقالاتی هستم که نشان دهنده نگرش روسیه نسبت به نامزد ریاست جمهوری اوکراین ولودیمیر زلنسکی باشد.', 'مقاله\u200cهایی را بیابید که دیدگاه روسیه ولودیمیر زلنسکی، نامزد ریاست\u200cجمهوری آن زمان را برای انتخابات ۲۰۱۹ نشان می\u200cدهد. شامل مقالاتی باشید که بر آنچه مقامات اوکراینی می\u200cگویند که رسانه\u200cهای روسی درباره زلنسکی می\u200cگویند تاکید می\u200cکند، زیرا نشان می\u200cدهد که روس\u200cها می\u200cخواهند خوانندگان از آن آگاه باشند. دیدگاه روسیه در مورد توانایی او برای پیروزی، چه کسی ممکن است از او حمایت کند، چقدر باید او را جدی بگیرند، و غیره. 
شامل مقالاتی که در مورد انتخابات بعدی، انتخابات محلی یا پارلمانی، یا هر اقدامی که زلنسکی پس از ریاست\u200cجمهوری انجام داد، بحث نمی\u200cکرد. توصیه های Zelenskiys در مورد نحوه برگزاری انتخابات در دونباس را شامل نمی شود. مقالات صرفاً در مورد نظرسنجی ها و آمار گنجانده نشده است.', 'fa'), 45: ExctractedCCNoReportQuery('133', 'Whale stranding', "I'm looking for articles on whale stranding.", 'Find articles on whale stranding. Relevant articles should provide the number of whales stranded on the beach, death count if any, and the count of whales successfully rescued by helping them return to the water. Relevant articles should also provide possible cause of such phenomenon.', ' به گِل نشستن نهنگ ها', 'به دنبال مقالاتی در مورد به گِل نشستن نهنگ ها هستم.', 'مقالاتی در مورد به گِل نشستن نهنگ ها بیابید. مقالات مربوطه باید تعداد نهنگ\u200cهایی را که در ساحل به گل نشسته\u200cاند، تعداد تلفات موجود، و تعداد نهنگ\u200cهایی که با کمک رسانی با موفقیت نجات یافته\u200c و به آب بازگشته اند را ارائه کند. همچنین مقالات مرتبط باید علت احتمالی چنین پدیده ای را ارائه دهند.', 'به گل نشستن نهنگ', 'من به دنبال مقالاتی در مورد به گل نشستن نهنگ هستم.', 'مقالاتی در مورد به گل نشستن نهنگ بیابید. مقالات مربوطه باید تعداد نهنگ\u200cهایی را که در ساحل به گل نشسته\u200cاند، تعداد تلفات در صورت وجود، و تعداد نهنگ\u200cهایی که با کمک به آنها برای بازگشت به آب با موفقیت نجات یافته\u200cاند، ارائه کند. مقالات مرتبط نیز باید علت احتمالی چنین پدیده ای را ارائه دهند.', 'fa'), }) self._test_queries('neuclir/1/ru/trec-2022', count=45, items={ 0: ExctractedCCNoReportQuery('0', 'Iranian female athletes refugees', 'I am looking for stories about Iranian female athletes who seek asylum in other countries.', 'Find articles that identify female athletes who refused to return to Iran after competition and slam government by seeking asylum from other countries. Relevant documents must identify the athlete by name, and spell out the reason for seeking asylum and if they joined Refugee Team in Olympic competition. 
"Official" or unofficial reasons are equally acceptable, as long as the document gives some reason that the athlete refused to return after the competition,name of the countries that accepted their request, and Iran government reaction.', 'Иранские женские спорт-беженцы', 'Я ищу статьи об иранских спортсменках, которые ищут убежища в других странах.', 'Найдите статьи, в которых упоминаются спортсменки, которые отказались вернуться в Иран после соревнований, и критикули правительство, ища убежища в других странах. В соответствующих документах должно быть указано имя спортсмена, а также указана причина обращения за убежищем, а также то, присоединилась ли она к команде беженцев на олимпийских соревнованиях. «Официальные» или неофициальные причины одинаково приемлемы, если в документе указана причина, по которой спортсменка отказалась вернуться после соревнований, название стран, которые приняли их запрос, и реакция правительства Ирана.', 'Иранские спортсменки-беженцы', 'Я ищу истории об иранских спортсменках, которые ищут убежища в других странах.', 'Найдите статьи, в которых упоминаются спортсменки, отказавшиеся вернуться в Иран после соревнований, и критикуйте правительство, ища убежища в других странах. В соответствующих документах должно быть указано имя спортсмена, а также указана причина обращения за убежищем и то, присоединился ли он к команде беженцев на олимпийских соревнованиях. «Официальные» или неофициальные причины одинаково приемлемы, если в документе указана причина, по которой спортсмен отказался вернуться после соревнований, название стран, принявших его запрос, и реакция правительства Ирана.', 'ru'), 9: ExctractedCCNoReportQuery('24', 'Wind energy in Russia', 'I am looking for articles on the growth of wind energy in Russia.', 'Find articles on the development and growth of wind energy in Russia. Include information on joint ventures with foreign firms for the production of wind turbines and related equipment. 
Include information on the planned outputs of windmill parks. Information on wind energy in other countries not involving cooperation with Russian entities is not required. Neither is information on other renewable or non-renewable energy resources in Russia.', 'Энергия ветра в России', 'Я ищу статьи о росте ветроэнергетики в России', 'Найдите статьи о развитии и росте ветроэнергетики в России. Включите информацию о совместных предприятиях с иностранными фирмами по производству ветряных турбин и сопутствующего оборудования. Включите информацию о планируемых объемах производства ветряных электростанций. Информация о ветроэнергетике в других странах, не предполагающая сотрудничества с российскими организациями, не требуется. Также нет информации о других возобновляемых или невозобновляемых источниках энергии в России.', 'Энергия ветра в России', 'Я ищу статьи о росте ветроэнергетики в России.', 'Найдите статьи о развитии и росте ветроэнергетики в России. Включите информацию о совместных предприятиях с иностранными фирмами по производству ветряных турбин и сопутствующего оборудования. Включите информацию о планируемых объемах производства ветряных электростанций. Информация о ветроэнергетике в других странах, не предполагающая сотрудничества с российскими организациями, не требуется. Также нет информации о других возобновляемых или невозобновляемых источниках энергии в России.', 'ru'), 44: ExctractedCCNoReportQuery('133', 'Whale stranding', "I'm looking for articles on whale stranding.", 'Find articles on whale stranding. Relevant articles should provide the number of whales stranded on the beach, death count if any, and the count of whales successfully rescued by helping them return to the water. Relevant articles should also provide possible cause of such phenomenon.', 'Выбрасывание китов на берег', 'Я ищу статьи о выбрасывании китов на берег', 'Найдите статьи о выбрасывании китов на берег. 
В соответствующих статьях должно быть указано количество китов, выброшенных на берег, количество смертей, если таковые имеются, и количество китов, успешно спасенных, помогая им вернуться в воду. В соответствующих статьях также должна быть указана возможная причина такого явления.', 'Кит на мель', 'Я ищу статьи о посадке китов на мель.', 'Найдите статьи о посадке китов на мель. В соответствующих статьях должно быть указано количество китов, выброшенных на берег, количество погибших, если таковые имеются, и количество китов, успешно спасенных путем оказания им помощи в возвращении в воду. Соответствующие статьи также должны указывать возможную причину такого явления.', 'ru'), }) self._test_queries('neuclir/1/zh/trec-2023', count=76, items={ 0: ExctractedCCNoReportNoHtNarQuery('200', 'Corruption Bribery Sports Federation Olympics', 'Are there cases of Institutional Corruption and Bribery in Sports?', 'We are Looking for articles that contain a case of financial corruption or bribery at institutional level. Cases of bribery in Olympics and sports federations are relevant. Cases of individuals’ bribery are excluded. Investigating an institution to find out if they are involved in corruption will not be relevant. Ethical problems or sexual abuse are not relevant.', '奧委會貪污賄賂', '體育界是否有機構性腐敗和賄賂等案件?', '腐败贿赂体育联合会奥运会', '体育中是否存在机构腐败和贿赂的案例?', '我们正在寻找在机构层面上包含金融腐败或贿赂案件的文章。奥运会和体育联合会中的贿赂案件是相关的。排除个人贿赂案件。调查一个机构以找出他们是否参与腐败是无关紧要的。道德问题或性虐待无关。', 'zh'), 9: ExctractedCCNoReportNoHtNarQuery('209', 'How Handcrafting Affects Health', 'How does handcrafting or working with one’s hand affect health?', "Find articles that cite illnesses and conditions that could be combated with handcrafts or working with one's hands. Articles about dealing with trauma, stress, weight loss, helping with longevity by dealing with age related illness, strengthening memory, and others by working with one's hands are relevant. 
Anecdotal examples of people picking up some sort of craft that helped them deal with health issues are also relevant. Pleasure of handicrafts without mention of health benefits are irrelevant. Articles about various shows and exhibits are irrelevant. How to promote one’s handicrafts is irrelevant.", '手工製作如何影響健康', '手製品或以手製作成品對健康影響為何?', '手工制作如何影响健康', '手工制作或手工处理如何影响健康?', '查找引用疾病和条件的文章,可以用手工艺或手工搭配手工。有关应对创伤,压力,体重减轻,通过处理年龄相关疾病,加强记忆和其他人通过手工工作来帮助长寿的文章是相关的。人们捡起一些帮助他们解决健康问题的工艺的轶事例子也很重要。手工艺品的愉悦而没\u200b\u200b有提及健康益处是无关紧要的。有关各种展览和展览的文章无关紧要。如何促进手工艺品是无关紧要的。', 'zh'), 75: ExctractedCCNoReportNoHtNarQuery('275', 'Olympic swimmer selection', 'How are individual swimmers selected by their country to participate in the Olympics?', 'Find articles about how individual swimmers are selected to represent their country at the Olympics. Information about when and where qualifying events are held is relevant. Articles that only contain information about how the country receives slots to send swimmers to the Olympics or how relay teams are selected to compete at the Olympics would not be considered relevant. Also discussion of how countries select swimmers to compete at other international meets besides the Olympics should not be considered relevant.', '奧運游泳選手遴選', '各國如何遴選奧運游泳選手?', '奥林匹克游泳运动员选择', '他们的国家如何选择参加奥运会的游泳者?', '查找有关如何选择单个游泳者在奥运会上代表自己的国家的文章。有关合格事件的何时何地的信息是相关的。仅包含有关该国如何接收老虎机将游泳者派往奥运会或如何选择接力队参加奥运会竞争的文章将不相关。还讨论各国如何选择游泳者参加其他国际聚会,除了奥运会之外,还不应将其视为相关。', 'zh'), }) self._test_queries('neuclir/1/fa/trec-2023', count=76, items={ 0: ExctractedCCNoReportNoHtNarQuery('200', 'Corruption Bribery Sports Federation Olympics', 'Are there cases of Institutional Corruption and Bribery in Sports?', 'We are Looking for articles that contain a case of financial corruption or bribery at institutional level. Cases of bribery in Olympics and sports federations are relevant. Cases of individuals’ bribery are excluded. Investigating an institution to find out if they are involved in corruption will not be relevant. 
Ethical problems or sexual abuse are not relevant.', 'فساد رشوه خواری المپیک فدراسیون ورزشی', 'آیا موارد فساد نهادی و رشوه در ورزش وجود دارد؟', 'المپیک فدراسیون ورزشی رشوه خواری فساد', 'آیا مواردی از فساد نهادی و رشوه خواری در ورزش وجود دارد؟', 'ما به دنبال مقالاتی هستیم که حاوی یک مورد فساد مالی یا رشوه خواری در سطح نهادی باشد. موارد رشوه خواری در المپیک و فدراسیون های ورزشی مرتبط است. موارد رشوه خوراکی افراد مستثنی است. تحقیق در مورد یک موسسه برای یافتن اینکه آیا آنها در فساد درگیر هستند ، مرتبط نخواهد بود. مشکلات اخلاقی یا سوء استفاده جنسی مرتبط نیست.', 'fa'), 9: ExctractedCCNoReportNoHtNarQuery('209', 'How Handcrafting Affects Health', 'How does handcrafting or working with one’s hand affect health?', "Find articles that cite illnesses and conditions that could be combated with handcrafts or working with one's hands. Articles about dealing with trauma, stress, weight loss, helping with longevity by dealing with age related illness, strengthening memory, and others by working with one's hands are relevant. Anecdotal examples of people picking up some sort of craft that helped them deal with health issues are also relevant. Pleasure of handicrafts without mention of health benefits are irrelevant. Articles about various shows and exhibits are irrelevant. How to promote one’s handicrafts is irrelevant.", 'چگونه کار دستی بر سلامتی تأثیرمی گذارد؟', 'کار دستی یا کار کردن با دست چه تاثیری بر سلامتی دارد؟', 'دستکاری در سلامت چگونه تأثیر می گذارد', 'دستکاری یا کار با دست شخص چگونه بر سلامت تأثیر می گذارد؟', 'مقالاتی را پیدا کنید که به بیماری ها و شرایطی که می تواند با دست سازها یا کار با دستان خود همراه باشد ، استناد کند. مقالاتی در مورد برخورد با تروما ، استرس ، کاهش وزن ، کمک به طول عمر با برخورد با بیماری مرتبط با سن ، تقویت حافظه و دیگران با کار با دست شخص مرتبط است. نمونه های حکایتی از افرادی که به نوعی کاردستی را انتخاب می کنند که به آنها در مقابله با مسائل بهداشتی کمک می کند نیز مرتبط است. لذت بردن از صنایع دستی بدون ذکر مزایای سلامتی بی ربط است. 
مقالاتی در مورد نمایش ها و نمایشگاه های مختلف بی ربط است. نحوه ترویج صنایع دستی شخص بی ربط است.', 'fa'), 75: ExctractedCCNoReportNoHtNarQuery('275', 'Olympic swimmer selection', 'How are individual swimmers selected by their country to participate in the Olympics?', 'Find articles about how individual swimmers are selected to represent their country at the Olympics. Information about when and where qualifying events are held is relevant. Articles that only contain information about how the country receives slots to send swimmers to the Olympics or how relay teams are selected to compete at the Olympics would not be considered relevant. Also discussion of how countries select swimmers to compete at other international meets besides the Olympics should not be considered relevant.', 'انتخاب شناگر المپیک', 'روند انتخاب شناگران توسط کشورشان برای شرکت در المپیک چيست؟', 'انتخاب شناگر المپیک', 'چگونه شناگران فردی توسط کشورشان برای شرکت در المپیک انتخاب می شوند؟', 'مقالاتی را در مورد چگونگی انتخاب شناگران فردی برای نمایندگی کشور خود در المپیک پیدا کنید. اطلاعات مربوط به زمان و کجا وقایع واجد شرایط بودن مرتبط است. مقالاتی که فقط حاوی اطلاعاتی در مورد چگونگی دریافت اسلات برای ارسال شناگران به المپیک یا نحوه انتخاب تیم های رله برای رقابت در المپیک نیستند ، مرتبط نیست. همچنین بحث در مورد چگونگی انتخاب کشورها شناگران برای رقابت در سایر جلسات بین المللی علاوه بر المپیک نباید مرتبط باشد.', 'fa'), }) self._test_queries('neuclir/1/ru/trec-2023', count=76, items={ 0: ExctractedCCNoReportNoHtNarQuery('200', 'Corruption Bribery Sports Federation Olympics', 'Are there cases of Institutional Corruption and Bribery in Sports?', 'We are Looking for articles that contain a case of financial corruption or bribery at institutional level. Cases of bribery in Olympics and sports federations are relevant. Cases of individuals’ bribery are excluded. Investigating an institution to find out if they are involved in corruption will not be relevant. 
Ethical problems or sexual abuse are not relevant.', 'Коррупция, взяточничество, спортивные организации, и Олимпиада', 'Бывают ли случаи институциональной коррупции и взяточничества в спорте?', 'Олимпийские игры Федерации спортивных игр коррупции Хруппов', 'Существуют ли случаи институциональной коррупции и взяточничества в спорте?', 'Мы ищем статьи, которые содержат случай финансовой коррупции или взяточничества на институциональном уровне. Случаи взяточничества на Олимпийских играх и спортивных федерациях актуальны. Случаи взяточничества отдельных лиц исключаются. Исследование учреждения, чтобы выяснить, не будут ли они участвовать в коррупции, не будет актуально. Этические проблемы или сексуальное насилие не актуальны.', 'ru'), 9: ExctractedCCNoReportNoHtNarQuery('209', 'How Handcrafting Affects Health', 'How does handcrafting or working with one’s hand affect health?', "Find articles that cite illnesses and conditions that could be combated with handcrafts or working with one's hands. Articles about dealing with trauma, stress, weight loss, helping with longevity by dealing with age related illness, strengthening memory, and others by working with one's hands are relevant. Anecdotal examples of people picking up some sort of craft that helped them deal with health issues are also relevant. Pleasure of handicrafts without mention of health benefits are irrelevant. Articles about various shows and exhibits are irrelevant. How to promote one’s handicrafts is irrelevant.", 'Как Рукоделие Влияет на Здоровье', 'Как рукоделие или работать своими руками влияет на здоровье?', 'Как ручной работы влияют на здоровье', 'Как ручная обработка или работа с рукой влияет на здоровье?', 'Найдите статьи, в которых приводятся болезни и условия, которые можно бороться с ручными формами или работать с руками. 
Статьи о борьбе с травмами, стрессом, потерей веса, помощи с долговечностью путем борьбы с возрастными заболеваниями, укреплением памяти и других, работая своими руками, актуальны. Анекдотические примеры того, как люди собирают какое -то ремесло, которое помогло им решать проблемы со здоровьем, также актуальны. Удовольствие от ручной работы без упоминания о пользе для здоровья не имеет значения. Статьи о различных шоу и экспонатах не имеют значения. Как продвигать ручной работы не имеет значения.', 'ru'), 75: ExctractedCCNoReportNoHtNarQuery('275', 'Olympic swimmer selection', 'How are individual swimmers selected by their country to participate in the Olympics?', 'Find articles about how individual swimmers are selected to represent their country at the Olympics. Information about when and where qualifying events are held is relevant. Articles that only contain information about how the country receives slots to send swimmers to the Olympics or how relay teams are selected to compete at the Olympics would not be considered relevant. Also discussion of how countries select swimmers to compete at other international meets besides the Olympics should not be considered relevant.', 'отбор пловцов на Олимпийские игры', 'как страны отбирают пловцов для участия в Олимпийских играх ', 'Олимпийский отбор пловца', 'Как отдельные пловцы отбираются их страной для участия в Олимпийских играх?', 'Найдите статьи о том, как отдельные пловцы выбираются, чтобы представлять свою страну на Олимпийских играх. Информация о том, когда и где проводятся квалификационные мероприятия, актуальна. Статьи, которые содержат только информацию о том, как страна получает слоты для отправки пловцов на Олимпийские игры или о том, как выбираются эстафетные команды для соревнования на Олимпийских играх, не считаются актуальными. 
Также обсуждение того, как страны выбирают пловцов, чтобы конкурировать на других международных встречах, помимо Олимпийских игр, не следует считать актуальными.', 'ru'), }) self._test_queries('neuclir/1/multi/trec-2023', count=76, items={ 0: ExctractedCCMultiMtQuery('200', 'Corruption Bribery Sports Federation Olympics', 'Are there cases of Institutional Corruption and Bribery in Sports?', 'We are Looking for articles that contain a case of financial corruption or bribery at institutional level. Cases of bribery in Olympics and sports federations are relevant. Cases of individuals’ bribery are excluded. Investigating an institution to find out if they are involved in corruption will not be relevant. Ethical problems or sexual abuse are not relevant.', 'المپیک فدراسیون ورزشی رشوه خواری فساد', 'آیا مواردی از فساد نهادی و رشوه خواری در ورزش وجود دارد؟', 'ما به دنبال مقالاتی هستیم که حاوی یک مورد فساد مالی یا رشوه خواری در سطح نهادی باشد. موارد رشوه خواری در المپیک و فدراسیون های ورزشی مرتبط است. موارد رشوه خوراکی افراد مستثنی است. تحقیق در مورد یک موسسه برای یافتن اینکه آیا آنها در فساد درگیر هستند ، مرتبط نخواهد بود. مشکلات اخلاقی یا سوء استفاده جنسی مرتبط نیست.', 'Олимпийские игры Федерации спортивных игр коррупции Хруппов', 'Существуют ли случаи институциональной коррупции и взяточничества в спорте?', 'Мы ищем статьи, которые содержат случай финансовой коррупции или взяточничества на институциональном уровне. Случаи взяточничества на Олимпийских играх и спортивных федерациях актуальны. Случаи взяточничества отдельных лиц исключаются. Исследование учреждения, чтобы выяснить, не будут ли они участвовать в коррупции, не будет актуально. 
Этические проблемы или сексуальное насилие не актуальны.', '腐败贿赂体育联合会奥运会', '体育中是否存在机构腐败和贿赂的案例?', '我们正在寻找在机构层面上包含金融腐败或贿赂案件的文章。奥运会和体育联合会中的贿赂案件是相关的。排除个人贿赂案件。调查一个机构以找出他们是否参与腐败是无关紧要的。道德问题或性虐待无关。'), 9: ExctractedCCMultiMtQuery('209', 'How Handcrafting Affects Health', 'How does handcrafting or working with one’s hand affect health?', "Find articles that cite illnesses and conditions that could be combated with handcrafts or working with one's hands. Articles about dealing with trauma, stress, weight loss, helping with longevity by dealing with age related illness, strengthening memory, and others by working with one's hands are relevant. Anecdotal examples of people picking up some sort of craft that helped them deal with health issues are also relevant. Pleasure of handicrafts without mention of health benefits are irrelevant. Articles about various shows and exhibits are irrelevant. How to promote one’s handicrafts is irrelevant.", 'دستکاری در سلامت چگونه تأثیر می گذارد', 'دستکاری یا کار با دست شخص چگونه بر سلامت تأثیر می گذارد؟', 'مقالاتی را پیدا کنید که به بیماری ها و شرایطی که می تواند با دست سازها یا کار با دستان خود همراه باشد ، استناد کند. مقالاتی در مورد برخورد با تروما ، استرس ، کاهش وزن ، کمک به طول عمر با برخورد با بیماری مرتبط با سن ، تقویت حافظه و دیگران با کار با دست شخص مرتبط است. نمونه های حکایتی از افرادی که به نوعی کاردستی را انتخاب می کنند که به آنها در مقابله با مسائل بهداشتی کمک می کند نیز مرتبط است. لذت بردن از صنایع دستی بدون ذکر مزایای سلامتی بی ربط است. مقالاتی در مورد نمایش ها و نمایشگاه های مختلف بی ربط است. نحوه ترویج صنایع دستی شخص بی ربط است.', 'Как ручной работы влияют на здоровье', 'Как ручная обработка или работа с рукой влияет на здоровье?', 'Найдите статьи, в которых приводятся болезни и условия, которые можно бороться с ручными формами или работать с руками. 
Статьи о борьбе с травмами, стрессом, потерей веса, помощи с долговечностью путем борьбы с возрастными заболеваниями, укреплением памяти и других, работая своими руками, актуальны. Анекдотические примеры того, как люди собирают какое -то ремесло, которое помогло им решать проблемы со здоровьем, также актуальны. Удовольствие от ручной работы без упоминания о пользе для здоровья не имеет значения. Статьи о различных шоу и экспонатах не имеют значения. Как продвигать ручной работы не имеет значения.', '手工制作如何影响健康', '手工制作或手工处理如何影响健康?', '查找引用疾病和条件的文章,可以用手工艺或手工搭配手工。有关应对创伤,压力,体重减轻,通过处理年龄相关疾病,加强记忆和其他人通过手工工作来帮助长寿的文章是相关的。人们捡起一些帮助他们解决健康问题的工艺的轶事例子也很重要。手工艺品的愉悦而没\u200b\u200b有提及健康益处是无关紧要的。有关各种展览和展览的文章无关紧要。如何促进手工艺品是无关紧要的。'), 75: ExctractedCCMultiMtQuery('275', 'Olympic swimmer selection', 'How are individual swimmers selected by their country to participate in the Olympics?', 'Find articles about how individual swimmers are selected to represent their country at the Olympics. Information about when and where qualifying events are held is relevant. Articles that only contain information about how the country receives slots to send swimmers to the Olympics or how relay teams are selected to compete at the Olympics would not be considered relevant. Also discussion of how countries select swimmers to compete at other international meets besides the Olympics should not be considered relevant.', 'انتخاب شناگر المپیک', 'چگونه شناگران فردی توسط کشورشان برای شرکت در المپیک انتخاب می شوند؟', 'مقالاتی را در مورد چگونگی انتخاب شناگران فردی برای نمایندگی کشور خود در المپیک پیدا کنید. اطلاعات مربوط به زمان و کجا وقایع واجد شرایط بودن مرتبط است. مقالاتی که فقط حاوی اطلاعاتی در مورد چگونگی دریافت اسلات برای ارسال شناگران به المپیک یا نحوه انتخاب تیم های رله برای رقابت در المپیک نیستند ، مرتبط نیست. 
همچنین بحث در مورد چگونگی انتخاب کشورها شناگران برای رقابت در سایر جلسات بین المللی علاوه بر المپیک نباید مرتبط باشد.', 'Олимпийский отбор пловца', 'Как отдельные пловцы отбираются их страной для участия в Олимпийских играх?', 'Найдите статьи о том, как отдельные пловцы выбираются, чтобы представлять свою страну на Олимпийских играх. Информация о том, когда и где проводятся квалификационные мероприятия, актуальна. Статьи, которые содержат только информацию о том, как страна получает слоты для отправки пловцов на Олимпийские игры или о том, как выбираются эстафетные команды для соревнования на Олимпийских играх, не считаются актуальными. Также обсуждение того, как страны выбирают пловцов, чтобы конкурировать на других международных встречах, помимо Олимпийских игр, не следует считать актуальными.', '奥林匹克游泳运动员选择', '他们的国家如何选择参加奥运会的游泳者?', '查找有关如何选择单个游泳者在奥运会上代表自己的国家的文章。有关合格事件的何时何地的信息是相关的。仅包含有关该国如何接收老虎机将游泳者派往奥运会或如何选择接力队参加奥运会竞争的文章将不相关。还讨论各国如何选择游泳者参加其他国际聚会,除了奥运会之外,还不应将其视为相关。'), }) def test_qrels(self): self._test_qrels('neuclir/1/zh/hc4-filtered', count=3217, items={ 0: TrecQrel('1', '07fdce5f-e00b-4a40-b9eb-f0f78cfaefc4', 0, '0'), 9: TrecQrel('1', '6cf22079-e41d-45d9-b3d1-c3717fd439e7', 0, '0'), 3216: TrecQrel('228', 'fb670b0a-1449-42c7-aed1-6e68ea2c1453', 1, '0'), }) self._test_qrels('neuclir/1/fa/hc4-filtered', count=3087, items={ 0: TrecQrel('1', '079eba9b-fda2-4c26-ac18-2c35a7a08e88', 0, '0'), 9: TrecQrel('1', '3bc0b037-dda0-4eb4-8941-d88990be1cc2', 0, '0'), 3086: TrecQrel('208', 'fcbd86e4-e28c-449c-88a8-77f40264c2a1', 0, '0'), }) self._test_qrels('neuclir/1/ru/hc4-filtered', count=3235, items={ 0: TrecQrel('3', '07d5796e-e55f-469f-b158-88a8aa53131d', 0, '0'), 9: TrecQrel('3', '31ef93ab-b558-4646-bf00-9fb51833b937', 0, '0'), 3234: TrecQrel('256', 'daa39a1a-f98b-4939-a0ea-d3a93f0ff20d', 0, '0'), }) self._test_qrels('neuclir/1/zh/trec-2022', count=36575, items={ 0: TrecQrel('5', 'c15b7bf7-2300-4253-816e-0c0c6daf86c3', 0, 'zho'), 9: TrecQrel('5', 
# Remaining neuCLIR qrels fixtures: per-language TREC 2022/2023 splits, then the combined 'multi' split
# (whose first/last judged docs come from the fas and zho pools respectively, per the entries below).
'f5931ab8-3695-4a23-9981-164b9faee131', 0, 'zho'), 36574: TrecQrel('58', 'b57cb57f-f107-4c77-b3ce-2c0e1e9c24f0', 0, 'zho'), }) self._test_qrels('neuclir/1/fa/trec-2022', count=34174, items={ 0: TrecQrel('5', '2fa789e9-9d04-443e-a2de-45f160c9daa9', 3, 'fas'), 9: TrecQrel('5', 'e7e959ec-5685-4ee7-8ecb-ce4feda3bc0e', 0, 'fas'), 34173: TrecQrel('114', '8b7e706c-8cf5-48ba-8830-4e14f781ad3b', 0, 'fas'), }) self._test_qrels('neuclir/1/ru/trec-2022', count=33006, items={ 0: TrecQrel('5', '3887b53d-dfef-463f-9543-46e295a5321a', 3, 'rus'), 9: TrecQrel('5', '3dd3d61d-b1e8-4daf-a62f-e54cf6a985bf', 3, 'rus'), 33005: TrecQrel('114', '7254202b-6a7f-477a-b4b9-dae50cf3629e', 0, 'rus'), }) self._test_qrels('neuclir/1/zh/trec-2023', count=22061, items={ 0: TrecQrel('200', '033a72f3-b44e-4159-bf28-71273d635d80', 0, 'zho'), 9: TrecQrel('200', '08d7940c-ce11-449f-83d3-4fff72087ec3', 0, 'zho'), 22060: TrecQrel('275', 'ff76fb03-2547-4f79-a727-5fd41cfe6345', 0, 'zho'), }) self._test_qrels('neuclir/1/fa/trec-2023', count=20450, items={ 0: TrecQrel('200', '00258365-6d48-49cc-901f-578b883c8226', 0, 'fas'), 9: TrecQrel('200', '041eb44c-5e2d-41dd-83d6-1458df12e543', 0, 'fas'), 20449: TrecQrel('274', 'ff33f614-0e2a-4060-b3d5-a006b1866db8', 0, 'fas'), }) self._test_qrels('neuclir/1/ru/trec-2023', count=20062, items={ 0: TrecQrel('200', '0123b71d-7dc9-4797-ace1-484d7ac23e5c', 0, 'rus'), 9: TrecQrel('200', '0640f622-4ac0-4f48-890e-5d66d3495136', 1, 'rus'), 20061: TrecQrel('275', 'ffea67f2-2e85-46d0-9c73-2074de381063', 0, 'rus'), }) self._test_qrels('neuclir/1/multi/trec-2023', count=62573, items={ 0: TrecQrel('200', '00258365-6d48-49cc-901f-578b883c8226', 0, 'fas'), 9: TrecQrel('200', '041eb44c-5e2d-41dd-83d6-1458df12e543', 0, 'fas'), 62572: TrecQrel('275', 'ff76fb03-2547-4f79-a727-5fd41cfe6345', 0, 'zho'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/neumarco.py ================================================ import re
# Integration tests for the neuMARCO datasets (neumarco/fa, neumarco/ru, neumarco/zh) — MS MARCO
# passages machine-translated into Farsi, Russian, and Chinese; queries/qrels below remain English-derived.
import unittest from ir_datasets.formats import TrecQrel, GenericQuery, GenericDoc, GenericDocPair from .base import DatasetIntegrationTest class TestNeuMarco(DatasetIntegrationTest): def test_docs(self): self._test_docs('neumarco/fa', count=8841823, items={ 0: GenericDoc('0', re.compile('^ حضور ارتباطات در میان ذهن \u200c های علمی برای موفقیت پروژه منهتن به همان اندازه مهم بود که هوش علمی بود.{57}می معلق است ، چیزی است که موفقیت آن \u200c ها به معنای واقعی آن است ؛ صدها هزار زندگی بی \u200c گناه نابود شد\\.$', flags=48)), 9: GenericDoc('9', ' یکی از دلایل اصلی انتخاب هنفورد به عنوان یک مکان برای پروژه منهتن رآکتور B نزدیکی به رودخانه کلمبیا بود ، بزرگترین رودخانه \u200c ای که از ساحل آمریکای شمالی به اقیانوس آرام می \u200c ریزد.'), 8841822: GenericDoc('8841822', re.compile('^ تصویر کامل اندازه را مشاهده کنید\\. پشت صحنه نور خیره کننده نشان می دهد که تماشاگران اووه و آه در چها.{272}ده نمک \u200c های فلزی و اکسید فلزی وجود دارد که برای تولید مجموعه \u200c ای از رنگ \u200c ها واکنش نشان می \u200c دهند\\.$', flags=48)), }) self._test_docs('neumarco/ru', count=8841823, items={ 0: GenericDoc('0', re.compile('^ Присутствие общения среди научных умов было не менее важным для успеха Манхэттенского проекта, как .{98}лей и инженеров, это то, что их успех действительно означал; сотни тысяч невинных жизней уничтожены\\.$', flags=48)), 9: GenericDoc('9', re.compile('^ Одной из главных причин, по которой Хэнфорд был выбран в качестве объекта для « B Reactor » Манхэтт.{31}сть к реке Колумбия, самой большой реке, протекающей в Тихий океан с северо американского побережья\\.$', flags=48)), 8841822: GenericDoc('8841822', re.compile('^ Просмотр полноразмерного изображения\\. 
За кулисами ослепительного света видно, что зрители ooh и ahh.{313}ми, в основном солями металлов и оксидами металлов, которые реагируют на получение множества цветов\\.$', flags=48)), }) self._test_docs('neumarco/zh', count=8841823, items={ 0: GenericDoc('0', ' 在科学头脑中的交流对曼哈顿项目的成功同样重要,因为科学智慧是科学智慧。 原子研究人员和工程师令人印象深刻的成就中唯一的云就是他们的成功真正意味着什么;数十万无辜的生命被消灭了。'), 9: GenericDoc('9', ' 汉福德被选定为曼哈顿项目B反应堆的一个主要原因是它靠近哥伦比亚河,这是北美海岸流入太平洋的最大河流。'), 8841822: GenericDoc('8841822', ' 查看全尺寸图像。 在耀眼的灯光的背后,7月4日的观众们都是精心制作的烟花。 不管是红色、白色和蓝色的喷泉还是紫色的火花,每个烟花都充满了正确的化学物质组合,以创造这些五颜六色的灯光。 在每一个手工烟花中,都有少量的特殊化学物质,主要是金属盐和金属氧化物,它们会反应产生一系列的颜色。'), }) def test_queries(self): for lang in ['fa', 'ru', 'zh']: self._test_queries(f'neumarco/{lang}/train', count=808731, items={ 0: GenericQuery('121352', 'define extreme'), 9: GenericQuery('492875', 'sanitizer temperature'), 808730: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.') }) self._test_queries(f'neumarco/{lang}/train/judged', count=502939, items={ 0: GenericQuery('121352', 'define extreme'), 9: GenericQuery('54528', 'blood clots in urine after menopause'), 502938: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.') }) self._test_queries(f'neumarco/{lang}/dev', count=101093, items={ 0: GenericQuery('1048578', 'cost of endless pools/swim spa'), 9: GenericQuery('1048587', 'what is patron'), 101092: GenericQuery('524285', 'treadmill incline meaning') }) self._test_queries(f'neumarco/{lang}/dev/small', count=6980, items={ 0: GenericQuery('1048585', "what is paula deen's brother"), 9: GenericQuery('524699', 'tricare service number'), 6979: GenericQuery('1048565', 'who plays sebastian michaelis'), }) self._test_queries(f'neumarco/{lang}/dev/judged', count=55578, items={ 0: GenericQuery('1048578', 'cost of endless pools/swim spa'), 9: GenericQuery('1048601', 'what is pastoral medicine'), 55577: GenericQuery('1048570', 'what is pearls before swine?') }) def test_qrels(self): for lang in ['fa', 'ru', 'zh']: 
# The expected qrels (and, in test_docpairs below, the training triples) are asserted with identical
# values for every language variant — the loop over ['fa', 'ru', 'zh'] reuses the same fixtures,
# since only the documents are translated while judgments/pairs carry over from English MS MARCO.
self._test_qrels(f'neumarco/{lang}/train', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0') }) self._test_qrels(f'neumarco/{lang}/train/judged', count=532761, items={ 0: TrecQrel('1185869', '0', 1, '0'), 9: TrecQrel('186154', '1160', 1, '0'), 532760: TrecQrel('405466', '8841735', 1, '0') }) self._test_qrels(f'neumarco/{lang}/dev', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0') }) self._test_qrels(f'neumarco/{lang}/dev/small', count=7437, items={ 0: TrecQrel('300674', '7067032', 1, '0'), 9: TrecQrel('54544', '7068203', 1, '0'), 7436: TrecQrel('195199', '8009377', 1, '0'), }) self._test_qrels(f'neumarco/{lang}/dev/judged', count=59273, items={ 0: TrecQrel('1102432', '2026790', 1, '0'), 9: TrecQrel('300674', '7067032', 1, '0'), 59272: TrecQrel('371455', '8009476', 1, '0') }) def test_docpairs(self): for lang in ['fa', 'ru', 'zh']: self._test_docpairs(f'neumarco/{lang}/train', count=269919004, items={ 0: GenericDocPair('662731', '193249', '2975302'), 9: GenericDocPair('411362', '31018', '4238671'), 269919003: GenericDocPair('88228', '5117891', '7075853') }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/nfcorpus.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.nfcorpus import NfCorpusDoc, NfCorpusQuery, NfCorpusVideoQuery from ir_datasets.formats import TrecQrel, GenericQuery from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestNf(DatasetIntegrationTest): def test_nf_docs(self): self._test_docs('nfcorpus', count=5371, items={ 0: NfCorpusDoc('MED-1', 'http://www.ncbi.nlm.nih.gov/pubmed/23092936', 'Birth Weight, Head Circumference, and Prenatal Exposure to Acrylamide from Maternal Diet: The European 
Prospective Mother–Child Study (NewGeneris)', re.compile('^Abstract Background: Acrylamide is a common dietary exposure that crosses the human placenta\\. It is .{1582}ed, these findings suggest that dietary intake of acrylamide should be reduced among pregnant women\\.$', flags=48)), 9: NfCorpusDoc('MED-10', 'http://www.ncbi.nlm.nih.gov/pubmed/25329299', 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland', re.compile('^Abstract Recent studies have suggested that statins, an established drug group in the prevention of .{1533}evaluated further in a clinical trial testing statins’ effect on survival in breast cancer patients\\.$', flags=48)), 5370: NfCorpusDoc('MED-5371', 'http://www.ncbi.nlm.nih.gov/pubmed/21931319', 'Omega-3 Fatty Acids for the Treatment of Depression: Systematic Review and Meta-Analysis', re.compile('^Abstract We conducted a meta\\-analysis of randomized, placebo\\-controlled trials of omega\\-3 fatty acid.{1604}the treatment efficacy observed in the published literature may be attributable to publication bias\\.$', flags=48)), }) def test_nf_queries(self): self._test_queries('nfcorpus/train', count=2594, items={ 0: NfCorpusQuery('PLAIN-10', 'how contaminated are our children ?', re.compile("^how contaminated are our children \\? 
in a study .*health , tuna , turkey , vegans , vegetarians , women 's health - -$", flags=48)), 9: NfCorpusQuery('PLAIN-1010', 'deli meat', "deli meat - - processed meat , meat , beef , eggs , pork , plant-based diets , chicken , diabetes , turkey , poultry , animal products , women 's health , processed foods , nitrosamines , vegans - -"), 2593: NfCorpusQuery('PLAIN-999', 'dandelion', 'dandelion - - phytonutrients , peppermint , lemongrass , lemon verbena , red tea , rooibos , thyme , tea , rosemary , rose hips , lavender , korea , chamomile tea , beverages , bergamots - -'), }) self._test_queries('nfcorpus/train/nontopic', count=1141, items={ 0: GenericQuery('PLAIN-10', 'how contaminated are our children ?'), 9: GenericQuery('PLAIN-110', 'how to get our kids to eat their vegetables'), 1140: GenericQuery('PLAIN-99', 'quadrupling breast cancer survival'), }) self._test_queries('nfcorpus/train/video', count=812, items={ 0: NfCorpusVideoQuery('PLAIN-2427', 'heart of gold : turmeric vs. exercise', 'diet and exercise synergize to improve endothelial function , the ability of our arteries to relax normally .'), 9: NfCorpusVideoQuery('PLAIN-2438', 'what causes diabetes ?', 'saturated fat can be toxic to the insulin-producing beta cells in the pancreas , explaining why animal fat consumption can impair insulin secretion , not just insulin sensitivity .'), 811: NfCorpusVideoQuery('PLAIN-3474', 'fish consumption and suicide', 'the mercury content in fish may help explain links found between fish intake and mental disorders , depression , and suicide .'), }) self._test_queries('nfcorpus/dev', count=325, items={ 0: NfCorpusQuery('PLAIN-1', 'why deep fried foods may cause cancer', re.compile("^why deep fried foods may cause cancer in the .*, throat cancer , turkey , vitamin c , women 's health - -$", flags=48)), 9: NfCorpusQuery('PLAIN-1087', 'easter island', 'easter island - - mortality , muscle strength , morbidity , mood , mitochondria , oxidative stress , rapamycin , wound 
healing , tor , sexual health , reproductive health , longevity , lifespan , caloric restriction , calories - -'), 324: NfCorpusQuery('PLAIN-996', 'cytoskeleton', "cytoskeleton - - natural toxins , nutrition myths , monounsaturated fats , metastases , guacamole , insecticides , nuts , oral cancer , taxol , women 's health , phytosterols , persin , paclitaxel , fungicides , fda - -"), }) self._test_queries('nfcorpus/dev/nontopic', count=144, items={ 0: GenericQuery('PLAIN-1', 'why deep fried foods may cause cancer'), 9: GenericQuery('PLAIN-174', 'cinnamon for diabetes'), 143: GenericQuery('PLAIN-90', 'how to boost the benefits of exercise'), }) self._test_queries('nfcorpus/dev/video', count=102, items={ 0: NfCorpusVideoQuery('PLAIN-2429', 'diverticulosis : when our most common gut disorder hardly existed', 'more than two-thirds of americans over age 60 have diverticulosis , but it was nearly unknown a century ago and remained extremely rare among populations eating whole food plant-based diets .'), 9: NfCorpusVideoQuery('PLAIN-2519', 'how much added sugar is too much ?', 'are table sugar and high fructose corn syrup just empty calories or can they be actively harmful ?'), 101: NfCorpusVideoQuery('PLAIN-3471', 'uprooting the leading causes of death', 'death in america is largely a foodborne illness . focusing on studies published just over the last year in peer-reviewed scientific medical journals , dr. 
greger offers practical advice on how best to feed ourselves and our families to prevent , treat , and even reverse many of the top 15 killers in the united states .'), }) self._test_queries('nfcorpus/test', count=325, items={ 0: NfCorpusQuery('PLAIN-1008', 'deafness', 'deafness - - industrial toxins , infants , lead , medications , india , in vitro studies , haritaki fruit , heavy metals , herbs , mercury , mortality , phytonutrients , side effects , supplements , triphala - -'), 9: NfCorpusQuery('PLAIN-1098', 'eggnog', 'eggnog - - nutmeg , safety limits , spices , mood , miscarriage , cost savings , amphetamines - -'), 324: NfCorpusQuery('PLAIN-997', 'czechoslovakia', "czechoslovakia - - body odor , men 's health , spain , singapore , omnivores , physical attraction , vegetarians , sexual health , plant-based diets , persistent organic pollutants , new york city , flame-retardant chemicals , fibroids , china , california - -"), }) self._test_queries('nfcorpus/test/nontopic', count=144, items={ 0: GenericQuery('PLAIN-102', 'stopping heart disease in childhood'), 9: GenericQuery('PLAIN-186', 'best treatment for constipation'), 143: GenericQuery('PLAIN-91', 'chronic headaches and pork parasites'), }) self._test_queries('nfcorpus/test/video', count=102, items={ 0: NfCorpusVideoQuery('PLAIN-2430', 'preventing brain loss with b vitamins ?', 'one week on a plant-based diet can significantly drop blood levels of homocysteine , a toxin associated with cognitive decline and alzheimer ’ s disease . without vitamin b12 supplementation , though , a long-term plant-based diet could make things worse .'), 9: NfCorpusVideoQuery('PLAIN-2520', 'caloric restriction vs. 
plant-based diets', 'what is the best strategy to lower the level of the cancer-promoting growth hormone igf-1 ?'), 101: NfCorpusVideoQuery('PLAIN-3472', 'how doctors responded to being named a leading killer', 'what was the medical community ’ s reaction to being named the third leading cause of death in the united states ?'), }) def test_gov2_qrels(self): self._test_qrels('nfcorpus/train', count=139350, items={ 0: TrecQrel('PLAIN-3', 'MED-2436', 3, '0'), 9: TrecQrel('PLAIN-3', 'MED-2431', 2, '0'), 139349: TrecQrel('PLAIN-3474', 'MED-4634', 2, '0'), }) self._test_qrels('nfcorpus/train/nontopic', count=37383, items={ 0: TrecQrel('PLAIN-3', 'MED-2436', 3, '0'), 9: TrecQrel('PLAIN-3', 'MED-2431', 2, '0'), 37382: TrecQrel('PLAIN-3474', 'MED-4634', 2, '0'), }) self._test_qrels('nfcorpus/train/video', count=27465, items={ 0: TrecQrel('PLAIN-2427', 'MED-1507', 3, '0'), 9: TrecQrel('PLAIN-2427', 'MED-1637', 2, '0'), 27464: TrecQrel('PLAIN-3474', 'MED-4634', 2, '0'), }) self._test_qrels('nfcorpus/dev', count=14589, items={ 0: TrecQrel('PLAIN-1', 'MED-2421', 3, '0'), 9: TrecQrel('PLAIN-1', 'MED-4070', 2, '0'), 14588: TrecQrel('PLAIN-3471', 'MED-5342', 3, '0'), }) self._test_qrels('nfcorpus/dev/nontopic', count=4353, items={ 0: TrecQrel('PLAIN-1', 'MED-2421', 3, '0'), 9: TrecQrel('PLAIN-1', 'MED-4070', 2, '0'), 4352: TrecQrel('PLAIN-3471', 'MED-5342', 3, '0'), }) self._test_qrels('nfcorpus/dev/video', count=3068, items={ 0: TrecQrel('PLAIN-2429', 'MED-974', 3, '0'), 9: TrecQrel('PLAIN-2439', 'MED-5325', 2, '0'), 3067: TrecQrel('PLAIN-3471', 'MED-5342', 3, '0'), }) self._test_qrels('nfcorpus/test', count=15820, items={ 0: TrecQrel('PLAIN-2', 'MED-2427', 3, '0'), 9: TrecQrel('PLAIN-2', 'MED-5324', 1, '0'), 15819: TrecQrel('PLAIN-3472', 'MED-3627', 2, '0'), }) self._test_qrels('nfcorpus/test/nontopic', count=4540, items={ 0: TrecQrel('PLAIN-2', 'MED-2427', 3, '0'), 9: TrecQrel('PLAIN-2', 'MED-5324', 1, '0'), 4539: TrecQrel('PLAIN-3472', 'MED-3627', 2, '0'), }) 
# Final nfcorpus qrels split (test/video). NOTE(review): the enclosing method is named
# test_gov2_qrels although it asserts nfcorpus qrels — presumably a copy-paste of the method
# name from another test file; the assertions themselves target nfcorpus. Below this line
# begins the NYT integration test file (docs, weak-supervision queries/qrels, TREC Core 2017).
self._test_qrels('nfcorpus/test/video', count=3108, items={ 0: TrecQrel('PLAIN-2430', 'MED-980', 3, '0'), 9: TrecQrel('PLAIN-2430', 'MED-3137', 3, '0'), 3107: TrecQrel('PLAIN-3472', 'MED-3627', 2, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/nyt.py ================================================ import re import unittest from ir_datasets.datasets.nyt import NytDoc from ir_datasets.formats import GenericQrel, GenericQuery, TrecQuery, TrecQrel from .base import DatasetIntegrationTest class TestNyt(DatasetIntegrationTest): def test_nyt_docs(self): self._test_docs('nyt', count=1864661, items={ 0: NytDoc('8454', 'MARSH & MCLENNAN INC reports earnings for Qtr to Dec 31', re.compile('^LEAD:\n\*3\*\*\* COMPANY REPORTS \*\*\n\*3\*MARSH \& MCLENNAN INC \(NYSE\)\nQtr to Dec 31\n1986\n1985\nRevenue\n444,70.{307}rns\n\.67\n\.50\nYr rev\n1,804,100,000\n1,367,600 000\nNet inc\n243,200,000\n162,900 000\nShare earns\n3\.30\n2\.23$', flags=48), re.compile(b'^<\?xml version="1\.0" encoding="UTF\-8"\?>\n\n

3\\.30

\n

2\\.23

\n \n \n \n\n$', flags=16)), 9: NytDoc('8579', 'SALINGER BIOGRAPHY IS BLOCKED', re.compile('^LEAD: A biography of J\\. D\\. Salinger was blocked yesterday by a Federal appeals court in Manhattan th.{6380}ers in his lifetime, the appeals court said he was entitled to protect his opportunity to sell them\\.$', flags=48), re.compile(b'^<\\?xml version="1\\.0" encoding="UTF\\-8"\\?>\n\n \n \n \n\n$', flags=16)), 1864660: NytDoc('1854817', 'STRATEGY ON IRAN STIRS NEW DEBATE AT WHITE HOUSE', re.compile("^A year after President Bush and Secretary of State Condoleezza Rice announced a new strategy toward .{8110}, who left his post partly over his opposition to the administration's recent deal with North Korea\\.$", flags=48), re.compile(b'^<\\?xml version="1\\.0" encoding="UTF\\-8"\\?>\n\n \n \n \n\n$', flags=16)), }) def test_nyt_queries(self): self._test_queries('nyt/wksup', count=1864661, items={ 0: GenericQuery('8454', 'MARSH & MCLENNAN INC reports earnings for Qtr to Dec 31'), 9: GenericQuery('8579', 'SALINGER BIOGRAPHY IS BLOCKED'), 1864660: GenericQuery('1854817', 'STRATEGY ON IRAN STIRS NEW DEBATE AT WHITE HOUSE'), }) self._test_queries('nyt/wksup/train', count=1863657, items={ 0: GenericQuery('8454', 'MARSH & MCLENNAN INC reports earnings for Qtr to Dec 31'), 9: GenericQuery('8579', 'SALINGER BIOGRAPHY IS BLOCKED'), 1863656: GenericQuery('1854817', 'STRATEGY ON IRAN STIRS NEW DEBATE AT WHITE HOUSE'), }) self._test_queries('nyt/wksup/valid', count=1004, items={ 0: GenericQuery('6461', "Why We're Forced To Be Slumlords"), 9: GenericQuery('13148', 'NOVAMETRIX MEDICAL SYSTEMS INC reports earnings for Qtr to Dec 31'), 1003: GenericQuery('1854529', 'The Newest Antique: Atari'), }) self._test_queries('nyt/trec-core-2017', count=50, items={ 0: TrecQuery('307', 'New Hydroelectric Projects', 'Identify hydroelectric projects proposed or under construction by country and location. 
Detailed description of nature, extent, purpose, problems, and consequences is desirable.', 'Relevant documents would contain as a minimum a clear statement that a hydroelectric project is planned or construction is under way and the location of the project. Renovation of existing facilities would be judged not relevant unless plans call for a significant increase in acre-feet or reservoir or a marked change in the environmental impact of the project. Arguments for and against proposed projects are relevant as long as they are supported by specifics, including as a minimum the name or location of the project. A statement that an individual or organization is for or against such projects in general would not be relevant. Proposals or projects underway to dismantle existing facilities or drain existing reservoirs are not relevant, nor are articles reporting a decision to drop a proposed plan.'), 9: TrecQuery('347', 'Wildlife Extinction', 'The spotted owl episode in America highlighted U.S. efforts to prevent the extinction of wildlife species. What is not well known is the effort of other countries to prevent the demise of species native to their countries. What other countries have begun efforts to prevent such declines?', 'A relevant item will specify the country, the involved species, and steps taken to save the species.'), 49: TrecQuery('690', 'college education advantage', 'Find documents which describe an advantage in hiring potential or increased income for graduates of U.S. colleges.', 'Relevant documents cite some advantage of a college education for job opportunities. 
Documents citing better opportunities for non-college vocational-training is not relevant.'), }) def test_nyt_qrels(self): self._test_qrels('nyt/wksup', count=1864661, items={ 0: GenericQrel('8454', '8454', 1), 9: GenericQrel('8579', '8579', 1), 1864660: GenericQrel('1854817', '1854817', 1), }) self._test_qrels('nyt/wksup/train', count=1863657, items={ 0: GenericQrel('8454', '8454', 1), 9: GenericQrel('8579', '8579', 1), 1863656: GenericQrel('1854817', '1854817', 1), }) self._test_qrels('nyt/wksup/valid', count=1004, items={ 0: GenericQrel('6461', '6461', 1), 9: GenericQrel('13148', '13148', 1), 1003: GenericQrel('1854529', '1854529', 1), }) self._test_qrels('nyt/trec-core-2017', count=30030, items={ 0: TrecQrel('307', '1001536', 1, '0'), 9: TrecQrel('307', '1029429', 1, '0'), 30029: TrecQrel('690', '996059', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/pmc.py ================================================ import re import unittest import ir_datasets from ir_datasets.datasets.pmc import PmcDoc, TrecCdsQuery, TrecCds2016Query from ir_datasets.formats import TrecQrel from .base import DatasetIntegrationTest _logger = ir_datasets.log.easy() class TestPmc(DatasetIntegrationTest): def test_pmc_docs(self): self._test_docs('pmc/v1', count=733111, items={ 0: PmcDoc('1469887', 'Environmental Health Perspectives', 'Neoplastic transformation of cultured mammalian cells by estrogens and estrogenlike chemicals.', re.compile('^Estrogens are clearly carcinogenic in humans and rodents but the mechanisms by which these hormones .{1529}tiple effects of estrogens acting together cause genetic alterations leading to cell transformation\\.$', flags=48), ''), 9: PmcDoc('1500872', 'Nucleic Acids Research', 'Metabolic regulation of \nApoB\n mRNA editing is associated with phosphorylation of APOBEC-1 complementation factor', re.compile('^Apolipoprotein B \\(\napoB\n\\) mRNA editing is a nuclear event that 
minimally requires the RNA substrate,.{1451}tion of ACF with APOBEC\\-1 and thereby increasing the probability of editosome assembly and activity\\.$', flags=48), re.compile('^INTRODUCTION\nApoB\n mRNA editing involves the site\\-specific deamination of cytidine 6666 to uridine w.{33462}modulate \napoB\n mRNA editing in the context of current models of editosome composition and assembly\\.$', flags=48)), 733110: PmcDoc('3808453', 'PLoS Computational Biology', 'The Influence of Synaptic Weight Distribution on Neuronal Population Dynamics', re.compile('^The manner in which different distributions of synaptic weights onto cortical neurons shape their sp.{1093}s enable the network to respond faster and with more stability in the face of external fluctuations\\.$', flags=48), re.compile('^Introduction\nExperiments analyzing the distribution of synaptic weights impinging onto neurons typic.{74264}ns obtained from different synaptic weight distributions\\.\n\\(PDF\\)\nClick here for additional data file\\.$', flags=48)), }) self._test_docs('pmc/v2', count=1255260, items={ 0: PmcDoc('4504085', 'Burns', 'Preventing childhood scalds within the home: Overview of systematic reviews and a systematic review of primary studies', re.compile('^Highlights\n•\nWe performed an overview of published systematic reviews and a systematic review of pri.{388}long with thermometers or thermostatic mixing valves is effective in reducing hot water temperature\\.$', flags=48), re.compile('^1\nIntroduction\nChildren are at particular risk of thermal injuries\\. 
Globally, thermal injuries are t.{26998}, engineering and educational approaches to reduce scalds risk\\.\nConflict of interest statement\nNone\\.$', flags=48)), 9: PmcDoc('4210750', 'Bundesgesundheitsblatt, Gesundheitsforschung, Gesundheitsschutz', 'Web-based questionnaires to capture acute infections in long-term cohorts', re.compile('^Background\nIncidence of acute respiratory infections \\(ARI\\) and gastrointestinal infections \\(GII\\) are.{2411}election bias\n is possible and must be kept in mind when discussing generalizability of the results\\.$', flags=48), re.compile('^Acute respiratory \\(ARI\\) and gastrointestinal infections \\(GII\\) have an important share of the overall.{18063}er forms of data collection as long as its limitations—specifically selection bias—are kept in mind\\.$', flags=48)), 1255259: PmcDoc('4486443', 'OMICS : a Journal of Integrative Biology', 'Towards Development of Clustering Applications for Large-Scale Comparative Genotyping and Kinship Analysis Using Y-Short Tandem Repeats', re.compile('^Abstract\nY\\-chromosome short tandem repeats \\(Y\\-STRs\\) are genetic markers with practical applications .{1186}ential for further development towards fully automatic clustering of any large\\-scale genotypic data\\.$', flags=48), re.compile('^Introduction\nY\\-\nchromosome short tandem repeats\n \\(Y\\-STRs\\) are a class of genetic markers found only .{32192}s, implementing the algorithm within the aforementioned clustering tools is experimentally feasible\\.$', flags=48)), }) def test_pmc_queries(self): self._test_queries('pmc/v1/trec-cds-2014', count=30, items={ 0: TrecCdsQuery('1', 'diagnosis', 'A 58-year-old African-American woman presents to the ER with episodic pressing/burning anterior chest pain that began two days earlier for the first time in her life. The pain started while she was walking, radiates to the back, and is accompanied by nausea, diaphoresis and mild dyspnea, but is not increased on inspiration. 
The latest episode of pain ended half an hour prior to her arrival. She is known to have hypertension and obesity. She denies smoking, diabetes, hypercholesterolemia, or a family history of heart disease. She currently takes no medications. Physical examination is normal. The EKG shows nonspecific changes.', '58-year-old woman with hypertension and obesity presents with exercise-related episodic chest pain radiating to the back.'), 9: TrecCdsQuery('10', 'diagnosis', 'A physician is called to see a 67-year-old woman who underwent cardiac catheterization via the right femoral artery earlier in the morning. She is now complaining of a cool right foot. Upon examination she has a pulsatile mass in her right groin with loss of distal pulses, and auscultation reveals a bruit over the point at which the right femoral artery was entered.', '67-year-old woman status post cardiac catheterization via right femoral artery, now with a cool, pulseless right foot and right femoral bruit.'), 29: TrecCdsQuery('30', 'treatment', 'A 72-year-old man complains of increasing calf pain when walking uphill. The symptoms have gradually increased over the past 3 months. The patient had an uncomplicated myocardial infarction 2 years earlier and a transient ischemic attack 6 months ago. Over the past month, his blood pressure has worsened despite previous control with diltiazem, hydrochlorothiazide, and propranolol. His is currently taking isosorbide dinitrate, hydrochlorothiazide, and aspirin. On physical examination, his blood pressure is 151/91 mm Hg, and his pulse is 67/min. There is a right carotid bruit. His lower extremities are slightly cool to the touch and have diminished pulses at the dorsalis pedis.', '72-year-old man with calf pain while walking uphill. History of ischemic heart disease and worsening hypertension despite medication compliance. 
On physical exam he has a right carotid bruit and his lower extremities are cool, with diminished dorsalis pedis pulses.'), }) self._test_queries('pmc/v1/trec-cds-2015', count=30, items={ 0: TrecCdsQuery('1', 'diagnosis', 'A 44 yo male is brought to the emergency room after multiple bouts of vomiting that has a "coffee ground" appearance. His heart rate is 135 bpm and blood pressure is 70/40 mmHg. Physical exam findings include decreased mental status and cool extremities. He receives a rapid infusion of crystalloid solution followed by packed red blood cell transfusion and is admitted to the ICU for further care.', 'A 44-year-old man with coffee-ground emesis, tachycardia, hypoxia, hypotension and cool, clammy extremities.'), 9: TrecCdsQuery('10', 'diagnosis', 'A 38 year old woman complains of severe premenstrual and menstrual pelvic pain, heavy, irregular periods and occasional spotting between periods. Past medical history remarkable for two years of infertility treatment and an ectopic pregnancy at age 26.', 'A 38 year old woman with severe dysmenorrhea, menorrhagia, and menometrorrhagia. PMH of infertility treatment and ectopic pregnancy'), 29: TrecCdsQuery('30', 'treatment', 'A 47 year old male who fell on his outstretched left arm presents with pain and bruising on the inside and outside of the elbow, swelling, and inability to bend the arm. On the x-ray, the ulna has dislocated posteriorly from the trochlea of the humerus. The radius has dislocated from the capitulum of the humerus.', 'A 47 year old male who fell on his outstretched left arm presents with pain, swelling, and inability to bend the arm. The x-ray, shows posterior elbow dislocation.'), }) self._test_queries('pmc/v2/trec-cds-2016', count=30, items={ 0: TrecCds2016Query('1', 'diagnosis', '\n 78 M w/ pmh of CABG in early [**Month (only) 3**] at [**Hospital6 4406**]\n (transferred to nursing home for rehab on [**12-8**] after several falls out\n of bed.) 
He was then readmitted to [**Hospital6 1749**] on\n [**3120-12-11**] after developing acute pulmonary edema/CHF/unresponsiveness?.\n There was a question whether he had a small MI; he reportedly had a\n small NQWMI. He improved with diuresis and was not intubated. \n .\n Yesterday, he was noted to have a melanotic stool earlier this evening\n and then approximately 9 loose BM w/ some melena and some frank blood\n just prior to transfer, unclear quantity.\n ', '78 M transferred to nursing home for rehab after CABG. Reportedly readmitted with a small NQWMI. Yesterday, he was noted to have a melanotic stool and then today he had approximately 9 loose BM w/ some melena and some frank blood just prior to transfer, unclear quantity.', 'A 78 year old male presents with frequent stools and melena.'), 9: TrecCds2016Query('10', 'diagnosis', '\n The patient is a 55-year-old woman with hepatic sarcoidosis and\n regenerative hyperplasia s/p TIPS [**10/3245**] placed [**1-27**] variceal bleeding\n and portal hypertensive gastropathy s/p TIPS re-do with angioplasty and\n portal vein embolectomy, who was brought to the ED by her husband for\n evaluation after he noted worsening asterixis. While in the waiting room \n the pt became more combative and then unresponsive. \n\n In the ED: VS - Temp 97.9F, HR 115, BP 122/80, R 18, O2-sat 98% 2L NC.\n She was unresponsive but able to protect her airway and so not\n intubated. She vomited x1 and received Zofran as well as 1.5 L NS. Labs\n were significant for K 5.5, BUN 46, Cr 2.2 (up from baseline of 0.8),\n and ammonia of 280. Stool was Guaiac negative. A urinalysis and CXR\n were done and are pending, and a FAST revealed\n hepatosplenomegaly but no intraperitoneal fluid. \n \n On arrival to the ICU the pt had another episode of emesis. 
NGT was\n placed to suction and 1.5L bilious material was drained.\n Allergies:\n Cipro (Oral) (Ciprofloxacin Hcl)\n Hives;\n Doxycycline\n Hives; hallucin\n Paxil (Oral) (Paroxetine Hcl)\n hair loss;\n Quinine\n Rash;\n Compazine (Injection) (Prochlorperazine Edisylate)\n muscle spasm;\n Levaquin (Oral) (Levofloxacin)\n tendinitis of t\n Lithium\n Hives;\n ', 'A 55y/o F with sarcoidosis, COPD, idiopathic cardiomyopathy with EF 40% and diastolic dysfunction, varices s/p TIPS and hypothyroidism presenting today with confusion. She was brought to the ED by her husband for evaluation after he noted worsening asterixis. While in the waiting room the pt became more combative and then unresponsive. In the ED: VS - Temp 97.9F, HR 115, BP 122/80, R 18, O2-sat 98% 2L NC. She was unresponsive but able to protect her airway and so not intubated. She vomited x1 and received Zofran as well as 1.5 L NS. Labs were significant for K 5.5, BUN 46, Cr 2.2 (up from baseline of 0.8), and ammonia of 280. Stool was Guaiac negative. A urinalysis and CXR were done and are pending, and a FAST revealed hepatosplenomegaly but no intraperitoneal fluid.', 'A 55-year-old woman with sarcoidosis, presenting today with confusion and worsening asterixis. In the waiting room, the pt became more combative and then unresponsive. Ammonia level 280 on admission.'), 29: TrecCds2016Query('30', 'treatment', '\n 85 y/o F with PMHx of HTN, HL, h/o breast CA and 3cm renal pelvis\n transitional cell tumor who presented for nephrectomy on [**2575-8-15**]. Her\n post op course was complicated by agitation thought due to narcotics.\n Today, she was restarted on her home meds and while on telemetry, pt\n was noted to be bradycardic to 40s. Pt was triggered for SBP of 70 and\n HR of 40 during which she remained asymptomatic. She was given 1L IVF\n and her HR/BP trended back up to baseline. 
However, there was a second\n event an hour later when she sat up and became bradycardic in the 30s\n with associated hypotension. Second episode occurred with position change\n and again, pt developped junctional rhythm in 30s. \n \n home meds:\n Verapamil 240mg daily\n Lisinopril 5mg\n Rosuvastatin 10mg\n Meclizine 25 TID PRN\n Imipramine 25 QHS\n Colace 100mg \n Loratidine 10mg daily\n\n Physical Examination\n T: 98 BP: 111/47 P: 74 R: 16 O2: 98% on 2L NC\n General: oriented to person only, NAD, comfortable\n HEENT: Sclera anicteric, dry MM, oropharynx clear\n Neck: supple, unable to appreciate JVP due to habitus\n Lungs: poor effort but [**Month (only) 199**] BS at bases and some audible airway\n secretion in upper airways\n CV: Regular rate and rhythm, no m/r/g, diff to auscult [**2-13**] habitus\n Abdomen: diffusely tender, bowel sounds present, multiple surgical\n incisions, clean dry and intact, abd binder in place\n GU: foley in place\n Ext: cool, no edema, 1+ pulses, pneumoboots in place\n ', '85 y/o F with PMHx of HTN, HL, h/o breast CA and 3cm renal pelvis transitional cell tumor who presented for nephrectomy. Her post op course was complicated by agitation thought due to narcotics. Today, she was restarted on her home meds and while on telemetry, pt was noted to be bradycardic to 40s. Pt was triggered for SBP of 70 and HR of 40 during which she remained asymptomatic. She was given 1L IVF and her HR/BP trended back up to baseline. However, there was a second event an hour later when she sat up and became bradycardic in the 30s with associated hypotension. 
Second episode occurred with position change and again, pt developped junctional rhythm in 30s.', 'An 85 year-old woman on verapamil presents with junctional heart rhythm in 30s with associated hypotension.'), }) def test_pmc_qrels(self): self._test_qrels('pmc/v1/trec-cds-2014', count=37949, items={ 0: TrecQrel('1', '1033658', 0, '0'), 9: TrecQrel('1', '1037001', 0, '0'), 37948: TrecQrel('30', '80153', 0, '0'), }) self._test_qrels('pmc/v1/trec-cds-2015', count=37807, items={ 0: TrecQrel('1', '1065003', 1, '0'), 9: TrecQrel('1', '117132', 0, '0'), 37806: TrecQrel('30', '64646', 2, '0'), }) self._test_qrels('pmc/v2/trec-cds-2016', count=37707, items={ 0: TrecQrel('1', '1036067', 0, '0'), 9: TrecQrel('1', '1160569', 0, '0'), 37706: TrecQrel('30', '65042', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/sara.py ================================================ import ir_datasets from ir_datasets.formats import GenericQuery, GenericDoc, TrecQrel from ir_datasets.datasets.sara import SaraDoc from .base import DatasetIntegrationTest import unittest import re class TestSara(DatasetIntegrationTest): def test_docs(self): self._test_docs('sara', count=129821, items={ 55555: SaraDoc('gladly-united-eagle', "----- Forwarded by Jeff Dasovich/NA/Enron on 12/21/2000 10:04 AM ----- Jeff Dasovich Sent by: Jeff Dasovich 12/20/2000 08:16 PM To: skean@enron.com, Richard Shapiro/NA/Enron@Enron, Susan J Mara/NA/Enron@ENRON, Sandra McCubbin/NA/Enron@Enron, Paul Kaufman/PDX/ECT@ECT, James D Steffes/NA/Enron@Enron, Harry Kingerski/NA/Enron@Enron, Wanda Curry/HOU/EES@EES, Dennis Benevides/HOU/EES@EES, Roger Yang/SFO/EES@EES, Scott Stoness/HOU/EES@EES, Mary Hain/HOU/ECT@ECT, Alan Comnes/PDX/ECT@ECT, Joe Hartsoe/Corp/Enron@ENRON, Sarah Novosel/Corp/Enron@ENRON, Mona L Petrochko/NA/Enron@Enron, Jennifer Rudolph/HOU/EES@EES, Eric Letke/DUB/EES@EES cc: Joseph Alamo/NA/Enron@Enron, Lysa Akin/PDX/ECT@ECT Subject: Call to 
Discuss California PUC Action We will set up a call-in number to relay to folks any actions the Commission takes tomorrow. The Commission meeting starts at 10 AM PST, but it's unclear when they will take up our issue. Since the press will likely have the place surrounded, they may decide to do that item first. We'll send out a notice with a call-in # and time as soon as we have the information. Best, Jeff", 0), 12345: SaraDoc('daily-hip-bass', "David: Enclosed is a new draft of the Kennecott amendment letter. Mark and I have discussed the counterparty termination language and have come up with the language that I have high;lighted. Upon your approval, I plan to use the same language for the Duke and Cinnergy letters. Carol St. Clair EB 3892 713-853-3989 (Phone) 713-646-3393 (Fax)", 0), 5432: SaraDoc('badly-sound-marmot', "Dear UT Team: As you know, we conducted our intern interviews at UT last week. We interviewed almost 50 candidates during round 1 (thanks to Chris Sherman, Jim Cole, Hunter Shively, Dwight Fruge', Stan Dowell, Rick Carson, and Kim Chick), and our round 2 interviewers (Rick Causey, Brent Price, Mark Lindsey, and Mike Deville) have selected the following 11 candidates for a summer internship: Cathy Wang 512-479-7264 Ameet Rane 512-505-2045 Michelle Yee 512-495-3264 Jessica Payne 512-499-8729 Wesley Thoman 512-343-8895 Pranav Gandhi 512-294-4311 (active in student government - elections to be held next Wednesday and Thursday) Daniel Payne 512-472-6739 Kruti Patel 512-356-2321 Rachel Ravanzo 512-689-3814 (also interested in the tax group - will decide between the 2 options, but was extended 1 offer) Vivek Shah 512-495-4066 Vini Adenwala 512-457-8744 I will send cultivation assignments next week, but in the mean time please feel free to call and congratulate these candidates. I have extended verbal offers to each of them and they should receive their offer letters by the end of next week. 
Thanks to each of our interviewers for their help and long hours! lexi 3-4585", 0), }) def test_queries(self): # Test that the dataset 'dummy' has 4 queries, and test the specific queries at indices 0 and 3 self._test_queries('sara', count=150, items={ 0: GenericQuery('1', 'Politicians that decide the plans from state to state'), 87: GenericQuery('88', 'Enron procurement'), 122: GenericQuery('123', 'Senators linked to enron') }) def test_qrels(self): # Test that the dataset 'dummy' has 60 qrels, and test the specific qrels at indices 0, 9, and 59 self._test_qrels('sara', count=800157, items={ 0: TrecQrel( query_id='1', doc_id= '201645', iteration= "0", relevance=0), 5: TrecQrel( query_id='3', doc_id= '175389', iteration= "0", relevance=0), 0: TrecQrel( query_id='1', doc_id= '201645', iteration= "0", relevance=0) }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/touche.py ================================================ from unittest import main from re import compile from ir_datasets.formats import ToucheQuery, TrecQrel, ToucheTitleQuery, \ ToucheComparativeQuery, ToucheQualityQrel, ToucheQualityCoherenceQrel, \ ToucheQualityComparativeStanceQrel, ToucheComparativeStance, \ ToucheControversialStanceQrel, ToucheControversialStance from test.integration.base import DatasetIntegrationTest class TestTouche(DatasetIntegrationTest): # noinspection PyTypeChecker def test_queries(self): self._test_queries( "argsme/2020-04-01/touche-2020-task-1", count=49, items={ 0: ToucheQuery( query_id="1", title="Should teachers get tenure?", description=compile("A user has heard that some countries do give teach.{159}teachers vs. 
university professors is of interest\."), narrative=compile("Highly relevant arguments make a clear statement a.{181}the situation of teachers' financial independence\."), ), 48: ToucheQuery( query_id="50", title="Should everyone get a universal basic income?", description=compile("Redistribution of wealth is a fundamental concept .{93}ver, a user wonders whether this truly would help\."), narrative=compile("Highly relevant arguments take a clear stance towa.{134}mentioning universal basic income only in passing\."), ), } ) self._test_queries( "argsme/1.0/touche-2020-task-1/uncorrected", count=49, items={ 0: ToucheQuery( query_id="1", title="Should teachers get tenure?", description=compile("A user has heard that some countries do give teach.{159}teachers vs. university professors is of interest\."), narrative=compile("Highly relevant arguments make a clear statement a.{181}the situation of teachers' financial independence\."), ), 48: ToucheQuery( query_id="50", title="Should everyone get a universal basic income?", description=compile("Redistribution of wealth is a fundamental concept .{93}ver, a user wonders whether this truly would help\."), narrative=compile("Highly relevant arguments take a clear stance towa.{134}mentioning universal basic income only in passing\."), ), } ) self._test_queries( "argsme/2020-04-01/touche-2020-task-1/uncorrected", count=49, items={ 0: ToucheQuery( query_id="1", title="Should teachers get tenure?", description=compile("A user has heard that some countries do give teach.{159}teachers vs. 
university professors is of interest\."), narrative=compile("Highly relevant arguments make a clear statement a.{181}the situation of teachers' financial independence\."), ), 48: ToucheQuery( query_id="50", title="Should everyone get a universal basic income?", description=compile("Redistribution of wealth is a fundamental concept .{93}ver, a user wonders whether this truly would help\."), narrative=compile("Highly relevant arguments take a clear stance towa.{134}mentioning universal basic income only in passing\."), ), } ) self._test_queries( "clueweb12/touche-2020-task-2", count=50, items={ 0: ToucheQuery( query_id="1", title="What is the difference between sex and love?", description=compile("A potentially younger user has heard people talk a.{147}ontrast, what characterizes a loving relationship\."), narrative=compile("Relevant documents will contain some description o.{155}f what people are looking for in either direction\."), ), 49: ToucheQuery( query_id="50", title="Whose salary is higher: basketball or soccer players?", description=compile("A young married couple raises a 14-year old boy wh.{313}income to players in different parts of the world\."), narrative=compile("Highly relevant documents provide information on a.{496}iptions of basketball and soccer are not relevant\."), ), } ) self._test_queries( "argsme/2020-04-01/touche-2021-task-1", count=50, items={ 0: ToucheTitleQuery( query_id="51", title="Do we need sex education in schools?" ), 49: ToucheTitleQuery( query_id="100", title="Do we need cash?" ), } ) self._test_queries( "clueweb12/touche-2021-task-2", count=50, items={ 0: ToucheQuery( query_id="51", title="What is better at reducing fever in children, Ibuprofen or Aspirin?", description=compile("Younger parents have their 8-year old child sick\. 
.{400}en and aspirin for reducing the fever in children\."), narrative=compile("Relevant documents will describe ibuprofen, aspiri.{258} or ingredients of the medicines are not relevant\."), ), 49: ToucheQuery( query_id="100", title="Should I learn Python or R for data analysis?", description=compile("Wondering whether you should use Python or R for d.{318}ore useful, flexible, easy to learn and efficient\."), narrative=compile("Relevant documents should compare two programming .{430}re not related to data analysis, are not relevant\."), ), } ) self._test_queries( "argsme/2020-04-01/processed/touche-2022-task-1", count=50, items={ 0: ToucheQuery( query_id="1", title="Should teachers get tenure?", description=compile("A user has heard .{215} professors is of interest."), narrative=compile("Highly relevant arguments make a clear statement about tenure for teachers in schools or universities. Relevant arguments consider tenure more generally, not specifically for teachers, or, instead of talking about tenure, consider the situation of teachers' financial independence."), ), 49: ToucheQuery( query_id="50", title="Should everyone get a universal basic income?", description=compile("Redistribution of wealth is a fundamental concept of many economies and social systems. A key component might be a universal basic income, however, a user wonders whether this truly would help."), narrative=compile("Highly relevant arguments take a clear stance toward the universal basic income, giving clear premises. 
Relevant arguments offer only emotional arguments, or talk about minimum wages, mentioning universal basic income only in passing."), ), } ) self._test_queries( "clueweb12/touche-2022-task-2", count=50, items={ 0: ToucheComparativeQuery( query_id="2", title="Which is better, a laptop or a desktop?", objects=("laptop", "desktop"), description=compile("A user wants to buy .{272} gaming desktop PC."), narrative=compile("Highly relevant documents will describe the major similarities and dissimilarities of laptops and desktops along with the respective advantages and disadvantages of specific usage scenarios. A comparison of the technical and architectural characteristics without personal opinion, recommendation, or pros/cons is not relevant."), ), 49: ToucheComparativeQuery( query_id="100", title="Should I learn Python or R for data analysis?", objects=("Python", "R"), description=compile("Wondering whether you should use Python or R for data analysis\? It's hard to know whether to use Python or R for data analysis. And that's especially true if you're a newbie data analyst looking for the right language to start with. Users are looking for documents that help them decide which programming language, Python or R, is better suited for data analysis, is more useful, flexible, easy to learn and efficient."), narrative=compile("Relevant documents should compare two programming languages for data analysis: Python and R. Highly relevant documents should compare the two objects in terms of efficiency, usefulness, easiness to learn and so on. Highly relevant documents would ideally compare most of the features. The documents that only describe one language, but provide insights on how it is good for data analysis are relevant. 
Arguments and opinions of using one of the two or both for other tasks that are not related to data analysis, are not relevant."), ), } ) self._test_queries( "touche-image/2022-06-13/touche-2022-task-3", count=50, items={ 0: ToucheQuery( query_id="1", title="Should teachers get tenure?", description=compile("A user has heard .{215} professors is of interest."), narrative=compile("Highly relevant arguments make a clear statement about tenure for teachers in schools or universities. Relevant arguments consider tenure more generally, not specifically for teachers, or, instead of talking about tenure, consider the situation of teachers' financial independence."), ), 49: ToucheQuery( query_id="50", title="Should everyone get a universal basic income?", description=compile("Redistribution of wealth is a fundamental concept of many economies and social systems. A key component might be a universal basic income, however, a user wonders whether this truly would help."), narrative=compile("Highly relevant arguments take a clear stance toward the universal basic income, giving clear premises. 
Relevant arguments offer only emotional arguments, or talk about minimum wages, mentioning universal basic income only in passing."),
                ),
            }
        )

    # noinspection PyTypeChecker
    def test_qrels(self):
        # Spot-check each Touché shared-task qrels file: total count plus the
        # first and last judgments. Qrel record types vary by task year:
        # plain TrecQrel (2020), +quality (2021), +coherence / comparative
        # stance / controversial stance (2022 tasks 1-3).
        self._test_qrels(
            "argsme/2020-04-01/touche-2020-task-1",
            count=2298,
            items={
                0: TrecQrel(
                    query_id="1",
                    doc_id="S197beaca-A971412e6",
                    relevance=0,
                    iteration="0"
                ),
                2297: TrecQrel(
                    query_id="50",
                    doc_id="Sffdf2e2e-A307df259",
                    relevance=2,
                    iteration="0"
                ),
            }
        )
        # The "uncorrected" variants keep the raw assessor output; note the
        # negative relevance grade and the literal "Q0" iteration field below.
        self._test_qrels(
            "argsme/1.0/touche-2020-task-1/uncorrected",
            count=2964,
            items={
                0: TrecQrel(
                    query_id="1",
                    doc_id="197beaca-2019-04-18T11:28:59Z-00001-000",
                    relevance=4,
                    iteration="0"
                ),
                2963: TrecQrel(
                    query_id="50",
                    doc_id="799d051-2019-04-18T11:47:02Z-00000-000",
                    relevance=-2,
                    iteration="Q0"
                ),
            }
        )
        self._test_qrels(
            "argsme/2020-04-01/touche-2020-task-1/uncorrected",
            count=2298,
            items={
                0: TrecQrel(
                    query_id="1",
                    doc_id="S21dc5a14-A8b896cb0",
                    relevance=4,
                    iteration="0"
                ),
                2297: TrecQrel(
                    query_id="50",
                    doc_id="Sffdf2e2e-A307df259",
                    relevance=2,
                    iteration="0"
                ),
            }
        )
        self._test_qrels(
            "clueweb12/touche-2020-task-2",
            count=1783,
            items={
                0: TrecQrel(
                    query_id="1",
                    doc_id="clueweb12-0001wb-05-12311",
                    relevance=0,
                    iteration="0"
                ),
                1782: TrecQrel(
                    query_id="50",
                    doc_id="clueweb12-0206wb-00-16297",
                    relevance=0,
                    iteration="0"
                ),
            }
        )
        self._test_qrels(
            "argsme/2020-04-01/touche-2021-task-1",
            count=3711,
            items={
                0: ToucheQualityQrel(
                    query_id="94",
                    doc_id="S522c7c3b-A8a87130b",
                    relevance=2,
                    quality=2,
                    iteration="0"
                ),
                3710: ToucheQualityQrel(
                    query_id="91",
                    doc_id="Sf0770da-A760eca8e",
                    relevance=0,
                    quality=1,
                    iteration="0"
                ),
            }
        )
        self._test_qrels(
            "clueweb12/touche-2021-task-2",
            count=2076,
            items={
                0: ToucheQualityQrel(
                    query_id="54",
                    doc_id="clueweb12-0205wb-64-11095",
                    relevance=0,
                    quality=0,
                    iteration="0"
                ),
                2075: ToucheQualityQrel(
                    query_id="86",
                    doc_id="clueweb12-0008wb-85-29079",
                    relevance=0,
                    quality=0,
                    iteration="0"
                ),
            }
        )
        # 2022 task 1 judges sentence pairs; doc IDs join two sentence IDs
        # with a comma.
        self._test_qrels(
            "argsme/2020-04-01/processed/touche-2022-task-1",
            count=6841,
            items={
                0: ToucheQualityCoherenceQrel(
                    query_id="1",
                    doc_id="Sc065954f-Ae72bc9c6__PREMISE__41,Sc065954f-Ae72bc9c6__CONC__1",
                    relevance=2,
                    quality=2,
                    coherence=2,
                    iteration="0"
                ),
                6840: ToucheQualityCoherenceQrel(
                    query_id="50",
                    doc_id="Sffdf2e2e-A20e9dd06__PREMISE__4,Sffdf2e2e-A20e9dd06__PREMISE__5",
                    relevance=1,
                    quality=1,
                    coherence=1,
                    iteration="0"
                ),
            }
        )
        self._test_qrels(
            "clueweb12/touche-2022-task-2",
            count=2107,
            items={
                0: ToucheQualityComparativeStanceQrel(
                    query_id="12",
                    doc_id="clueweb12-0002wb-18-34442___2",
                    relevance=0,
                    quality=2,
                    stance=ToucheComparativeStance.NO,
                    iteration="0"
                ),
                2106: ToucheQualityComparativeStanceQrel(
                    query_id="70",
                    doc_id="clueweb12-1900tw-42-07368___7",
                    relevance=1,
                    quality=1,
                    stance=ToucheComparativeStance.NO,
                    iteration="0"
                ),
            }
        )
        self._test_qrels(
            "touche-image/2022-06-13/touche-2022-task-3",
            count=19821,
            items={
                0: ToucheControversialStanceQrel(
                    query_id="1",
                    doc_id="Ib7fc7d5f8ee59d62",
                    relevance=1,
                    stance=ToucheControversialStance.ONTOPIC,
                ),
                19820: ToucheControversialStanceQrel(
                    query_id="50",
                    doc_id="I490ab3908d308757",
                    relevance=1,
                    stance=ToucheControversialStance.CON,
                ),
            }
        )


if __name__ == "__main__":
    main()


================================================ FILE: test/integration/touche_image.py ================================================
from re import compile
from unittest import main
from ir_datasets.formats import ToucheImageDoc
from test.integration.base import DatasetIntegrationTest


class TestToucheImage(DatasetIntegrationTest):
    # noinspection PyTypeChecker
    def test_docs(self):
        # Spot-check the first and last image documents; binary payloads are
        # matched against magic-byte regex prefixes (PNG / RIFF-WEBP headers).
        self._test_docs("touche-image/2022-06-13", count=23841, items={
            0: ToucheImageDoc(
                doc_id="I000330ba4ea0ad13",
                png=compile(b"\x89PNG.*"),
                webp=compile(b"RIFF\xd0\xf3\x05\x00WEBPVP8.*"),
                url="https://www.e-dmj.org/upload//thumbnails/dmj-2020-0258f3.jpg",
                phash="1000000011001011011101010011101010010111011010101000011101101100",
                pages=[]
            ),
            23840: ToucheImageDoc(
                doc_id="Iffff8be6926a808e",
                png=compile(b"\x89PNG.*"), 
webp=compile(b"RIFF\x0e\\+\x00\x00WEBPVP8.*"), url="https://assets.pewresearch.org/wp-content/uploads/sites/11/2012/07/death-penalty-2011-1.png", phash="0001011011111110101001010100010110101011101000101111000001110000", pages=[] ), }) if __name__ == "__main__": main() ================================================ FILE: test/integration/trec_arabic.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel, TrecDoc, TrecQuery from .base import DatasetIntegrationTest class TestTrecArabic(DatasetIntegrationTest): def test_trec_arabic_docs(self): self._test_docs('trec-arabic', count=383872, items={ 0: TrecDoc('19940513_AFP_ARB.0001', re.compile('^\n\n \\&HT; العلم الفلسطيني لم يُرفع فوق كنيس اريحا \n\n\n\n اريحا \\(الضفة الغربية\\) 31\\-5 \\(اف ب\\)\\- يقوم احد .{783}ارا عن استيائهم وغضبهم من ذلك ونظموا تظاهرات عدة الا ان الشرطة الاسرائيلية قامت كل مرة بتفريقهم\\. \n\n\n$', flags=48), re.compile('^\n \\&HT; العلم الفلسطيني لم يُرفع فوق كنيس اريحا \n\n\n

\n اريحا \\(الضفة ال.{858}ائهم وغضبهم من ذلك ونظموا تظاهرات عدة الا ان الشرطة الاسرائيلية قامت كل مرة بتفريقهم\\. \n

\n
\n$', flags=48)), 9: TrecDoc('19940513_AFP_ARB.0010', re.compile('^\n\n \\&HT; وفد الجامعة \\.\\. \n\n\n\n صنعاء \\- وكانت الجامعة العربية استبعدت فكرة ارسال قوة فصل خلال اجتماعه.{1334}ب في حين صرح قائد البحرية الجنوبية العقيد علي قاسم طالب بانه تم صد الهجوم الشمالي في هذا القطاع\\. \n\n\n$', flags=48), re.compile('^\n \\&HT; وفد الجامعة \\.\\. \n\n\n

\n صنعاء \\- وكانت الجامعة العربية استبعدت فك.{1423}ح قائد البحرية الجنوبية العقيد علي قاسم طالب بانه تم صد الهجوم الشمالي في هذا القطاع\\. \n

\n
\n$', flags=48)), 383871: TrecDoc('20001220_AFP_ARB.0148', re.compile('^\n\nبوش يختار مهاجرا كوبيا وزيرا للاسكان\n\n\n\n اوستن \\(الولايات المتحدة\\) 02\\-21 \\(اف ب\\)\\- اعلن الرئيس المنتخ.{642}بير في قضية الطفل الكوبي اللاجىء أليان غونزاليس من خلال مشاركته في جهود منع عودة الطفل الى كوبا\\. \n\n\n$', flags=48), re.compile('^\nبوش يختار مهاجرا كوبيا وزيرا للاسكان\n\n\n

\n اوستن \\(الولايات المتحدة\\) 02\\-.{710} الطفل الكوبي اللاجىء أليان غونزاليس من خلال مشاركته في جهود منع عودة الطفل الى كوبا\\. \n

\n
\n$', flags=48)), }) def test_trec_arabic_queries(self): self._test_queries('trec-arabic/ar2001', count=25, items={ 0: TrecQuery(query_id='1', title='فنون العرض و المؤسسات الاسلامية في العالم العربي', description='ما هو اثر المؤسسات الاسلامية على فنون العرض مثل الرقص\n و الموسيقى في العالم العربي؟', narrative='المقالات المتعلقة بالفنون الرياضية او التشكيلية\n او بفنون العرض خارج العالم العربي \nاو بالسلوكيات الدينية خارج اطار فنون العرض \nاو بالديون و القروض المالية لا علاقة لها بالموضوع'), 9: TrecQuery(query_id='10', title='القضاء على شلل الأطفال في الشرق الأوسط', description='هل تحقق أي تقدم في القضاء على شلل الأطفال \n في الشرق الأوسط؟', narrative='ان المقالات التي تتحدث عن القضاء على شلل الاطفال عالميا\n او في مناطق اخرى في العالم من غير ذكر الشرق الاوسط\n ليست لها علاقة بالموضوع\n و يشمل الشرق الاوسط هنا الصومال و افغانستان و باكستان'), 24: TrecQuery(query_id='25', title='الدور الاوروبى والامريكى فى عملية السلام فى الشرق الأوسط', description='ما هي أدوار الدول الأوربية وامريكا في عملية السلام فى الشرق\n الأسط؟', narrative='يتعلق بهذا الموضوع كل مقال يخص التدخل الاوروبى والامريكى في\n القرارات العربية لتوجيه عملية السلام في الشرق الاوسط\n وما لا يرتبط بهذا الموضوع هو التدخل الاوروبى و الامريكى فى \nالشؤون الداخلية لدول الشرق الاوسط') }) self._test_queries('trec-arabic/ar2002', count=50, items={ 0: TrecQuery(query_id='26', title='مجلس المقاومة الوطني الكردستاني', description='كيف ينظر مجلس المقاومة الوطنية الى الإستقلال المحتمل للاكراد؟', narrative='الموضوع يتضمن نصوص متعلقة بتحركات مجلس المقاومة الوطنية ، مقالات تتحدث عن قيادة اوجلان ضمن جهود الاكراد للاستقلال .'), 9: TrecQuery(query_id='35', title='ميناء سلمان وام قصر', description='ما أهمية العلاقة التجارية البحرية بين مينء سلمان في البحرين\n وأم قصر في العراق؟', narrative='المقالات التي اعتبرت مناسبة تناقش المصالح الاقتصادية في وجود\n خط تجاري بحري بين ميناء سلمان في البحرين و ام قصر في العراق'), 49: TrecQuery(query_id='75', title='فيروسات الكمبيوتر في الوطن العربي', description='ما هو تأثير فيروسات 
الكمبيوتر على القطاع الصناعي والتجاري في الدول العربية؟', narrative='المقالات التي اعتبرت جزءا من موضوع البحث تناولت المشاكل الناجمة\n عن فيروسات الكمبيوتر في الدول العربية. المقالات التي تناولت الآثار\n الناجمة عن الفيروسات في دول اخرى اعتبرت خارج نطاق البحث') }) def test_trec_arabic_qrels(self): self._test_qrels('trec-arabic/ar2001', count=22744, items={ 0: TrecQrel(query_id='1', doc_id='19940515_AFP_ARB.0095', relevance=0, iteration='0'), 9: TrecQrel(query_id='1', doc_id='19940606_AFP_ARB.0009', relevance=0, iteration='0'), 22743: TrecQrel(query_id='25', doc_id='20001220_AFP_ARB.0044', relevance=0, iteration='0') }) self._test_qrels('trec-arabic/ar2002', count=38432, items={ 0: TrecQrel(query_id='26', doc_id='19940515_AFP_ARB.0115', relevance=0, iteration='0'), 9: TrecQrel(query_id='26', doc_id='19940519_AFP_ARB.0114', relevance=0, iteration='0'), 38431: TrecQrel(query_id='75', doc_id='20001211_AFP_ARB.0042', relevance=0, iteration='0') }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_cast.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel, GenericDoc, GenericScoredDoc from ir_datasets.datasets.trec_cast import Cast2019Query, Cast2020Query, Cast2021Query, Cast2022Query, CastDoc, CastPassage, CastPassageDoc from .base import DatasetIntegrationTest class TestTrecCast(DatasetIntegrationTest): def test_docs(self): self._test_docs('trec-cast/v1', count=None, items={ 0: GenericDoc(doc_id='MARCO_0', text='The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. 
The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'), 49: GenericDoc(doc_id='MARCO_49', text='Color—urine can be a variety of colors, most often shades of yellow, from very pale or colorless to very dark or amber. Unusual or abnormal urine colors can be the result of a disease process, several medications (e.g., multivitamins can turn urine bright yellow), or the result of eating certain foods.') }) self._test_docs('trec-cast/v2', count=None, items={ 0: CastDoc(doc_id='MARCO_D1555982', title='The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?', url='https://answers.yahoo.com/question/index?qid=20071007114826AAwCFvR', passages=['Science & Mathematics Physics The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.? It is a good approximation to assume that the emissivity e is equal to 1 for these surfaces. Find the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical. Use σ =... show more Follow 3 answers Answers Relevance Rating Newest Oldest Best Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) = √ ( 2.7x10+32 W / (1 · 5.67x10-8W/m²K^4 · (1100K)^4 · π) ) = 3.22x10+13 m Source (s):http://en.wikipedia.org/wiki/Stefan_bolt...schmiso · 1 decade ago0 18 Comment Schmiso, you forgot a 4 in your answer. Your link even says it: L = 4pi (R^2)sigma (T^4). Using L, luminosity, as the energy in this problem, you can find the radius R by doing sqrt (L/ (4pisigma (T^4)). Hope this helps everyone. 
Caroline · 4 years ago4 1 Comment (Stefan-Boltzmann law) L = 4pi*R^2*sigma*T^4 Solving for R we get: => R = (1/ (2T^2)) * sqrt (L/ (pi*sigma)) Plugging in your values you should get: => R = (1/ (2 (11,000K)^2)) *sqrt ( (2.7*10^32W)/ (pi * (5.67*10^-8 W/m^2K^4))) R = 1.609 * 10^11 m? · 3 years ago0 1 Comment Maybe you would like to learn more about one of these? Want to build a free website? Interested in dating sites? Need a Home Security Safe? How to order contacts online?']), 49: CastDoc(doc_id='MARCO_D1256481', title='NIST definition for SaaS, PaaS, IaaS', url='https://cloudinfosec.wordpress.com/2013/05/04/nist-definition-for-saas-paas-iaas/', passages=['Software as a Service (Saa S) — The capability provided to the consumer is to use the provider’s applications running on a cloud infrastructure. The applications are accessible from various client devices through a thin client interface such as a web browser (e.g., web-based email). The consumer does not manage or control the underlying cloud infrastructure including network, servers, operating systems, storage, or even individual application capabilities, with the possible exception of limited user-specific application configuration settings. Examples: Gov-Apps, Internet Services Blogging/Surveys/Twitter, Social Networking Information/Knowledge Sharing (Wiki)Communication (e-mail), Collaboration (e-meeting)Productivity Tools (office)Enterprise Resource Planning (ERP)Platform as a Service (Paa S) — The capability provided to the consumer is to deploy onto the cloud infrastructure consumer-created or acquired applications created using programming languages and tools supported by the provider. The consumer does not manage or control the underlying cloud infrastructure including network, servers, operating systems, or storage, but has control over the deployed applications and possibly application hosting environment configurations. Examples: Application Development, Data, Workflow, etc. 
Security Services (Single Sign-On, Authentication, etc. )Database Management Directory Services Infrastructure as a Service (Iaa S) — The capability provided to the consumer is to provision processing, storage, networks, and other fundamental computing resources where the consumer is able to deploy and run arbitrary software, which can include operating systems and applications. The consumer does not manage or control the underlying cloud infrastructure but has control over operating systems, storage, deployed applications, and possibly limited control of select networking components (e.g., host firewalls). Examples: Mainframes, Servers, Storage IT Facilities/Hosting Services Advertisements Share this: Twitter Facebook Loading...']) }) self._test_docs('trec-cast/v2/passages', count=None, items={ 0: CastPassageDoc(doc_id='MARCO_D1555982-1', title='The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?', url='https://answers.yahoo.com/question/index?qid=20071007114826AAwCFvR', text='Science & Mathematics Physics The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.? It is a good approximation to assume that the emissivity e is equal to 1 for these surfaces. Find the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical. Use σ =... 
show more Follow 3 answers Answers Relevance Rating Newest Oldest Best Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) = √ ( 2.7x10+32 W / (1 · 5.67x10-8W/m²K^4 · (1100K)^4 · π) ) = 3.22x10+13 m Source (s):http://en.wikipedia.org/wiki/Stefan_bolt...schmiso · 1 decade ago0 18 Comment Schmiso, you forgot a 4 in your answer. Your link even says it: L = 4pi (R^2)sigma (T^4).'), 49: CastPassageDoc(doc_id='MARCO_D1311240-14', title='President Roosevelt Led US To Victory In World War 2', url='http://vanrcook.tripod.com/presidentroosevelt.htm', text='President Roosevelt died on April 12, 1945, a little over two weeks before the death of Hitler and a month before Germany surrendered and the European part of World War 2 ended. After his death, Churchill wrote Eleanor: "I have lost a dear and cherished friendship which was forged in the fire of war. "Overall, Roosevelt was a wonderful leader for America. He pulled us through both the great depression of the 30s and the World War 2. He made his share of mistakes. For example, there is no doubt that he waffled on the Jewish problem even as news of German atrocities toward the Jews began to be publicized. However, when you look at how the other American leaders have handled different crisis in the twentieth century, he was, in my opinion, clearly superior to all the others. Web Sites: President Roosevelt In World War 2.1. Germany in World War 2 . Germany fought long and hard in World War 2 but the U. S., Great Britain, and Russia were too smart and tough.2. American Generals - World War 2. Roosevelt had some great generals working with him in World War 2. They had to be great! They were facing fine German generals who knew new warfare tactics, e.g., blitzkrieg.3. 
Pacific War.') }) self._test_docs('trec-cast/v3', count=None, items={ 0: CastPassageDoc(doc_id='MARCO_00_0-1', title='0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', url='http://0-60.reviews/0-60-times/', text='0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews 0-60 Times There are many ways to measure the power a vehicle has – top speed, horsepower, foot-pounds of torque. Those are all important, but the most asked question is, “What’s the 0-60 time?” This is nothing more than a measure of how quickly a vehicle can reach the 60 mile per hour mark. It is a measure of acceleration of a vehicle. 0-60 times differ a great deal depending on the amount of power a motor puts out, of course. But anyone who spends any amount of time with car enthusiasts are sure to hear the ubiquitous term bantered around more often than most other metrics by which cars are measured in terms of power. The only other measure that comes close as far as how acceleration is commonly measures in cars in the United States is the quarter mile time. Enthusiasts will often ask about how quickly a car can get through a quarter mile, but that can be seen as less accurate a estimate of acceleration than the amount of time it takes a vehicle to reach the sixty miler per hour mark. The quarter mile time can often have more variable such as driver experience.'), 49: CastPassageDoc(doc_id='MARCO_00_44206-10', title='', url='http://001yourtranslationservice.com/kenax/Translators/Resources/TimeZones.htm', text='A year, which works out to 365 days (with some leap years, because our means of creating time is not exact), is the number of times the earth spins around its own axis (creating what we call a day) while the earth revolves in its orbit around the sun to come back to the same place it started at. This is how the Sumerians defined time. 
But because the world is a round globe, the beginning of night and day is different depending on where you are located in the world. What is high noon for someone in the US would be pitch black midnight for someone in China, on the other side of the world. Therefore, over time, we humans started to draw imaginary lines on the planet, cutting up the planet into 24 parts, one for each hour of the day. In the days of old, people used a sundial to tell the time, which is basically a solar clock. A little stick at a certain angle which would cast a shadow as the sun, from our perspective, would revolve around our planet.') }) def test_queries(self): self._test_queries('trec-cast/v0/train', count=269, items={ 0: Cast2019Query('1_1', "What is a physician's assistant?", 1, 1, "Career choice for Nursing and Physician's Assistant", "Considering career options for becoming a physician's assistant vs a nurse. Discussion topics include required education (including time, cost), salaries, and which is better overall."), 9: Cast2019Query('1_10', 'Is a PA above a NP?', 1, 10, "Career choice for Nursing and Physician's Assistant", "Considering career options for becoming a physician's assistant vs a nurse. Discussion topics include required education (including time, cost), salaries, and which is better overall."), 268: Cast2019Query('30_7', 'Tell me about how I can share files.', 30, 7, 'Linux and Windows', 'A comparison of Windows and Linux, followed by some tips regarding software installation etc.'), }) self._test_queries('trec-cast/v0/train/judged', count=120, items={ 0: Cast2019Query('1_1', "What is a physician's assistant?", 1, 1, "Career choice for Nursing and Physician's Assistant", "Considering career options for becoming a physician's assistant vs a nurse. 
Discussion topics include required education (including time, cost), salaries, and which is better overall."), 9: Cast2019Query('1_10', 'Is a PA above a NP?', 1, 10, "Career choice for Nursing and Physician's Assistant", "Considering career options for becoming a physician's assistant vs a nurse. Discussion topics include required education (including time, cost), salaries, and which is better overall."), 119: Cast2019Query('30_7', 'Tell me about how I can share files.', 30, 7, 'Linux and Windows', 'A comparison of Windows and Linux, followed by some tips regarding software installation etc.'), }) self._test_queries('trec-cast/v1/2019', count=479, items={ 0: Cast2019Query('31_1', 'What is throat cancer?', 31, 1, 'head and neck cancer', 'A person is trying to compare and contrast types of cancer in the throat, esophagus, and lungs.'), 9: Cast2019Query('32_1', 'What are the different types of sharks?', 32, 1, 'sharks', 'Information about sharks including several of the main types of sharks, their biological properties including size (whether they have teeth), as well as adaptations. This includes difference between sharks and whales.'), 478: Cast2019Query('80_10', 'What was the impact of the expedition?', 80, 10, 'Lewis and Clark expedition', 'Information about the Lewis and Clark expedition, findings, and its significance in US history.'), }) self._test_queries('trec-cast/v1/2019/judged', count=173, items={ 0: Cast2019Query('31_1', 'What is throat cancer?', 31, 1, 'head and neck cancer', 'A person is trying to compare and contrast types of cancer in the throat, esophagus, and lungs.'), 9: Cast2019Query('32_1', 'What are the different types of sharks?', 32, 1, 'sharks', 'Information about sharks including several of the main types of sharks, their biological properties including size (whether they have teeth), as well as adaptations. 
This includes difference between sharks and whales.'), 172: Cast2019Query('79_9', 'What are modern examples of conflict theory?', 79, 9, 'sociology', 'Information about the field of sociology including important people, theories, and how they relate to one another.'), }) self._test_queries('trec-cast/v1/2020', count=216, items={ 0: Cast2020Query('81_1', 'How do you know when your garage door opener is going bad?', 'How do you know when your garage door opener is going bad?', 'How do you know when your garage door opener is going bad?', 'MARCO_5498474', 81, 1), 9: Cast2020Query('82_2', 'What are the pros and cons?', 'What are the pros and cons of GMO Food labeling?', 'What are the pros and cons of GMO food labeling?', 'CAR_bafb3c1c72e23c444e182cac4e0ea9e4330d21c9', 82, 2), 215: Cast2020Query('105_9', 'What else motivates the Black Lives Matter movement?', 'What else motivates the Black Lives Matter movement?', 'What else motivates the Black Lives Matter movement?', 'MARCO_801480', 105, 9), }) self._test_queries('trec-cast/v1/2020/judged', count=208, items={ 0: Cast2020Query('81_1', 'How do you know when your garage door opener is going bad?', 'How do you know when your garage door opener is going bad?', 'How do you know when your garage door opener is going bad?', 'MARCO_5498474', 81, 1), 9: Cast2020Query('82_2', 'What are the pros and cons?', 'What are the pros and cons of GMO Food labeling?', 'What are the pros and cons of GMO food labeling?', 'CAR_bafb3c1c72e23c444e182cac4e0ea9e4330d21c9', 82, 2), 207: Cast2020Query('105_9', 'What else motivates the Black Lives Matter movement?', 'What else motivates the Black Lives Matter movement?', 'What else motivates the Black Lives Matter movement?', 'MARCO_801480', 105, 9), }) self._test_queries('trec-cast/v2/2021', count=239, items={ 0: Cast2021Query('106_1', 'I just had a breast biopsy for cancer. 
What are the most common types?', 'What are the most common types of cancer in regards to breast biopsy?', 'I just had a breast biopsy for cancer. What are the most common types of breast cancer?', 'MARCO_D59865', 106, 1), 9: Cast2021Query('106_10', 'Does freezing work?', 'Does freezing work for breast cancer?', 'Does freezing tumors work as an alternative to surgery for stage 1 invasive lobular cancer?', 'MARCO_D909677', 106, 10), 238: Cast2021Query('131_10', 'How is it different from a heat pump?', 'How is an AC compressor different from a heat pump?', 'How is an AC system different from a heat pump?', 'MARCO_D981398', 131, 10), }) self._test_queries('trec-cast/v3/2022', count=408, items={ 0: Cast2022Query('132_1-1', None, 'User', 'I remember Glasgow hosting COP26 last year, but unfortunately I was out of the loop. What was it about?', 'I remember Glasgow hosting COP26 last year, but unfortunately I was out of the loop. What was the conference about?', None, [], 132, '1-1'), 9: Cast2022Query(query_id='132_2-2', parent_id='132_2-1', participant='System', raw_utterance=None, manual_rewritten_utterance=None, response='For several years, there have been concerns that climate change negotiations will essentially ignore a key principle of climate change negotiation frameworks: the common but differentiated responsibilities. Realizing that greenhouse emissions remain in the atmosphere for a very long time, this principle recognizes that historically: Industrialized nations have emitted far more greenhouse gas emissions (even if some developing nations are only now increasing theirs); Rich countries, therefore, face the biggest responsibility and burden for action to address climate change; and Rich countries, therefore, must support developing nations adapt—through financing and technology transfer, for example. 
This notion of climate justice is typically ignored by many rich nations and their mainstream media, making it easy to blame China, India and other developing countries for failures in climate change mitigation negotiations.', provenance=['MARCO_06_772219573-9'], topic_number=132, turn_number='2-2'), 407: Cast2022Query(query_id='149_4-2', parent_id='149_4-1', participant='System', raw_utterance=None, manual_rewritten_utterance=None, response='Consider using a number of different search engines. While Google is a globally recognized search engine and an industry giant, in fact, even the second biggest “search engine” is Google Images, according to this study: Search engine market share Even if it’s the biggest and most well known, it doesn’t mean it’s your only choice. One of the main reasons that people choose to use an alternative search engine instead is for increased privacy, as Google is known to track user data both for its own and third-party use. If you’ve only ever used Google, check out some of the other search engines and you might find something that you prefer. Bing Bing search engine Microsoft’s Bing is the second largest search engine after Google. It’s easy to use and provides a more visual experience with beautiful daily background photos. Bing is great for video searches, as it displays results as large thumbnails that can be previewed with sound by hovering over them. DuckDuckGo is a popular search engine for those who value their privacy and are put off by the thought of their every query being tracked and logged.It has a very clean interface with minimal ads and infinite scrolling, so the user experience is nice and streamlined. 
There’s absolutely zero user tracking, and you can even add DuckDuckGo’s extension to your browser to keep your activity private.', provenance=['MARCO_23_183358734-1', 'MARCO_23_183358734-2', 'MARCO_23_183358734-31'], topic_number=149, turn_number='4-2'), }) def test_qrels(self): self._test_qrels('trec-cast/v0/train', count=2399, items={ 0: TrecQrel('1_1', 'MARCO_955948', 2, '0'), 9: TrecQrel('1_1', 'MARCO_4903530', 0, '0'), 2398: TrecQrel('30_7', 'MARCO_4250016', 0, '0'), }) self._test_qrels('trec-cast/v0/train/judged', count=2399, items={ 0: TrecQrel('1_1', 'MARCO_955948', 2, '0'), 9: TrecQrel('1_1', 'MARCO_4903530', 0, '0'), 2398: TrecQrel('30_7', 'MARCO_4250016', 0, '0'), }) self._test_qrels('trec-cast/v1/2019', count=29350, items={ 0: TrecQrel('31_1', 'CAR_116d829c4c800c2fc70f11692fec5e8c7e975250', 0, 'Q0'), 9: TrecQrel('31_1', 'CAR_40c64256e988c8103550008f4e9b7ce436d9536d', 2, 'Q0'), 29349: TrecQrel('79_9', 'MARCO_8795237', 3, 'Q0'), }) self._test_qrels('trec-cast/v1/2019/judged', count=29350, items={ 0: TrecQrel('31_1', 'CAR_116d829c4c800c2fc70f11692fec5e8c7e975250', 0, 'Q0'), 9: TrecQrel('31_1', 'CAR_40c64256e988c8103550008f4e9b7ce436d9536d', 2, 'Q0'), 29349: TrecQrel('79_9', 'MARCO_8795237', 3, 'Q0'), }) self._test_qrels('trec-cast/v1/2020', count=40451, items={ 0: TrecQrel('81_1', 'CAR_3add84966af079ed84e8b2fc412ad1dc27800127', 1, '0'), 9: TrecQrel('81_1', 'MARCO_1381086', 1, '0'), 40450: TrecQrel('105_9', 'MARCO_8757526', 0, '0'), }) self._test_qrels('trec-cast/v1/2020/judged', count=40451, items={ 0: TrecQrel('81_1', 'CAR_3add84966af079ed84e8b2fc412ad1dc27800127', 1, '0'), 9: TrecQrel('81_1', 'MARCO_1381086', 1, '0'), 40450: TrecQrel('105_9', 'MARCO_8757526', 0, '0'), }) self._test_qrels('trec-cast/v3/2022', count=41749, items={ 0: TrecQrel('132_1-1', 'KILT_1168284-2', 0, '0'), 9: TrecQrel('132_1-1', 'KILT_39457508-1', 0, '0'), 41748: TrecQrel('149_3-9', 'MARCO_56_987506629-2', 2, '0'), }) self._test_qrels('trec-cast/v2/2021', count=19334, items={ 0: 
TrecQrel('106_1', 'KILT_105219', 0, '0'), 9: TrecQrel('106_1', 'KILT_30271975', 0, '0'), 19333: TrecQrel('131_10', 'MARCO_D981403', 4, '0'), }) def test_scoreddocs(self): self._test_scoreddocs('trec-cast/v0/train', count=269000, items={ 0: GenericScoredDoc('1_1', 'MARCO_955948', -5.32579), 9: GenericScoredDoc('1_1', 'CAR_87772d4208721133d00d7d62f4eaaf164da5b4e3', -5.44505), 268999: GenericScoredDoc('30_7', 'WAPO_595c1be2ba9e3b1e66d552a174219c12-3', -7.07828), }) self._test_scoreddocs('trec-cast/v0/train/judged', count=120000, items={ 0: GenericScoredDoc('1_1', 'MARCO_955948', -5.32579), 9: GenericScoredDoc('1_1', 'CAR_87772d4208721133d00d7d62f4eaaf164da5b4e3', -5.44505), 119999: GenericScoredDoc('30_7', 'WAPO_595c1be2ba9e3b1e66d552a174219c12-3', -7.07828), }) self._test_scoreddocs('trec-cast/v1/2019', count=479000, items={ 0: GenericScoredDoc('31_1', 'MARCO_789620', -5.71312), 9: GenericScoredDoc('31_1', 'MARCO_291004', -5.88053), 478999: GenericScoredDoc('80_10', 'CAR_268dcb1c6bc4326f81500513e0ad9d11acb2a693', -5.23496), }) self._test_scoreddocs('trec-cast/v1/2019/judged', count=173000, items={ 0: GenericScoredDoc('31_1', 'MARCO_789620', -5.71312), 9: GenericScoredDoc('31_1', 'MARCO_291004', -5.88053), 172999: GenericScoredDoc('79_9', 'MARCO_1431776', -6.75024), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_fair.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel from ir_datasets.datasets.trec_fair import FairTrecDoc, FairTrec2022Doc, FairTrecQuery, FairTrecEvalQuery, FairTrec2022TrainQuery from .base import DatasetIntegrationTest class TestFairTrec(DatasetIntegrationTest): def test_docs(self): self._test_docs('trec-fair-2021', count=6280328, items={ 0: FairTrecDoc('12', 'Anarchism', re.compile('^\n\n\n\n\n\n\n\n\nAnarchism is a political philosophy and movement that is sceptical of authority and rejects.{46355}ance 
without government\n\n\nList of anarchist political ideologies\n\n\nList of books about anarchism\n\n\n\n$', flags=48), re.compile('^\\{\\{short description\\|Political philosophy and movement\\}\\}\n\\{\\{redirect2\\|Anarchist\\|Anarchists\\|other uses\\|.{96002}al movements\\]\\]\n\\[\\[Category:Political ideologies\\]\\]\n\\[\\[Category:Social theories\\]\\]\n\\[\\[Category:Socialism\\]\\]$', flags=48), 'https://en.wikipedia.org/wiki/Anarchism', 0.909, [], 'FA'), 9: FairTrecDoc('316', 'Academy Award for Best Production Design', re.compile("^\n\n\nThe Academy Award for Best Production Design recognizes achievement for art direction in film\\. Th.{829} Award for Best Production Design\n\n\nCritics' Choice Movie Award for Best Art Direction\n\n\n\nNotes\n\n\n\n\n$", flags=48), re.compile('^\\{\\{Use mdy dates\\|date=June 2013\\}\\}\n\\{\\{Infobox award\n\\| name = Academy Award for Best Production Des.{97494}\\]\n\\[\\[Category:Best Art Direction Academy Award winners\\|\\*\\]\\]\n\\[\\[Category:Awards for best art direction\\]\\]$', flags=48), 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Production_Design', 0.67862654, ['Northern America'], 'GA'), 6280327: FairTrecDoc('67277921', 'Oldřich I of Rosenberg', re.compile('^\n\n\n\nOldřich I of Rosenberg \\(died 4 March 1390\\) was the fourth son of the Peter I of Rosenberg and hi.{345}II, Duke of Austria\\. 
The dispute was eventually mediated by Czech King Wenceslas IV\\.\n\nCitations\n\n\n\n\n$', flags=48), re.compile('^\\{\\{inline\\|date=April 2021\\}\\}\n\\{\\{short description\\|14th\\-century Bohemian nobleman\\}\\}\n\\{\\{Infobox noble\\|type.{3160}gory:1390 deaths\\]\\]\n\\[\\[Category:14th\\-century Bohemian people\\]\\]\n\\[\\[Category:Medieval Bohemian nobility\\]\\]$', flags=48), 'https://en.wikipedia.org/wiki/Oldřich_I_of_Rosenberg', 0.55746955, ['Europe'], 'B'), }) self._test_docs('trec-fair/2022', count=6475537, items={ 0: FairTrec2022Doc('12148915', 'Keith Osik', re.compile('^Keith Richard Osik \\(born October 22, 1968\\), is a former Major League Baseball catcher who played in .{935}ers\nAlbuquerque Isotopes players\nNew Orleans Zephyrs players\nFarmingdale State Rams baseball coaches$', flags=48), 'https://en.wikipedia.org/wiki/Keith_Osik', 0.30334318, 'Stub', ['United States of America'], ['Northern America'], {}, {}, ['male'], ['athlete'], [1968], 2, 0.20211963, 'K', '2007-07-08', 'e-k', 'Man', '2007-2011', '20th century', 'Medium-Low', '2-4 languages'), 9: FairTrec2022Doc('69605138', 'Chang Gum-chol', re.compile('^Chang Gum\\-chol is a North Korean former footballer\\. 
He represented North Korea on at least nine occa.{209}rea international footballers\nAssociation football midfielders\nYear of birth missing \\(living people\\)$', flags=48), 'https://en.wikipedia.org/wiki/Chang_Gum-chol', 0.29654545, 'Stub', [], [], {}, {}, ['male'], ['athlete'], [], 1, 0.024254356, 'C', '2021-12-26', 'a-d', 'Man', '2017-2022', 'Unknown', 'Low', 'English only'), 6475536: FairTrec2022Doc('66716319', 'Royal & the Serpent', re.compile('^Ryan Santiago \\(born May 25, 1994\\), known professionally as Royal \\& the Serpent, is an American singe.{1684}indie pop musicians\nElectropop musicians\nAmerican women in electronic music\nAtlantic Records artists$', flags=48), 'https://en.wikipedia.org/wiki/Royal_%26_the_Serpent', 0.62231106, 'C', ['United States of America'], ['Northern America'], {'UNK': 5, 'United States of America': 24, 'Canada': 1}, {'UNK': 5, 'Northern America': 25}, ['female'], ['musician'], [1994], 1, 0.7524933, 'R', '2021-02-11', 'l-r', 'Woman', '2017-2022', '20th century', 'High', 'English only'), }) def test_queries(self): self._test_queries('trec-fair/2021/train', count=57, items={ 0: FairTrecQuery('1', 'Agriculture', ['agriculture', 'crops', 'livestock', 'forests', 'farming'], 'This WikiProject strives to develop and improve articles at Wikipedia related to crop production, livestock management, aquaculture, dairy farming and forest management. 
The project also covers related areas, including both governmental and NGO regulatory agencies, agribusiness, support agencies such as 4H, agricultural products including fertilizers and herbicides, pest management, veterinary medicine and farming equipment and facilities.', 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Agriculture'), 9: FairTrecQuery('10', 'Buddhism', ['buddhism', 'buddha', 'buddhist', 'buddhist temple', 'gautama buddha', 'monk'], 'WikiProject Buddhism is a group of people dedicated to improving Buddhism-related contents in Wikipedia.', 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Buddhism'), 56: FairTrecQuery('57', 'Nigeria', ['nigeria', 'nigerians', 'nigerian languages', 'nigerian history'], 'This WikiProject aims to help coordinate efforts to improve and maintain pages related to: Nigeria, the history of Nigeria, languages of Nigeria, Nigerians, Local government areas of Nigeria', 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Nigeria'), }) self._test_queries('trec-fair/2021/eval', count=49, items={ 0: FairTrecEvalQuery('101', 'Mathematicians', ['mathematician', 'arithmetician', 'trigonometrician', 'geometer', 'algebraist', 'statistician', 'geometrician', 'number theorist'], 'WikiProject Mathematicians aims to improve Wikipedia coverage of all articles on persons that have notable achievements in the field of mathematics.'), 9: FairTrecEvalQuery('110', 'Clean Energy', ['clean energy', 'sustainable energy', 'renewable energy', 'emissions', 'carbon capture', 'carbon storage'], 'Wikiproject Clean Energy aims to improve Wikipedia coverage of all aspects relating to sustainable and renewable energy sources, such as wind, hydro, solar and geothermal energy.'), 48: FairTrecEvalQuery('150', 'Street Art', ['street art', 'street performance', 'graffiti', 'busking'], 'This WikiProject aims to improve the coverage and quality of Wikipedia articles relating to street art and its creators. 
We are interested in both static street art, such as graffiti and murals, and dynamic street performances and theater. Both authorized and unauthorized art is in our scope.'), }) self._test_queries('trec-fair/2022/train', count=50, items={ 0: FairTrec2022TrainQuery('84', 'Agriculture', 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Agriculture'), 9: FairTrec2022TrainQuery('475', 'Business', 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Business'), 49: FairTrec2022TrainQuery('2859', 'Women', 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Women'), }) def test_qrels(self): self._test_qrels('trec-fair/2021/train', count=2185446, items={ 0: TrecQrel("1", "572", 1, "0"), 9: TrecQrel("1", "4514", 1, "0"), 2185445: TrecQrel("57", "67253426", 1, "0"), }) self._test_qrels('trec-fair/2021/eval', count=13757, items={ 0: TrecQrel('101', '915', 1, '0'), 9: TrecQrel('101', '80143', 1, '0'), 13756: TrecQrel('150', '65355704', 1, '0'), }) self._test_qrels('trec-fair/2022/train', count=2088306, items={ 0: TrecQrel('84', '572', 1, '0'), 9: TrecQrel('84', '4487', 1, '0'), 2088305: TrecQrel('2859', '69891491', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_mandarin.py ================================================ import re import unittest from ir_datasets.datasets.trec_mandarin import TrecMandarinQuery from ir_datasets.formats import TrecQrel, TrecDoc from .base import DatasetIntegrationTest class TestTrecMandarin(DatasetIntegrationTest): def test_trec_mandarin_docs(self): self._test_docs('trec-mandarin', count=164789, items={ 0: TrecDoc('pd9312-1', re.compile('^\n\\#1 1993年12月01日\\#2 1\\#3 要闻\\#4 结束美国古巴巴西葡萄牙之行\\#4 江泽民主席回到北京\\#4 李鹏乔石刘华清荣毅仁等在人民大会堂迎接\\#4 祝贺江主席出访达到和平谅解合作互利目的\\#5 吴.{669}解和友谊。”他表示相信,在双方共同努力下,中葡两国友好合作关系必将发展到一个新的水平。\n\n陪同江泽民主席出访的国务院副总理兼外交部长钱其琛,特别助理曾庆红、杨德中,外交部副部长刘华秋等也同机离开。\n\n$', flags=48), re.compile('^\\#1 1993年12月01日\\#2 1\\#3 要闻\\#4 结束美国古巴巴西葡萄牙之行\\#4 江泽民主席回到北京\\#4 
李鹏乔石刘华清荣毅仁等在人民大会堂迎接\\#4 祝贺江主席出访达到和平谅解合作互利目的\\#.{690}表示相信,在双方共同努力下,中葡两国友好合作关系必将发展到一个新的水平。\n\n陪同江泽民主席出访的国务院副总理兼外交部长钱其琛,特别助理曾庆红、杨德中,外交部副部长刘华秋等也同机离开。\n\n$', flags=48)), 9: TrecDoc('pd9312-10', '\n\n�#1 1993年12月01日#2 1#3 要闻#4 江泽民主席返回北京李鹏、乔石、刘华清、荣毅仁等在人民大会堂迎接#5 刘建国\n\n\n江泽民主席返回北京,李鹏、乔石、刘华清、荣毅仁等在北京人民大会堂迎接。\n\n新华社记者刘建国摄\n\n', '\n\n�#1 1993年12月01日#2 1#3 要闻#4 江泽民主席返回北京李鹏、乔石、刘华清、荣毅仁等在人民大会堂迎接#5 刘建国\n\n\n江泽民主席返回北京,李鹏、乔石、刘华清、荣毅仁等在北京人民大会堂迎接。\n\n新华社记者刘建国摄\n\n'), 164788: TrecDoc('CB049030-BFW-744-737', re.compile('^\n\n CB049030\\.BFW \\( 744\\) \n 1994\\-05\\-01 00:40:54 \\(5\\) \n\n 李鹏参加内蒙古庆祝“五一”国际劳动节活动 \n\n 新华社呼和浩特四月三十日电(记.{654}节文艺晚会,观看了具有浓郁民族风格的精彩文艺演出。 \n 晚会结束时,李鹏总理走上舞台,穿上蒙古服装,与演员们合影。 \n\n\n 司马义·艾买提和叶青、姚振炎、韩杼滨等也出席了文艺晚会。 \n (完) \n\n\n$', flags=48), re.compile('^\n CB049030\\.BFW \\( 744\\) \n 1994\\-05\\-01 00:40:54 \\(5\\) \n\n\n

\n

\n 司马义·艾买提和叶青、姚振炎、韩杼滨等也出席了文艺晚会。 \n (完) \n

\n
\n$', flags=48)), }) def test_trec_mandarin_queries(self): self._test_queries('trec-mandarin/trec5', count=28, items={ 0: TrecMandarinQuery(query_id='1', title_en='U.S. to separate the most-favored-nation status from human rights issue in China.', title_zh='美国决定将中国大陆的人权状况与其是否给予中共最惠国待\u3000遇分离.', description_en='most-favored nation status, human rights in\nChina, economic sanctions, separate, untie', description_zh='最惠国待遇,中国,人权,经济制裁,分离,脱钩', narrative_en='A relevant document should describe why the U.S.\nseparates most-favored nation status from \nhuman rights. A relevant document should \nalso mention why China opposes U.S. attempts\nto tie human rights to most-favored-nation\nstatus.', narrative_zh='相关文件必须提到美国为何将最惠国待遇与人权分离;\n\u3000相关文件也必须提到中共为什么反对美国将人权与最\n\u3000\u3000\u3000惠国待遇相提并论.'), 9: TrecMandarinQuery(query_id='10', title_en='Border Trade in Xinjiang', title_zh='新疆的边境贸易', description_en='Xinjiang, Uigur, border trade, market,', description_zh='新疆,维吾尔,边境贸易,边贸,市场', narrative_en='A relevant document should contain information on\nthe trading relationship between Xinjiang, China \nand its neighboring nations, including treaties \nsigned by China and former Soviet Republics\nthat are bordering China and foreign investment. \nIf a document contains information on how China\ndevelops Xinjiang, it is not relevant.', narrative_zh='相关文件必须包括中国新疆与其邻近国家的贸易关系,此关系包括\n中国与前苏联共和国之间所签署的贸易条约以及彼此间的外贸投\n资.如果文件只论及中国如何建设发展新疆,则属非相关文件.'), 27: TrecMandarinQuery(query_id='28', title_en='The Spread of Cellular Phones in China', title_zh='移动电话在中国的成长', description_en='digital, cellular, cellular phone, net, automatic roaming', description_zh='数字,蜂窝式,移动电话,网络,自动漫游', narrative_en='A relevant document contains the following kinds of information: \nthe number of cellular phone users, area coverage, or how PSDN \nis implemented for national cellular communication. 
A non-relevant \ndocument includes reports on commercial manufacturers or brand name \ncellular phones.', narrative_zh='相关文件应包括下列信息: 中国移动电话用户数,\n 覆盖地区, 中国如何以数据分组交换网覆盖\n 全国移动电话的通讯. 不相关文件则包括 有关\n 制造移动电话厂商的报道, 以及移动电话的\n 厂牌.') }) self._test_queries('trec-mandarin/trec6', count=26, items={ 0: TrecMandarinQuery(query_id='29', title_en='Building the Information Super Highway', title_zh='信息高速公路的建设', description_en='Information Super Highway, building', description_zh='信息高速公路,建设', narrative_en='A relevant document should discuss building the Information Super Highway, including any technical problems, problems with the information infrastructure, or plans for use of the Internet by developed or developing countries.', narrative_zh='相关文件应提到信息高速公路的建设,包括任何技术上的,或与信息基础设施有关的问题,以及有关发达国家或发展中国家对国际网络的应用计划.'), 9: TrecMandarinQuery(query_id='38', title_en='Protection of Wildlife in China', title_zh='中国野生动物保护形势', description_en='Protection of Wildlife, Legislation Protecting Wildlife, Associations for the Protection of Wildlife, rare and precious animals, endangered species', description_zh='野生动物保护,《野生动物保护法》,野生动物保护协会,珍稀动物,濒危动物', narrative_en='A relevant document should discuss protection of endangered species in China. Relevant documents include the following information: (1) "Legislation protecting endangered species", (2)\u3000rare and precious animals, (3) hunting and selling wild animals, (4) adopting measures to rescue rare animals, (5) market survelliance work, or (6) establishing preservation grounds for endangered species.', narrative_zh='相关文件应提到中国野生动物保护形势.相关文件包括下列信息:(一)《野生动物保护法》,(二)珍稀动物,(三)捕猎和销售野生动物,(四)采取措施抢救珍稀动物,(五)市场管制工作,或(六)建设濒危动物的保护区.'), 25: TrecMandarinQuery(query_id='54', title_en="China's Reaction to U.S. Sale of F-16 Fighters to Taiwan", title_zh='中国关于美国政府向台湾出售F-16战斗机的反应', description_en='China, U.S., Taiwan, F-16 fighter, sale', description_zh='中国,美国,台湾,F—16战斗机,出售', narrative_en='A relevant document should discuss the resolution concerning U.S. 
weapon sales to Taiwan in the Sino-American "8-17" Joint Communique and why the Chinese consider President Bush\'s decision to sell F-16 fighters to Taiwan to be in violation of the spirit of the Sino-American "8-17" Joint Communique and to be damaging to Sino-American relations.', narrative_zh='相关文件应提到中美“八·一七”联合公报中对美国向台湾出售武器之决定,以及为何中国认为布什总统决定售予台湾F—16战斗机是违反中美“八·一七”联合公报之精神并损害中美关系.') }) def test_trec_mandarin_qrels(self): self._test_qrels('trec-mandarin/trec5', count=15588, items={ 0: TrecQrel(query_id='1', doc_id='CB001007-BFJ-588-408', relevance=0, iteration='0'), 9: TrecQrel(query_id='1', doc_id='CB006019-BFJ-2117-506', relevance=0, iteration='0'), 15587: TrecQrel(query_id='28', doc_id='pd9312-91', relevance=0, iteration='0') }) self._test_qrels('trec-mandarin/trec6', count=9236, items={ 0: TrecQrel(query_id='29', doc_id='CB001004-BFW-1143-212', relevance=1, iteration='0'), 9: TrecQrel(query_id='29', doc_id='CB002028-BFW-1086-1035', relevance=0, iteration='0'), 9235: TrecQrel(query_id='54', doc_id='pd9312-1824', relevance=0, iteration='0') }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_robust04.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel, TrecDoc, TrecQuery from .base import DatasetIntegrationTest class TestTrecRobust04(DatasetIntegrationTest): def test_trec_robust04_docs(self): self._test_docs('trec-robust04', count=528155, items={ 0: TrecDoc('FBIS3-1', re.compile('^\n\nPOLITICIANS, PARTY PREFERENCES \n\n Summary: Newspapers in the Former Yugoslav Republic of \n M.{6912}SE CALL CHIEF, \nBALKANS BRANCH AT \\(703\\) 733\\-6481\\) \n\nELAG/25 February/POLCHF/EED/DEW 28/2023Z FEB \n\n\n$', flags=48), re.compile('^\nPOLITICIANS, PARTY PREFERENCES \n\n Summary: Newspapers in the Former Yugoslav Republic of .{6924} CHIEF, \nBALKANS BRANCH AT \\(703\\) 733\\-6481\\) \n\nELAG/25 February/POLCHF/EED/DEW 28/2023Z FEB 
\n\n\n$', flags=48)), 9: TrecDoc('FBIS3-10', re.compile('^\n\nHanoi Finds New Outlet for Surplus Labor \n\n Judging by a 1 March VNA report, Hanoi has found.{787}TS, PLEASE CALL CHIEF, \nASIA DIVISION ANALYSIS TEAM, \\(703\\) 733\\-6534\\.\\) \nEAG/BIETZ/ta 07/2051z mar \n\n\n$', flags=48), re.compile('^\nHanoi Finds New Outlet for Surplus Labor \n\n Judging by a 1 March VNA report, Hanoi has .{799}ASE CALL CHIEF, \nASIA DIVISION ANALYSIS TEAM, \\(703\\) 733\\-6534\\.\\) \nEAG/BIETZ/ta 07/2051z mar \n\n\n$', flags=48)), 528154: TrecDoc('LA123190-0134', re.compile("^\n\n\nDecember 31, 1990, Monday, P\\.M\\. Final \n\n\n\n\nSHORT TAKES; \n\n\nTAMMY SEES COUNTRY'S REBIRTH \n\n\n\n\nTamm.{547}icky Van Shelton, \nClint Black, Patty Loveless and Garth Brooks as among those making an impact\\. \n\n\n$", flags=48), re.compile('^\n

\nDecember 31, 1990, Monday, P\\.M\\. Final \n

\n
\n\n

\nSHORT TAKES; \n

\n

.{642}elton, \nClint Black, Patty Loveless and Garth Brooks as among those making an impact\\. \n

\n\n$', flags=48)), }) def test_trec_robust04_queries(self): self._test_queries('trec-robust04', count=250, items={ 0: TrecQuery(query_id='301', title='International Organized Crime', description='Identify organizations that participate in international criminal\nactivity, the activity, and, if possible, collaborating organizations\nand the countries involved.', narrative='A relevant document must as a minimum identify the organization and the\ntype of illegal activity (e.g., Columbian cartel exporting cocaine).\nVague references to international drug trade without identification of\nthe organization(s) involved would not be relevant.'), 9: TrecQuery(query_id='310', title='Radio Waves and Brain Cancer', description='Evidence that radio waves from radio towers or car phones affect\nbrain cancer occurrence.', narrative='Persons living near radio towers and more recently persons using\ncar phones have been diagnosed with brain cancer. The argument \nrages regarding the direct association of one with the other.\nThe incidence of cancer among the groups cited is considered, by\nsome, to be higher than that found in the normal population. A \nrelevant document includes any experiment with animals, statistical \nstudy, articles, news items which report on the incidence of brain \ncancer being higher/lower/same as those persons who live near a \nradio tower and those using car phones as compared to those in the \ngeneral population.'), 249: TrecQuery(query_id='700', title='gasoline tax U.S.', description='What are the arguments for and against an increase in gasoline\ntaxes in the U.S.?', narrative='Relevant documents present reasons for or against raising gasoline taxes\nin the U.S. 
Documents discussing rises or decreases in the price of\ngasoline are not relevant.') }) self._test_queries('trec-robust04/fold1', count=50, items={ 0: TrecQuery(query_id='302', title='Poliomyelitis and Post-Polio', description='Is the disease of Poliomyelitis (polio) under control in the\nworld?', narrative='Relevant documents should contain data or outbreaks of the \npolio disease (large or small scale), medical protection \nagainst the disease, reports on what has been labeled as \n"post-polio" problems. Of interest would be location of \nthe cases, how severe, as well as what is being done in \nthe "post-polio" area.'), 9: TrecQuery(query_id='341', title='Airport Security', description='A relevant document would discuss how effective\ngovernment orders to better scrutinize passengers\nand luggage on international flights and to step\nup screening of all carry-on baggage has been.', narrative='A relevant document would contain reports on what\nnew steps airports worldwide have taken to better \nscrutinize passengers and their luggage on \ninternational flights and to step up screening of\nall carry-on baggage. With the increase in \ninternational terrorism and in the wake of the\nTWA Flight 800 disaster, articles on airport \nsecurity relating in particular to additional\nsteps taken by airports to increase flight safety\nwould be relevant. The mere mention of enhanced \nsecurity does not constitute relevance. Additional\nsteps refer to something beyond just passenger\nand carry-on screening using the normal methods.\nExamples of new steps would be additional personnel, \nsophisticated monitoring and screening devices, \nand extraordinary measures to check luggage in the \nbaggage compartment.'), 49: TrecQuery(query_id='700', title='gasoline tax U.S.', description='What are the arguments for and against an increase in gasoline\ntaxes in the U.S.?', narrative='Relevant documents present reasons for or against raising gasoline taxes\nin the U.S. 
Documents discussing rises or decreases in the price of\ngasoline are not relevant.') }) self._test_queries('trec-robust04/fold2', count=50, items={ 0: TrecQuery(query_id='301', title='International Organized Crime', description='Identify organizations that participate in international criminal\nactivity, the activity, and, if possible, collaborating organizations\nand the countries involved.', narrative='A relevant document must as a minimum identify the organization and the\ntype of illegal activity (e.g., Columbian cartel exporting cocaine).\nVague references to international drug trade without identification of\nthe organization(s) involved would not be relevant.'), 9: TrecQuery(query_id='349', title='Metabolism', description='Document will discuss the chemical reactions \nnecessary to keep living cells healthy and/or\nproducing energy.', narrative='A relevant document will contain specific information\non the catabolic and anabolic reactions of the metabolic\nprocess. Relevant information includes, but is not \nlimited to, the reactions occurring in metabolism, \nbiochemical processes (Glycolysis or Krebs cycle for\nproduction of energy), and disorders associated with \nthe metabolic rate.'), 49: TrecQuery(query_id='698', title='literacy rates Africa', description='What are literacy rates in African countries?', narrative='A relevant document will contain information about the\nliteracy rate in an African country.\nGeneral education levels that do not specifically include literacy rates\nare not relevant.') }) self._test_queries('trec-robust04/fold3', count=50, items={ 0: TrecQuery(query_id='306', title='African Civilian Deaths', description='How many civilian non-combatants have been killed in \nthe various civil wars in Africa?', narrative='A relevant document will contain specific casualty \ninformation for a given area, country, or region. 
\nIt will cite numbers of civilian deaths caused \ndirectly or indirectly by armed conflict.'), 9: TrecQuery(query_id='354', title='journalist risks', description='Identify instances where a journalist has been put at risk (e.g.,\nkilled, arrested or taken hostage) in the performance of his work.', narrative='Any document identifying an instance where a journalist or \ncorrespondent has been killed, arrested or taken hostage in the \nperformance of his work is relevant.'), 49: TrecQuery(query_id='693', title='newspapers electronic media', description='What has been the effect of the electronic media on the newspaper\nindustry?', narrative='Relevant documents must explicitly attribute effects to the electronic\nmedia: information about declining readership is irrelevant unless\nit attributes the cause to the electronic media.') }) self._test_queries('trec-robust04/fold4', count=50, items={ 0: TrecQuery(query_id='320', title='Undersea Fiber Optic Cable', description="Fiber optic link around the globe (Flag) will be\nthe world's longest undersea fiber optic cable.\nWho's involved and how extensive is the technology\non this system. What problems exist?", narrative='Relevant documents will reference companies involved\nin building the system or the technology needed for\nsuch an endeavor. Of relevance also would be information\non the link up points of FLAG or landing sites or \ninterconnection with other telecommunication cables.\nRelevant documents may reference any regulatory problems\nwith the system once constructed. 
A non-relevant \ndocument would contain information on other fiber optic\nsystems currently in place.'), 9: TrecQuery(query_id='355', title='ocean remote sensing', description='Identify documents discussing the development and application of\nspaceborne ocean remote sensing.', narrative='Documents discussing the development and application of spaceborne \nocean remote sensing in oceanography, seabed prospecting and \nmining, or any marine-science activity are relevant. Documents \nthat discuss the application of satellite remote sensing in \ngeography, agriculture, forestry, mining and mineral prospecting \nor any land-bound science are not relevant, nor are references \nto international marketing or promotional advertizing of any \nremote-sensing technology. Synthetic aperture radar (SAR) \nemployed in ocean remote sensing is relevant.'), 49: TrecQuery(query_id='697', title='air traffic controller', description='What are working conditions and pay for U.S. air traffic controllers?', narrative='Relevant documents tell something about working conditions\nor pay for American controllers. Documents about foreign\ncontrollers or an individual controller are not relevant.') }) self._test_queries('trec-robust04/fold5', count=50, items={ 0: TrecQuery(query_id='304', title='Endangered Species (Mammals)', description='Compile a list of mammals that are considered to be endangered,\nidentify their habitat and, if possible, specify what threatens them.', narrative='Any document identifying a mammal as endangered is relevant. \nStatements of authorities disputing the endangered status would also\nbe relevant. A document containing information on habitat and\npopulations of a mammal identified elsewhere as endangered would also\nbe relevant even if the document at hand did not identify the species\nas endangered. 
Generalized statements about endangered species \nwithout reference to specific mammals would not be relevant.'), 9: TrecQuery(query_id='339', title="Alzheimer's Drug Treatment", description="What drugs are being used in the treatment of \nAlzheimer's Disease and how successful are they?", narrative="A relevant document should name a drug used in \nthe treatment of Alzheimer's Disease and also \nits manufacturer, and should give some indication \nof the drug's success or failure."), 49: TrecQuery(query_id='699', title='term limits', description='What are the pros and cons of term limits?', narrative='Relevant documents reflect an opinion on the value of term limits\nwith accompanying reason(s). Documents that cite the status of term\nlimit legislation or opinions on the issue sans reasons for the opinion\nare not relevant.') }) def test_trec_robust04_qrels(self): self._test_qrels('trec-robust04', count=311410, items={ 0: TrecQrel(query_id='301', doc_id='FBIS3-10082', relevance=1, iteration='0'), 9: TrecQrel(query_id='301', doc_id='FBIS3-10635', relevance=0, iteration='0'), 311409: TrecQrel(query_id='700', doc_id='LA123090-0137', relevance=0, iteration='0') }) self._test_qrels('trec-robust04/fold1', count=62789, items={ 0: TrecQrel(query_id='302', doc_id='FBIS3-10615', relevance=0, iteration='0'), 9: TrecQrel(query_id='302', doc_id='FBIS3-22470', relevance=0, iteration='0'), 62788: TrecQrel(query_id='700', doc_id='LA123090-0137', relevance=0, iteration='0') }) self._test_qrels('trec-robust04/fold2', count=63917, items={ 0: TrecQrel(query_id='301', doc_id='FBIS3-10082', relevance=1, iteration='0'), 9: TrecQrel(query_id='301', doc_id='FBIS3-10635', relevance=0, iteration='0'), 63916: TrecQrel(query_id='698', doc_id='LA123190-0100', relevance=0, iteration='0') }) self._test_qrels('trec-robust04/fold3', count=62901, items={ 0: TrecQrel(query_id='306', doc_id='FBIS3-1010', relevance=0, iteration='0'), 9: TrecQrel(query_id='306', doc_id='FBIS3-13331', relevance=0, 
iteration='0'), 62900: TrecQrel(query_id='693', doc_id='LA122789-0115', relevance=0, iteration='0') }) self._test_qrels('trec-robust04/fold4', count=57962, items={ 0: TrecQrel(query_id='320', doc_id='FBIS3-10291', relevance=0, iteration='0'), 9: TrecQrel(query_id='320', doc_id='FBIS3-20327', relevance=0, iteration='0'), 57961: TrecQrel(query_id='697', doc_id='LA122589-0068', relevance=0, iteration='0') }) self._test_qrels('trec-robust04/fold5', count=63841, items={ 0: TrecQrel(query_id='304', doc_id='FBIS3-1584', relevance=0, iteration='0'), 9: TrecQrel(query_id='304', doc_id='FBIS3-37947', relevance=0, iteration='0'), 63840: TrecQrel(query_id='699', doc_id='LA123190-0008', relevance=0, iteration='0') }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_spanish.py ================================================ import re import unittest from ir_datasets.datasets.trec_spanish import TrecSpanish3Query, TrecSpanish4Query from ir_datasets.formats import TrecQrel, TrecDoc from .base import DatasetIntegrationTest class TestTrecSpanish(DatasetIntegrationTest): def test_trec_spanish_docs(self): self._test_docs('trec-spanish', count=120605, items={ 0: TrecDoc('AF940512-0001', '\n\n1948 GMT 94/05/12\n', '\n1948 GMT 94/05/12\n'), 9: TrecDoc('AF940512-0010', '\n\n1956 GMT 94/05/12\n', '\n1956 GMT 94/05/12\n'), 120604: TrecDoc('SP94-0202972', re.compile('^\n\n\nFirmarían Rusia y EU\n\n\nel tratado START II\n\nFuncionario de EU dice desconocer acuerdo final sobre.{4662}SARME\n REUNION\n SECCION INTERNACIONALES\n FIRMA\n PERSPECTIVA\n POLITICA\n CHINA\n ARMA ATOMICA\n\n\n$', flags=48), re.compile('^\n\nFirmarían Rusia y EU\n\n\nel tratado START II\n\nFuncionario de EU dice desc.{4695} REUNION\n SECCION INTERNACIONALES\n FIRMA\n PERSPECTIVA\n POLITICA\n CHINA\n ARMA ATOMICA\n\n\n$', flags=48)), }) def test_trec_spanish_queries(self): self._test_queries('trec-spanish/trec3', count=25, items={ 0: 
TrecSpanish3Query(query_id='1', title_es='Oposición Mexicana al TLC Mexican Opposition to NAFTA ', title_en='Documento contendrá información sobre oposición en México al tratado de libre comercio norteamericano.', description_es=' Document will contain information on opposition in Mexico to the North American Free Trade Agreement. ', description_en='Oposición al TLC en el gobierno mexicano, en medios comerciales, y en el sector privado. Dificultará esta oposición la implementación plena del TLC? Documento debe incluir nombres de compañías opuestas al TLC con razones y sectores específicos económicos y empresariales de oposición.', narrative_es=' Opposition to NAFTA in the Mexican Government, in commercial entities, and in the private sector. Will this opposition make full implementation of NAFTA impossible? The document should include names of companies, and specific economic and business sectors of opposition, including reasons for their opposition. ', narrative_en=''), 9: TrecSpanish3Query(query_id='10', title_es='México es importante país de tránsito en la guerra antinarcótica Mexico is an important transit country in the war against narcotics ', title_en='México es importante para los narcotraficantes de Colombia y Perú como punto de entrada para los Estados Unidos. Como se usan México como país de tránsito?', description_es=' Mexico is important to the narcotraffickers of Colombia and Peru as an entry point for the U.S. How is Mexico used as a transit country? ', description_en='Documento debe indicar métodos empleados por narcotraficantes para utilizar México como país de tránsito para hacer llegar las drogas ilegales a los Estados Unidos. Debe incluir ejemplos y locales específicos y medidas para impedir esta actividad.', narrative_es=' The document should indicate methods used by narcotraffickers to utilize Mexico as a transit country for getting drugs into the U.S. It should include specific examples and locations and measures for stopping this activity. 
', narrative_en=''), 24: TrecSpanish3Query(query_id='25', title_es="Programa de Privatización de Empresas Públicas Mexicanas Program for Privatization of Mexico's Public Enterprises ", title_en='El programa mexicano de privatización es considerado uno de los más exitosos en América Latina. El documento debe describir el proceso de privatización de empresas públicas en México.', description_es=" Mexico's privatization program is considered one of the most successful in Latin America. The document should describe the process of privatization of public companies in Mexico. ", description_en='Para ser relevante el documento debe mencionar la empresa pública mexicana privatizada, incluyendo resultados y pronósticos de otros sectores que pueden ser privatizados.', narrative_es=' To be relevant, the document should mention the Mexican public enterprise that has been privatized, including results and predictions of other sectors that might be privatized. ', narrative_en='') }) self._test_queries('trec-spanish/trec4', count=25, items={ 0: TrecSpanish4Query(query_id='26', description_es1='Indicaciones de las relaciones económicas y comerciales de México con los paises europeos.', description_en1='Indications of Mexican economic and trade relations with European countries.', description_es2='Indicaciones de las relaciones económicas y comerciales de México con europa.', description_en2=' Indications of Mexican economic and trade relations with Europe. '), 9: TrecSpanish4Query(query_id='35', description_es1='Indicaciones de los potenciales y debilidades de las fuerzas aéreas militares de México.', description_en1='Indications of potentials and weaknesses of the Mexican Air Force.', description_es2='Indicaciones de fortalezas y debilidades de las fuerzas aéreas militares de México.', description_en2=' Indications of strengths and weaknesses of the Mexican Air Force. 
'), 24: TrecSpanish4Query(query_id='50', description_es1='La fabricación en México de joyas de plata y oro.', description_en1=' Manufacture of gold and silver jewelry in Mexico. ', description_es2='', description_en2='') }) def test_trec_spanish_qrels(self): self._test_qrels('trec-spanish/trec3', count=19005, items={ 0: TrecQrel(query_id='1', doc_id='SP94-0000082', relevance=1, iteration='0'), 9: TrecQrel(query_id='1', doc_id='SP94-0001385', relevance=0, iteration='0'), 19004: TrecQrel(query_id='25', doc_id='SP94-0202950', relevance=1, iteration='0') }) self._test_qrels('trec-spanish/trec4', count=13109, items={ 0: TrecQrel(query_id='26', doc_id='SP94-0000054', relevance=1, iteration='0'), 9: TrecQrel(query_id='26', doc_id='SP94-0000700', relevance=0, iteration='0'), 13108: TrecQrel(query_id='50', doc_id='SP94-0202879', relevance=0, iteration='0') }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_tot.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc, TipOfTheTongueQuery from .base import DatasetIntegrationTest import ir_datasets class TestTipOfTheTongue(DatasetIntegrationTest): def test_tip_of_the_tongue_docs(self): self._test_docs('trec-tot/2023', count=231852, items={ 0: TipOfTheTongueDoc('330', 'Actrius', 'Q2823770', [['Q11424', 'film']], re.compile('^Actresses \\(Catalan: Actrius\\) is a 1997 Catalan language Spanish drama film produced and directed by .{1648}ical work is true to the original, but does not stray far from a theatrical rendering of the story\\."$', flags=48), {'abstract': 'Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play "E.R." by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. 
The film was produced in 1996. ', 'synopsis': "Synopsis.\nIn order to prepare herself to play a role commemorating the life of legendary actress Empar Ribera, young actress (Mercè Pons) interviews three established actresses who had been the Ribera's pupils: the international diva Glòria Marc (Núria Espert), the television star Assumpta Roca (Rosa Maria Sardà), and dubbing director Maria Caminal (Anna Lizaran).", 'recognition': 'Recognition.\nScreenings.\n"Actrius" screened in 2001 at the Grauman\'s Egyptian Theatre in an American Cinematheque retrospective of the works of its director. The film had first screened at the same location in 1998. It was also shown at the 1997 Stockholm International Film Festival.\nReception.\nIn "Movie - Film - Review", Christopher Tookey wrote that though the actresses were "competent in roles that may have some reference to their own careers", the film "is visually unimaginative, never escapes its stage origins, and is almost totally lacking in revelation or surprising incident". Noting that there were "occasional, refreshing moments of intergenerational bitchiness", they did not "justify comparisons to "All About Eve"", and were "insufficiently different to deserve critical parallels with "Rashomon"". He also wrote that "The Guardian" called the film a "slow, stuffy chamber-piece", and that "The Evening Standard" stated the film\'s "best moments exhibit the bitchy tantrums seething beneath the threesome\'s composed veneers". MRQE wrote "This cinematic adaptation of a theatrical work is true to the original, but does not stray far from a theatrical rendering of the story."', 'screenings': 'Screenings.\n"Actrius" screened in 2001 at the Grauman\'s Egyptian Theatre in an American Cinematheque retrospective of the works of its director. The film had first screened at the same location in 1998. 
It was also shown at the 1997 Stockholm International Film Festival.', 'reception': 'Reception.\nIn "Movie - Film - Review", Christopher Tookey wrote that though the actresses were "competent in roles that may have some reference to their own careers", the film "is visually unimaginative, never escapes its stage origins, and is almost totally lacking in revelation or surprising incident". Noting that there were "occasional, refreshing moments of intergenerational bitchiness", they did not "justify comparisons to "All About Eve"", and were "insufficiently different to deserve critical parallels with "Rashomon"". He also wrote that "The Guardian" called the film a "slow, stuffy chamber-piece", and that "The Evening Standard" stated the film\'s "best moments exhibit the bitchy tantrums seething beneath the threesome\'s composed veneers". MRQE wrote "This cinematic adaptation of a theatrical work is true to the original, but does not stray far from a theatrical rendering of the story."'}, [{'name': 'film', 'params': {'name': 'Actresses', 'image': 'Actrius film poster.jpg', 'caption': 'Catalan language film poster', 'native_name': "([[Catalan language|Catalan]]: '''''Actrius''''')", 'director': '[[Ventura Pons]]', 'producer': 'Ventura Pons', 'writer': '[[Josep Maria Benet i Jornet]]', 'screenplay': 'Ventura Pons', 'based_on': "{{based on|(stage play) ''E.R.''|Josep Maria Benet i Jornet}}", 'starring': '{{ubl|[[Núria Espert]]|[[Rosa Maria Sardà]]|[[Anna Lizaran]]|[[Mercè Pons]]}}', 'narrator': '', 'music': 'Carles Cases', 'cinematography': 'Tomàs Pladevall', 'editing': 'Pere Abadal', 'production_companies': '{{ubl|[[Canal+|Canal+ España]]|Els Films de la Rambla S.A.|[[Generalitat de Catalunya|Generalitat de Catalunya - Departament de Cultura]]|[[Televisión Española]]}}', 'distributor': '[[Buena Vista International]]', 'released': '{{film date|df=yes|1997|1|17|[[Spain]]}}', 'runtime': '100 minutes', 'country': 'Spain', 'language': 'Catalan', 'gross': ''}}]), 9: 
TipOfTheTongueDoc('3837', 'Blazing Saddles', 'Q957323', [['Q11424', 'film']], re.compile('^Blazing Saddles is a 1974 American satirical western black comedy film directed by Mel Brooks, who a.{16434}tion DVD in 2004 and a Blu\\-ray version in 2006\\. A 40th anniversary Blu\\-ray set was released in 2014\\.$', flags=48), {'abstract': 'Blazing Saddles is a 1974 American satirical western black comedy film directed by Mel Brooks, who also wrote the screenplay with Andrew Bergman, Richard Pryor, Norman Steinberg, and Alan Uger. The film stars Cleavon Little and Gene Wilder. The film received generally positive reviews from critics and audiences, was nominated for three Academy Awards and is ranked No. 6 on the American Film Institute\'s "100 Years...100 Laughs" list.\nBrooks appears in three supporting roles, Governor William J. Le Petomane, a Yiddish-speaking Native American chief and "a director" in line to help invade Rock Ridge (a nod to Hitchcock); he also dubs lines for one of Lili Von Shtupp\'s backing troupe. The supporting cast includes Slim Pickens, Alex Karras, and David Huddleston, as well as Brooks regulars Dom DeLuise, Madeline Kahn, and Harvey Korman. Bandleader Count Basie has a cameo as himself, appearing with his orchestra.\nThe film is full of deliberate anachronisms, from the Count Basie Orchestra playing "April in Paris" in the Wild West, to Slim Pickens referring to the "Wide World of Sports".\nIn 2006, "Blazing Saddles" was deemed "culturally, historically, or aesthetically significant" by the Library of Congress and was selected for preservation in the National Film Registry.', 'plot': 'Plot.\nOn the American frontier of 1874, a new railroad under construction will have to be rerouted through the town of Rock Ridge in order to avoid quicksand. 
Realizing this will make Rock Ridge worth millions, territorial attorney general Hedley Lamarr plans to force Rock Ridge\'s residents out of the town and sends a gang of thugs, led by his flunky Taggart, to shoot the sheriff and trash the town. The townspeople demand that Governor William J. Le Petomane appoint a new sheriff to protect them. Lamarr persuades dim-witted Le Petomane to appoint Bart, a black railroad worker about to be executed for assaulting Taggart. A black sheriff, Lamarr reasons, will offend the townspeople, create chaos, and leave Rock Ridge at his mercy.\nAfter an initial hostile reception (Bart takes himself "hostage" to escape), he relies on his quick wits and the assistance of Jim, an alcoholic gunslinger known as the "Waco Kid", to overcome the townspeople\'s hostility. Bart subdues Mongo, an immensely strong and dim-witted yet philosophical henchman sent to kill him, then outwits German seductress-for-hire Lili Von Shtupp at her own game, with Lili falling in love with him. Upon release, Mongo vaguely informs Bart of Lamarr\'s connection to the railroad, so Bart and Jim visit the railroad worksite and discover from Bart\'s best friend Charlie that the railway is planned to go through Rock Ridge. Taggart and his men arrive to kill Bart, but Jim outshoots them and forces their retreat. Lamarr, furious that his schemes have backfired, recruits an army of thugs, including common criminals, motorcycle gangsters, Ku Klux Klansmen, Nazis, and Methodists.\nEast of Rock Ridge, Bart introduces the white townspeople to the black, Chinese, and Irish railroad workers who have agreed to help in exchange for acceptance by the community, and explains his plan to defeat Lamarr\'s army. They labor all night to build a perfect copy of the town as a diversion. When Bart realizes it will not fool the villains, the townsfolk construct copies of themselves. Bart, Jim, and Mongo buy time by constructing the "Gov. William J. 
Le Petomane Thruway", forcing the raiding party to send for change to pay the toll. Once through the tollbooth, the raiders attack the fake town and its population of dummies, which have been booby-trapped with dynamite. After Jim detonates the bombs with his sharpshooting, launching bad guys and horses skyward, the Rock Ridgers attack the villains.\nThe resulting brawl between townsfolk, railroad workers, and Lamarr\'s thugs literally breaks the fourth wall and bursts onto a neighboring movie set where director Buddy Bizarre is filming a Busby Berkeley-style top-hat-and-tails musical number; into the studio commissary for a food fight; and spilling out of the Warner Bros. film lot onto the streets of Burbank. Lamarr, realizing he has been beaten, hails a taxi and orders the cabbie to "drive me off this picture". He ducks into Mann\'s Chinese Theatre, which is showing the premiere of "Blazing Saddles". As he settles into his seat, he sees onscreen Bart arriving on horseback outside the theatre. Bart blocks Lamarr\'s escape and shoots him in the groin. Bart and Jim then enter the theater to watch the end of the film, in which Bart announces to the townspeople that he is moving on because his work is done (and he is bored). Riding out of town, he finds Jim, still eating his popcorn, and invites him along to "nowhere special". The two friends briefly ride into the desert before dismounting and boarding a limousine, which drives off into the sunset.', 'cast': 'Cast.\nCast notes:', 'production': 'Production.\nThe idea for the film came from a story outline written by Andrew Bergman that he originally intended to develop and produce himself. "I wrote a first draft called "Tex-X"" (a play on Malcolm X\'s name), he said. "Alan Arkin was hired to direct and James Earl Jones was going to play the sheriff. That fell apart, as things often do." 
Brooks was taken with the story, which he described as "hip talk—1974 talk and expressions—happening in 1874 in the Old West", and purchased the film rights from Bergman. Though he had not worked with a writing team since "Your Show of Shows", he hired a group of writers (including Bergman) to expand the outline, and posted a large sign: "Please do not write a polite script." Brooks described the writing process as chaotic: ""Blazing Saddles" was more or less written in the middle of a drunken fistfight. There were five of us all yelling loudly for our ideas to be put into the movie. Not only was I the loudest, but luckily I also had the right as director to decide what was in or out." Bergman remembers the room being just as chaotic, telling "Creative Screenwriting", "In the beginning, we had five people. One guy left after a couple of weeks. Then, it was basically me, Mel, Richie Pryor and Norman Steinberg. Richie left after the first draft and then Norman, Mel and I wrote the next three or four drafts. It was a riot. It was a rioter’s room!"\nThe original title, "Tex X", was rejected to avoid it being mistaken for an X-rated film, as were "Black Bart" – a reference to Black Bart, a white highwayman of the 19th century – and "Purple Sage". Brooks said he finally conceived "Blazing Saddles" one morning while taking a shower.\nThe casting was problematic. Richard Pryor was Brooks\' original choice to play Sheriff Bart, but the studio, claiming his history of drug arrests made him uninsurable, refused to approve financing with Pryor as the star. The role of Sheriff Bart went to Cleavon Little, and Pryor remained as a screenwriter instead. Brooks offered the other leading role, the Waco Kid, to John Wayne; he declined, deeming the film "too blue" for his family-oriented image, but assured Brooks that "he would be the first one in line to see it." 
Gig Young was cast, but he collapsed during his first scene from what was later determined to be alcohol withdrawal syndrome, and Gene Wilder was flown in to replace him. Johnny Carson and Wilder both turned down the Hedley Lamarr role before Harvey Korman was cast. Madeline Kahn objected when Brooks asked to see her legs during her audition. "She said, \'So it\'s THAT kind of an audition? Brooks recalled. "I explained that I was a happily married man and that I needed someone who could straddle a chair with her legs like Marlene Dietrich in "Destry Rides Again." So she lifted her skirt and said, \'No touching.\nBrooks had numerous conflicts over content with Warner Bros. executives, including frequent use of the word "nigger", Lili Von Shtupp\'s seduction scene, the cacophony of flatulence around the campfire, and Mongo punching out a horse. Brooks, whose contract gave him final content control, declined to make any substantive changes, with the exception of cutting Bart\'s final line during Lili\'s seduction: "I hate to disappoint you, ma\'am, but you\'re sucking my arm." When asked later about the many "nigger" references, Brooks said he received consistent support from Pryor and Little. He added, "If they did a remake of "Blazing Saddles" today [2012], they would leave out the N-word. And then, you\'ve got no movie." Brooks said he received many letters of complaint after the film\'s release.\nThe film was almost not released. "When we screened it for executives, there were few laughs," said Brooks. "The head of distribution said, \'Let\'s dump it and take a loss.\' But [studio president John] Calley insisted they open it in New York, Los Angeles and Chicago as a test. It became the studio\'s top moneymaker that summer." 
The world premiere took place on February 7, 1974, at the Pickwick Drive-In Theater in Burbank; 250 invited guests—including Little and Wilder—watched the film on horseback.', 'songs and music': 'Songs and music.\nMel Brooks wrote the music and lyrics for three of "Blazing Saddles" songs, "The Ballad of Rock Ridge", "I\'m Tired", and "The French Mistake". Brooks also wrote the lyrics to the title song, with music by John Morris, the composer of the film\'s score. To sing the title song, Brooks advertised in the trade papers for a "Frankie Laine–type" singer; to his surprise, Laine himself offered his services. "Frankie sang his heart out ... and we didn\'t have the heart to tell him it was a spoof. He never heard the whip cracks; we put those in later. We got so lucky with his serious interpretation of the song."\nThe choreographer for "I\'m Tired" and "The French Mistake" was Alan Johnson. "I\'m Tired" is a homage to and parody of Marlene Dietrich\'s singing of Cole Porter\'s song "I\'m the Laziest Gal in Town" in Alfred Hitchcock\'s 1950 film "Stage Fright", as well as "Falling in Love Again (Can\'t Help It)" from "The Blue Angel".\nThe orchestrations were by Morris and Jonathan Tunick.\nThe first studio-licensed release of the full music soundtrack to "Blazing Saddles" was on La-La Land Records on August 26, 2008. Remastered from original studio vault elements, the limited-edition CD – a run of 3,000 – features the songs from the film as well as Morris\'s score. Instrumental versions of all the songs are bonus tracks on the disc. The disc features liner notes featuring comments from Mel Brooks and John Morris.', 'reception': 'Reception.\nWhile "Blazing Saddles" is now considered a classic comedy, critical reaction was mixed when the film was released. Vincent Canby wrote:\nRoger Ebert gave the film four stars out of four, calling it a "crazed grab bag of a movie that does everything to keep us laughing except hit us over the head with a rubber chicken. 
Mostly, it succeeds. It\'s an audience picture; it doesn\'t have a lot of classy polish and its structure is a total mess. But of course! What does that matter while Alex Karras is knocking a horse cold with a right cross to the jaw?" Gene Siskel awarded three stars out of four and called it "bound to rank with the funniest of the year," adding, "Whenever the laughs begin to run dry, Brooks and his quartet of gag writers splash about in a pool of obscenities that score belly laughs if your ears aren\'t sensitive and if you\'re hip to western movie conventions being parodied." "Variety" wrote, "If comedies are measured solely by the number of yocks they generate from audiences, then \'Blazing Saddles\' must be counted a success ... Few viewers will have time between laughs to complain that pic is essentially a raunchy, protracted version of a television comedy skit."\nCharles Champlin of the "Los Angeles Times" called the film "irreverent, outrageous, improbable, often as blithely tasteless as a stag night at the Friar\'s Club and almost continuously funny." Gary Arnold of "The Washington Post" was negative, writing that "Mel Brooks squanders a snappy title on a stockpile of stale jokes. To say that this slapdash Western spoof lacks freshness and spontaneity and originality is putting it mildly. \'Blazing Saddles\' is at once a messy and antiquated gag machine." Jan Dawson of "The Monthly Film Bulletin" wrote, "Perhaps it is pedantic to complain that the whole is not up to the sum of its parts when, for the curate\'s egg that it is, "Blazing Saddles" contains so many good parts and memorable performances." 
John Simon wrote a negative review of "Blazing Saddles", saying, "All kinds of gags—chiefly anachronisms, irrelevancies, reverse ethnic jokes, and out and out vulgarities—are thrown together pell-mell, batted about insanely in all directions, and usually beaten into the ground."\nOn review aggregator Rotten Tomatoes, the film has an 89% approval rating based on 61 reviews, with a average rating of 8.20/10. The site\'s consensus reads: "Daring, provocative, and laugh-out-loud funny, "Blazing Saddles" is a gleefully vulgar spoof of Westerns that marks a high point in Mel Brooks\' storied career." On Metacritic it has a score of 73% based on reviews from 12 critics, indicating "generally favorable reviews".\nDuring production for the film, retired longtime film star Hedy Lamarr sued Warner Bros. for $100,000, charging that the film\'s running parody of her name infringed on her right to privacy. Brooks said that he was flattered and chose to not fight it in court; the studio settled out of court for a small sum and an apology for "almost using her name." Brooks said that Lamarr "never got the joke." This lawsuit would be referenced by an in-film joke where Brooks\' character, the Governor, tells Hedley Lamarr that "This is 1874; you\'ll be able to sue HER."\nIshmael Reed\'s 1969 novel "Yellow Back Radio Broke-Down" has been cited as an important precursor or influence for "Blazing Saddles", a connection that Reed himself has made.\nBox office.\nThe film earned theatrical rentals of $26.7 million in its initial release in the United States and Canada. In its 1976 reissue, it earned a further $10.5 million and another $8 million in 1979. Its total rentals in the United States and Canada totalled $47.8 million from a gross of $119.5 million, becoming only the tenth film up to that time to pass the $100 million mark.', 'box office': 'Box office.\nThe film earned theatrical rentals of $26.7 million in its initial release in the United States and Canada. 
In its 1976 reissue, it earned a further $10.5 million and another $8 million in 1979. Its total rentals in the United States and Canada totalled $47.8 million from a gross of $119.5 million, becoming only the tenth film up to that time to pass the $100 million mark.', 'awards and accolades': 'Awards and accolades.\nWhile addressing his group of bad guys, Harvey Korman\'s character reminds them that, although they are risking their lives, he is "risking an almost certain Academy Award nomination for Best Supporting Actor!" Korman did not receive an Oscar bid, but the film did get three nominations at the 47th Academy Awards, including Best Supporting Actress for Madeline Kahn.\nIn 2006, "Blazing Saddles" was deemed "culturally, historically, or aesthetically significant" by the Library of Congress and was selected for preservation in the National Film Registry.\nUpon the release of the 30th anniversary special edition in 2004, "Today" said that the movie "skewer[ed] just about every aspect of racial prejudice while keeping the laughs coming" and that it was "at the top of a very short list" of comedies still funny after 30 years. In 2014, NPR wrote that four decades after the movie was made it was "still as biting a satire" on racism as ever, although its treatment of gays and women was "not self-aware at all".\nThe film is recognized by American Film Institute in these lists:', 'adaptations': 'Adaptations.\nTV series.\nA television pilot titled "Black Bart" was produced for CBS based on Bergman\'s original story. It featured Louis Gossett Jr. as Bart and Steve Landesberg as his drunkard sidekick, a former Confederate officer named "Reb Jordan". Other cast members included Millie Slavin and Noble Willingham. Bergman is listed as the sole creator. CBS aired the pilot once on April 4, 1975. 
The pilot episode of "Black Bart" was later included as a bonus feature on the "Blazing Saddles" 30th Anniversary DVD and the Blu-ray disc.\nPossible stage production.\nIn September 2017, Mel Brooks indicated his desire to do a stage play version of "Blazing Saddles" in the future.', 'tv series': 'TV series.\nA television pilot titled "Black Bart" was produced for CBS based on Bergman\'s original story. It featured Louis Gossett Jr. as Bart and Steve Landesberg as his drunkard sidekick, a former Confederate officer named "Reb Jordan". Other cast members included Millie Slavin and Noble Willingham. Bergman is listed as the sole creator. CBS aired the pilot once on April 4, 1975. The pilot episode of "Black Bart" was later included as a bonus feature on the "Blazing Saddles" 30th Anniversary DVD and the Blu-ray disc.', 'possible stage production': 'Possible stage production.\nIn September 2017, Mel Brooks indicated his desire to do a stage play version of "Blazing Saddles" in the future.', 'in popular culture': 'In popular culture.\nThe Rock Ridge standard for CD and DVD media is named after the town in "Blazing Saddles".\nThe 2022 animated film "", starring Michael Cera, Samuel L. Jackson, Michelle Yeoh, and Ricky Gervais, was originally titled "Blazing Samurai" and its creators called it "equally inspired by and an homage to "Blazing Saddles"." Brooks served as an executive producer for the production, and voiced one of the characters.', 'home media': 'Home media.\nThe film was first released on DVD in 1997, followed by a 30th Anniversary Special Edition DVD in 2004 and a Blu-ray version in 2006. 
A 40th anniversary Blu-ray set was released in 2014.'}, [{'name': 'film', 'params': {'name': 'Blazing Saddles', 'image': 'blazing_saddles_movie_poster.jpg', 'caption': 'Theatrical release poster by [[John Alvin]]{{cite news|first=Jocelyn|last=Stewart|title=John Alvin, 59; created movie posters for such films as \'Blazing Saddles\' and \'E.T.\'|url=https://www.latimes.com/news/printedition/california/la-me-alvin10feb10,1,5113268.story|newspaper=[[Los Angeles Times]]|date=February 10, 2008|access-date=February 10, 2008|archive-date=March 14, 2010|archive-url=https://web.archive.org/web/20100314184353/http://articles.latimes.com/2008/feb/10/local/me-alvin10|url-status=live}}', 'director': '[[Mel Brooks]]', 'producer': '[[Michael Hertzberg]]', 'screenplay': '{{Plainlist|\n* Mel Brooks\n* [[Norman Steinberg]]\n* [[Andrew Bergman]]\n* [[Richard Pryor]]\n* [[Alan Uger]]\n}}', 'story': 'Andrew Bergman', 'starring': '{{Plainlist|\n* [[Cleavon Little]]\n* [[Gene Wilder]]\n* [[Slim Pickens]]\n* [[Alex Karras]]\n* Mel Brooks\n* [[Harvey Korman]]\n* [[Madeline Kahn]]\n}}', 'music': '[[John Morris (composer)|John Morris]]', 'cinematography': '[[Joseph Biroc]]', 'editing': '{{Plainlist|\n* [[Danford B. Greene|Danford Greene]]\n* [[John C. 
Howard]]\n}}', 'studio': 'Crossbow Productions', 'distributor': '[[Warner Bros.]]', 'released': '{{Film date|1974|2|7}}', 'runtime': '93 minutes{{AFI film|54998}}', 'country': 'United States', 'budget': '$2.6 million', 'gross': '$119.6 million (US and Canada){{cite web|url=https://www.boxofficemojo.com/release/rl4064773633/rankings/|title=Blazing Saddles (1974)|access-date=January 17, 2012|website=[[Box Office Mojo]]|archive-date=July 14, 2019|archive-url=https://web.archive.org/web/20190714112524/https://www.boxofficemojo.com/movies/?id=blazingsaddles.htm|url-status=live}}'}}]), 231851: TipOfTheTongueDoc('72634502', 'Skulle det dukke opp flere lik er det bare å ringe', 'Q17415598', [['Q11424', 'film']], re.compile('^Skulle det dukke opp flere lik er det bare å ringe \\(If Any More Bodies Should Turn Up, Just Call\\) is.{192} the film\\. The stage play was a translation of Jack Popplewell\'s British play "Busybody", from 1964\\.$', flags=48), {'abstract': 'Skulle det dukke opp flere lik er det bare å ringe (If Any More Bodies Should Turn Up, Just Call) is a Norwegian crime comedy from 1970 directed by Knut Bohwim. The film stars Arve Opsahl and Aud Schønemann. The film is based on a popular theater production from 1968 with the same cast as in the film. The stage play was a translation of Jack Popplewell\'s British play "Busybody", from 1964.'}, [{'name': 'film', 'params': {'name': 'Skulle det dukke opp flere lik, er det bare å ringe', 'director': '[[Knut Bohwim]]', 'producer': 'Anne Vennerød', 'writer': '[[Jon Lennart Mjøen]]', 'based_on': "[[Jack Popplewell]]'s play ''Busybody''", 'starring': '[[Aud Schønemann]]
[[Arve Opsahl]]
[[Arne Aas (actor)|Arne Aas]]
[[Carsten Byhring]]
[[Thorleif Reiss]]
[[Jorunn Kjellsby]]', 'music': '[[Egil Monn-Iversen]]', 'cinematography': '[[Mattis Mathiesen]]', 'editing': '[[Reidar Lund]]
[[Olav Engebretsen]]', 'distributor': 'EMI Produksjon', 'released': 'March 2, 1970', 'runtime': '98 minutes', 'country': 'Norway', 'language': 'Norwegian'}}]), }) def test_test_tip_of_the_tongue_queries_train(self): self._test_queries('trec-tot/2023/train', count=150, items={ 0: TipOfTheTongueQuery('763', 'https://irememberthismovie.com/super-rare-surreal-dystopian-masterpiece/', 'movie', 'Super Rare Surreal Dystopian Masterpiece', 'Very rare movie that is scifi/dystopian/experimental/surreal. It’s like Stalker meets el Topo meets Holy Mountain meets Alphaville meets Delicatessen meets Hard to be a God, like Kurosawa, Tarkovsky, and Lynch had a kid together. It was color, possibly Russian, and I don’t really remember the decade but want to say 60s or 70s, though could easily be more recent. It is VERY rare, there is only one crappy partial print of it, and that is what the youtube version is from. Lot of wide shots in a surreal wilderness, winter settings, strange bleeding saturation in some shots. Crazy costumes. Seriously one of the strangest films I’ve ever seen and my favorite films are strange/weird ones. If you’ve ever seen what you’re thinking of on a “best weird movies” or “you’ve never seen this!” list, that’s NOT it. I don’t think this film even has a cult following of ten people. It’s an actual rare gem. 
Have been looking through selections at 366 Weird Movies and not found it yet (btw the way most of those titles are exactly the kind of not-actually-rare movies this film is definitely not).', [{'id': 1, 'text': 'Very rare movie that is scifi/dystopian/experimental/surreal.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': True, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 2, 'text': 'It’s like Stalker meets el Topo meets Holy Mountain meets Alphaville meets Delicatessen meets Hard to be a God, like Kurosawa, Tarkovsky, and Lynch had a kid together.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': True, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': True, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': 
False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 3, 'text': 'It was color, possibly Russian, and I don’t really remember the decade but want to say 60s or 70s, though could easily be more recent.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': True, 'origin_language': False, 'category': False, 'character': False, 'production_visual': True, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': True, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 4, 'text': 'It is VERY rare, there is only one crappy partial print of it, and that is what the youtube version is from.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 
'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 5, 'text': 'Lot of wide shots in a surreal wilderness, winter settings, strange bleeding saturation in some shots.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': True, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 6, 'text': 'Crazy costumes.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 
'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 7, 'text': 'Seriously one of the strangest films I’ve ever seen and my favorite films are strange/weird ones.', 'labels': {'opinion': True, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': True, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 8, 'text': 'If you’ve ever seen what you’re thinking of on a “best weird movies” or “you’ve never seen this!”', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': True, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 
'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 9, 'text': 'list, that’s NOT it.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 10, 'text': 'I don’t think this film even has a cult following of ten people.', 'labels': {'opinion': True, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 
'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 11, 'text': 'It’s an actual rare gem.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 12, 'text': 'Have been looking through selections at 366 Weird Movies and not found it yet (btw the way most of those titles are exactly the kind of not-actually-rare movies this film is definitely not).', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': True, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': 
False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}]), 9: TipOfTheTongueQuery('293', 'https://irememberthismovie.com/date-movie-from-awhile-ago/', 'movie', 'Date Movie from awhile ago', 'This was a movie from at least 10 years ago.\xa0 The lead actor and actress are on a first date and eating at a burger stand or shop of some sort and the guy mentions how hard dates are because the guy worries the whole time about whether he will get a kiss or not.\xa0 So the female says lets just kiss now and get it out of the way. 
I dont recall how long ago it was, any of the actors or what else happened in the movie.\xa0 i think it was a romantic comedy of sorts.', [{'id': 1, 'text': 'This was a movie from at least 10 years ago.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': True, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 2, 'text': 'The lead actor and actress are on a first date and eating at a burger stand or shop of some sort and the guy mentions how hard dates are because the guy worries the whole time about whether he will get a kiss or not.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 
'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 3, 'text': 'So the female says lets just kiss now and get it out of the way.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': True, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 4, 'text': 'I dont recall how long ago it was, any of the actors or what else happened in the movie.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': True, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': 
False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 5, 'text': 'i think it was a romantic comedy of sorts.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': True, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}]), 149: TipOfTheTongueQuery('828', 'https://irememberthismovie.com/animated-movie-consisted-of-several-short-movies-that-was-once-on-netflix/', 'movie', 'Animated Movie (consisted of several short movies) That Was Once On Netflix', 'I remember watching an animated movie in the years 2009-2015. I don’t remember the name or any of the actors of the movie, but I remember a lot about it. First off, it was animated. It wasn’t super high quality, the people/things looked like silhouettes (you couldn’t see their faces or exact clothing details, but you could see the shadows of them). 
Second, it was about a boy and a girl acting out these “plays” in an old theater owned by an old man, who may have been their grandfather. There was also an owl in the theater, and it looked at the camera and hooted whenever they changed from one “play” to another. Third, to change from one costume to another, they stood on a sort of platform thing. It “scanned” them, and the silhouettes of the clothes they described appeared on their bodies. The plays/plots I remember are as follows : Boy gets put in prison (wrongly accused of stealing a necklace, I think), but gets pitied by a princess and she takes him food/clothing. However, the good princess’s evil sister tries to marry the boy when he gets out of prison. When he refuses/realizes she’s the evil princess, she curses him to be a wolf. Boy then eventually gets freed by the good princess. Boy finds himself in a city of gold, but an evil dragon rules the city. The dragon gives the people gold, but only if they sacrifice the most beautiful maiden to the dragon every month/year. Boy eventually slays dragon and marries the girl, who was the most beautiful at that time. Boy is in Africa/similar place and likes to play the drum, but is shunned for doing so. The girl’s father is the clan/tribe chief and is dying from illness. Boy gets thrown out of clan/tribe, but saves an old medicine man from a leopard/big cat. Old medicine man teaches boy how to play drum, which ends up healing the sick clan/tribe chief. The drum also makes the clan/tribe unbeatable in battle. Boy is an adventurer in a strange land where he crawls through a tunnel and gets put in prison. He has to choose/marry one of the king’s three daughters, but he doesn’t know which one. A bee flies around the head of the correct daughter because he fed the bee earlier in the tunnel. He also gets help from other monsters he met in the tunnel/cave. There may have been more, but it’s been years since I saw the movie. 
It was', [{'id': 1, 'text': 'I remember watching an animated movie in the years 2009-2015.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': True, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': True, 'situational_evidence': False}}}, {'id': 2, 'text': 'I don’t remember the name or any of the actors of the movie, but I remember a lot about it.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 
'situational_evidence': False}}}, {'id': 3, 'text': 'First off, it was animated.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': True, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 4, 'text': 'It wasn’t super high quality, the people/things looked like silhouettes (you couldn’t see their faces or exact clothing details, but you could see the shadows of them).', 'labels': {'opinion': True, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': True, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 
'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 5, 'text': 'Second, it was about a boy and a girl acting out these “plays” in an old theater owned by an old man, who may have been their grandfather.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 6, 'text': 'There was also an owl in the theater, and it looked at the camera and hooted whenever they changed from one “play” to another.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': True, 'origin_actor': False, 'location_type': True, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 
'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 7, 'text': 'Third, to change from one costume to another, they stood on a sort of platform thing.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 8, 'text': 'It “scanned” them, and the silhouettes of the clothes they described appeared on their bodies.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 
'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 9, 'text': 'The plays/plots I remember are as follows : Boy gets put in prison (wrongly accused of stealing a necklace, I think), but gets pitied by a princess and she takes him food/clothing.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 10, 'text': 'However, the good princess’s evil sister tries to marry the boy when he gets out of prison.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 
'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 11, 'text': 'When he refuses/realizes she’s the evil princess, she curses him to be a wolf.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 12, 'text': 'Boy then eventually gets freed by the good princess.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 
'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 13, 'text': 'Boy finds himself in a city of gold, but an evil dragon rules the city.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 14, 'text': 'The dragon gives the people gold, but only if they sacrifice the most beautiful maiden to the dragon every month/year.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 
'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 15, 'text': 'Boy eventually slays dragon and marries the girl, who was the most beautiful at that time.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 16, 'text': 'Boy is in Africa/similar place and likes to play the drum, but is shunned for doing so.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': True, 'production_audio': False, 'scene': 
True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 17, 'text': 'The girl’s father is the clan/tribe chief and is dying from illness.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 18, 'text': 'Boy gets thrown out of clan/tribe, but saves an old medicine man from a leopard/big cat.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 
'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 19, 'text': 'Old medicine man teaches boy how to play drum, which ends up healing the sick clan/tribe chief.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 20, 'text': 'The drum also makes the clan/tribe unbeatable in battle.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 
'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 21, 'text': 'Boy is an adventurer in a strange land where he crawls through a tunnel and gets put in prison.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 22, 'text': 'He has to choose/marry one of the king’s three daughters, but he doesn’t know which one.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 
'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 23, 'text': 'A bee flies around the head of the correct daughter because he fed the bee earlier in the tunnel.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 24, 'text': 'He also gets help from other monsters he met in the tunnel/cave.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 
'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 25, 'text': 'There may have been more, but it’s been years since I saw the movie.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': True, 'situational_evidence': False}}}, {'id': 26, 'text': 'It was', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 
'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}]), }) def test_test_tip_of_the_tongue_queries_dev(self): self._test_queries('trec-tot/2023/dev', count=150, items={ 0: TipOfTheTongueQuery('152', 'https://irememberthismovie.com/foriegn-film-about-3-strangers-in-an-apartment/', 'movie', 'Foriegn Film about 3 Strangers in an Apartment', 'Movie from the early 2000s I believe about three people living in an apartment but never running into each other. One woman and two men are in the apartment. The woman is the realtor or owner of the apartment and at least one of the guys is a squator/homeless. It is a Korean or Chinese film I think. Art house flick… I think it won a few awards from film festivals like Cannes. Help if you can! 
?', [{'id': 1, 'text': 'Movie from the early 2000s I believe about three people living in an apartment but never running into each other.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': True, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 2, 'text': 'One woman and two men are in the apartment.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 
'situational_evidence': False}}}, {'id': 3, 'text': 'The woman is the realtor or owner of the apartment and at least one of the guys is a squator/homeless.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 4, 'text': 'It is a Korean or Chinese film I think.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': True, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': 
False, 'situational_evidence': False}}}, {'id': 5, 'text': 'Art house flick.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': True, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 6, 'text': 'I think it won a few awards from film festivals like Cannes.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 7, 
'text': 'Help if you can!', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': True, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}]), 9: TipOfTheTongueQuery('813', 'https://irememberthismovie.com/i-remember-this-movie-3/', 'movie', 'I remember this movie', 'Looking for movie I saw when I was a kid. It may have been an episode of a TV show. Anyway, this newly married couple bought an old lighthouse or at least a house that was on a cliff overlooking the sea.The woman found a bunch of mirrors in the attic of the house that mesmerized her. She would dance in front of them. The story was a former female occupant had died in the house under suspicious circumstances, maybe she committed suicide or something. But the mirrors were haunted and the ghost arose and fought with the woman. Her husband came to help and tried to beat the ghost with a metal rod, a fire poker. Of course, the solid object went through the ghost and he actually beat his wife to death. Despondent , he thought that he saw his wife dancing in front of a window beckoning to him and he jumped through the window to his death. 
Another couple witnessed this whereupon the woman fainted and her husband scooped her up and rushed from the house. It was a black and white movie and I think I saw it sometime in the early ’60s on television. I have looked at The Outer Limits Episodes but could find nothing. Would like to see it again. Any help appreciated.', [{'id': 1, 'text': 'Looking for movie I saw when I was a kid.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': True, 'situational_evidence': False}}}, {'id': 2, 'text': 'It may have been an episode of a TV show.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 
'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 3, 'text': 'Anyway, this newly married couple bought an old lighthouse or at least a house that was on a cliff overlooking the sea.The woman found a bunch of mirrors in the attic of the house that mesmerized her.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 4, 'text': 'She would dance in front of them.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 
'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 5, 'text': 'The story was a former female occupant had died in the house under suspicious circumstances, maybe she committed suicide or something.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 6, 'text': 'But the mirrors were haunted and the ghost arose and fought with the woman.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 
'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 7, 'text': 'Her husband came to help and tried to beat the ghost with a metal rod, a fire poker.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 8, 'text': 'Of course, the solid object went through the ghost and he actually beat his wife to death.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 
'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 9, 'text': 'Despondent , he thought that he saw his wife dancing in front of a window beckoning to him and he jumped through the window to his death.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 10, 'text': 'Another couple witnessed this whereupon the woman fainted and her husband scooped her up and rushed from the house.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 
'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 11, 'text': 'It was a black and white movie and I think I saw it sometime in the early ’60s on television.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': True, 'character': False, 'production_visual': True, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': True, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': True, 'situational_evidence': False}}}, {'id': 12, 'text': 'I have looked at The Outer Limits Episodes but could find nothing.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': True, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': 
False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 13, 'text': 'Would like to see it again.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': True, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 14, 'text': 'Any help appreciated.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': True, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 
'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}]), 149: TipOfTheTongueQuery('521', 'https://irememberthismovie.com/ghost-story/', 'movie', 'Ghost story', 'There’s an old one I got to see just once as a kid. Dude takes a bet to survive one nite in a mansion. So, he’s there thinking he’s alone and all these people show up. Remember two real well. A girl he falls I love with and some scientist. Scientist talking at one point shows experiment where he’s talking about how long something can live, cuts the head off a snake, tells dude to try to pick up snake head and dude almost gets bit. Another scene he’s with the girl in bed laying his head in her chest and jumps up worried cause he can’t hear her heartbeat. End of show he walks out stands by the mansion gates and it slams on him with the spikes stabbing him in the back. 
He loses the bet but gets the girl', [{'id': 1, 'text': 'There’s an old one I got to see just once as a kid.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': True, 'situational_evidence': False}}}, {'id': 2, 'text': 'Dude takes a bet to survive one nite in a mansion.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': 
False}}}, {'id': 3, 'text': 'So, he’s there thinking he’s alone and all these people show up.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 4, 'text': 'Remember two real well.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': False, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 5, 'text': 'A girl he 
falls I love with and some scientist.', 'labels': {'opinion': False, 'emotion': False, 'hedging': True, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 6, 'text': 'Scientist talking at one point shows experiment where he’s talking about how long something can live, cuts the head off a snake, tells dude to try to pick up snake head and dude almost gets bit.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 
'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 7, 'text': 'Another scene he’s with the girl in bed laying his head in her chest and jumps up worried cause he can’t hear her heartbeat.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 8, 'text': 'End of show he walks out stands by the mansion gates and it slams on him with the spikes stabbing him in the back.', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': True, 'plot': False, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': True, 'object': True, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': 
{'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}, {'id': 9, 'text': 'He loses the bet but gets the girl', 'labels': {'opinion': False, 'emotion': False, 'hedging': False, 'social': False, 'comparison_relative': False, 'search': False, 'movie': {'music_compare': False, 'origin_movie': False, 'origin_language': False, 'category': False, 'character': True, 'production_visual': False, 'music_specific': False, 'location_specific': False, 'production_audio': False, 'scene': False, 'plot': True, 'person_fictional': False, 'negation': False, 'genre_audience': False, 'person_real': False, 'production_camera_angle': False, 'origin_actor': False, 'location_type': False, 'object': False, 'timeframe_singular': False, 'quote': False, 'genre_traditional_tone': False, 'release_date': False, 'timeframe_plural': False}, 'context': {'physical_medium': False, 'physical_user_location': False, 'situational_count': False, 'cross_media': False, 'situational_witness': False, 'temporal': False, 'situational_evidence': False}}}]), }) def test_test_tip_of_the_tongue_qrels_train(self): self._test_qrels('trec-tot/2023/train', count=150, items={ 0: TrecQrel('763', '16742289', 1, '0'), 9: TrecQrel('293', '142456', 1, '0'), 149: TrecQrel('828', '30672517', 1, '0'), }) def test_test_tip_of_the_tongue_qrels_dev(self): self._test_qrels('trec-tot/2023/dev', count=150, items={ 0: TrecQrel('152', '1940119', 1, '0'), 9: TrecQrel('813', '2310134', 1, '0'), 149: TrecQrel('521', '2911505', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_tot_2024.py ================================================ import re import unittest from ir_datasets.formats import TrecQrel from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc2024, TipOfTheTongueQuery2024 from test.integration.base import 
DatasetIntegrationTest import ir_datasets class TestTipOfTheTongue(DatasetIntegrationTest): def test_tip_of_the_tongue_docs(self): self._test_docs('trec-tot/2024', count=3185450, items={ 0: TipOfTheTongueDoc2024("846", "Museum of Work", "Q6941060", re.compile("^The Museum of Work .*"), [{"start": 0, "end": 798, "section": "Abstract"}, {"start": 798, "end": 1620, "section": "Overview"}, {"start": 1620, "end": 3095, "section": "Exhibitions"}, {"start": 3095, "end": 3371, "section": "The history of Alva"}, {"start": 3371, "end": 3824, "section": "Industriland"}, {"start": 3824, "end": 4371, "section": "Framtidsland (Future country)"}, {"start": 4371, "end": 4761, "section": "EWK \u2014 The Center for Political Illustration Art"}]), 1091: TipOfTheTongueDoc2024("9764", "Emma Goldman", "Q79969", re.compile("Emma Goldman \\(June 27, 1869 .*"),[{"start": 0, "end": 2752, "section": "Abstract"}, {"start": 2752, "end": 45613, "section": "Biography"}, {"start": 45613, "end": 47371, "section": "Family"}, {"start": 47371, "end": 50317, "section": "Adolescence"}, {"start": 50317, "end": 52433, "section": "Rochester, New York"}, {"start": 52433, "end": 54427, "section": "Most and Berkman"}, {"start": 54427, "end": 57448, "section": "Homestead plot"}, {"start": 57448, "end": 60672, "section": "\"Inciting to riot\""}, {"start": 60672, "end": 63288, "section": "McKinley assassination"}, {"start": 63288, "end": 66975, "section": "''Mother Earth'' and Berkman's release"}, {"start": 66975, "end": 69914, "section": "Reitman, essays, and birth control"}, {"start": 69914, "end": 73788, "section": "World War I"}, {"start": 73788, "end": 76344, "section": "Deportation"}, {"start": 76344, "end": 79375, "section": "Russia"}, {"start": 79375, "end": 83782, "section": "England, Canada, and France"}, {"start": 83782, "end": 86917, "section": "Spanish Civil War"}, {"start": 86917, "end": 87430, "section": "Final years"}, {"start": 87430, "end": 88493, "section": "Death"}, {"start": 88493, "end": 
101764, "section": "Philosophy"}, {"start": 101764, "end": 106976, "section": "Anarchism"}, {"start": 106976, "end": 109922, "section": "Tactical uses of violence"}, {"start": 109922, "end": 111036, "section": "Capitalism and labor"}, {"start": 111036, "end": 114245, "section": "State"}, {"start": 114245, "end": 116281, "section": "Feminism and sexuality"}, {"start": 116281, "end": 117248, "section": "Atheism"}, {"start": 117248, "end": 120736, "section": "Legacy"}, {"start": 120736, "end": 120977, "section": "Works"}]) }) def test_tip_of_the_tongue_queries(self): self._test_queries('trec-tot/2024/test', count=600, items={ 0: TipOfTheTongueQuery2024("2001", re.compile("^I remember this old building I used to pass by in the heart of a bustling financial district, a place where the air always seemed thick.*")), 599: TipOfTheTongueQuery2024("2600", re.compile("^Okay, this is a vague one .\n So I know this is going to be.*")) }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/trec_tot_2025/test_docs_iter.py ================================================ import unittest def load_dataset(): import ir_datasets return ir_datasets.load("trec-tot/2025") def load_doc_number(num): index = 0 for i in load_dataset().docs_iter(): if num == index: return i index += 1 class TestDocsIter(unittest.TestCase): def test_dataset_can_be_loaded(self): actual = load_dataset() self.assertIsNotNone(actual) def test_first_doc(self): actual = load_doc_number(0) self.assertIsNotNone(actual) self.assertEqual("12", actual.doc_id) self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", actual.url) self.assertEqual("Anarchism", actual.title) self.assertIn("a political philosophy and movement that is skeptical", actual.text) self.assertIn("a political philosophy and movement that is skeptical", actual.default_text()) self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", 
actual.default_text()) def test_third_doc(self): actual = load_doc_number(3) self.assertIsNotNone(actual) self.assertEqual("303", actual.doc_id) self.assertEqual("https://en.wikipedia.org/wiki/Alabama", actual.url) self.assertEqual("Alabama", actual.title) self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.text) self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.default_text()) ================================================ FILE: test/integration/trec_tot_2025/test_docs_store.py ================================================ import unittest def load_docs_store(): import ir_datasets return ir_datasets.load("trec-tot/2025").docs_store() class TestDocsStore(unittest.TestCase): def test_docs_store_can_be_loaded(self): actual = load_docs_store() self.assertIsNotNone(actual) def test_first_doc(self): actual = load_docs_store().get("12") self.assertIsNotNone(actual) self.assertEqual("12", actual.doc_id) self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", actual.url) self.assertEqual("Anarchism", actual.title) self.assertIn("a political philosophy and movement that is skeptical", actual.text) self.assertIn("a political philosophy and movement that is skeptical", actual.default_text()) self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", actual.default_text()) def test_third_doc(self): actual = load_docs_store().get("303") self.assertIsNotNone(actual) self.assertEqual("303", actual.doc_id) self.assertEqual("https://en.wikipedia.org/wiki/Alabama", actual.url) self.assertEqual("Alabama", actual.title) self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.text) self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.default_text()) def test_some_random_doc(self): actual = load_docs_store().get("6596604") self.assertIsNotNone(actual) self.assertEqual("6596604", actual.doc_id) 
self.assertEqual("https://en.wikipedia.org/wiki/Radio%20Reloj", actual.url) self.assertEqual("Radio Reloj", actual.title) self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", actual.text) self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", actual.default_text()) ================================================ FILE: test/integration/trec_tot_2025/test_qrel_iter.py ================================================ import unittest def load_dataset(dataset_id): import ir_datasets return ir_datasets.load(dataset_id) def load_qrel_number(dataset_id, num): index = 0 for i in load_dataset(dataset_id).qrels_iter(): if num == index: return i index += 1 class TestQrelIter(unittest.TestCase): def test_train_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/train") self.assertIsNotNone(actual) def test_dev1_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/dev1") self.assertIsNotNone(actual) def test_dev2_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/dev2") self.assertIsNotNone(actual) def test_dev3_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/dev3") self.assertIsNotNone(actual) def test_train_qrel_iter(self): actual = load_qrel_number("trec-tot/2025/train", 12) self.assertEqual("1014", actual.query_id) self.assertEqual("46264411", actual.doc_id) self.assertEqual(1, actual.relevance) def test_dev1_qrel_iter(self): actual = load_qrel_number("trec-tot/2025/dev1", 12) self.assertEqual("898", actual.query_id) self.assertEqual("3761238", actual.doc_id) self.assertEqual(1, actual.relevance) def test_dev2_qrel_iter(self): actual = load_qrel_number("trec-tot/2025/dev2", 12) self.assertEqual("632", actual.query_id) self.assertEqual("3261733", actual.doc_id) self.assertEqual(1, actual.relevance) def test_dev3_qrel_iter(self): actual = load_qrel_number("trec-tot/2025/dev3", 12) 
self.assertEqual("2014", actual.query_id) self.assertEqual("446518", actual.doc_id) self.assertEqual(1, actual.relevance) ================================================ FILE: test/integration/trec_tot_2025/test_queries_iter.py ================================================ import unittest def load_dataset(dataset_id): import ir_datasets return ir_datasets.load(dataset_id) def load_query_number(dataset_id, num): index = 0 for i in load_dataset(dataset_id).queries_iter(): if num == index: return i index += 1 class TestQueriesIter(unittest.TestCase): def test_train_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/train") self.assertIsNotNone(actual) def test_dev1_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/dev1") self.assertIsNotNone(actual) def test_dev2_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/dev2") self.assertIsNotNone(actual) def test_dev3_dataset_can_be_loaded(self): actual = load_dataset("trec-tot/2025/dev3") self.assertIsNotNone(actual) def test_query_from_train_dataset_can_be_loaded_01(self): actual = load_query_number("trec-tot/2025/train", 2) self.assertIsNotNone(actual) self.assertEqual("950", actual.query_id) self.assertIn("two girls who run away", actual.default_text()) def test_query_from_train_dataset_can_be_loaded_02(self): actual = load_query_number("trec-tot/2025/train", 25) self.assertIsNotNone(actual) self.assertEqual("484", actual.query_id) self.assertIn("Main character is a famous person like a celebrity", actual.default_text()) def test_query_from_dev1_dataset_can_be_loaded_01(self): actual = load_query_number("trec-tot/2025/dev1", 2) self.assertIsNotNone(actual) self.assertEqual("473", actual.query_id) self.assertIn("possibly a ghost killing in an old house", actual.default_text()) def test_query_from_dev1_dataset_can_be_loaded_02(self): actual = load_query_number("trec-tot/2025/dev1", 25) self.assertIsNotNone(actual) self.assertEqual("153", actual.query_id) 
self.assertIn("Martial arts movie where the human is fighting aliens", actual.default_text()) def test_query_from_dev2_dataset_can_be_loaded_01(self): actual = load_query_number("trec-tot/2025/dev2", 2) self.assertIsNotNone(actual) self.assertEqual("477", actual.query_id) self.assertIn("Pretty sure it was a comedy", actual.default_text()) def test_query_from_dev2_dataset_can_be_loaded_02(self): actual = load_query_number("trec-tot/2025/dev2", 25) self.assertIsNotNone(actual) self.assertEqual("873", actual.query_id) self.assertIn("I remember there were 2 siblings involved in the movie", actual.default_text()) def test_query_from_dev3_dataset_can_be_loaded_01(self): actual = load_query_number("trec-tot/2025/dev3", 2) self.assertIsNotNone(actual) self.assertEqual("2003", actual.query_id) self.assertIn("I remember a scene where the bell tower guy and the soldier had to sneak into this hidden place", actual.default_text()) def test_query_from_dev3_dataset_can_be_loaded_02(self): actual = load_query_number("trec-tot/2025/dev3", 25) self.assertIsNotNone(actual) self.assertEqual("2028", actual.query_id) self.assertIn("The place had this weird energy source", actual.default_text()) def test_query_from_test_dataset_can_be_loaded_01(self): actual = load_query_number("trec-tot/2025/test", 2) self.assertIsNotNone(actual) self.assertEqual(3002, actual.query_id) self.assertIn("learn judo or kung fu techniques and become a skilled master himself", actual.default_text()) def test_query_from_test_dataset_can_be_loaded_02(self): actual = load_query_number("trec-tot/2025/test", 25) self.assertIsNotNone(actual) self.assertEqual(3025, actual.query_id) self.assertIn("discussing complex tax structures and their impacts on businesses globally", actual.default_text()) ================================================ FILE: test/integration/tripclick.py ================================================ import re import datetime import unittest from ir_datasets.formats import TrecQrel, 
TitleUrlTextDoc, GenericQuery, GenericScoredDoc, GenericDocPair from ir_datasets.datasets.tripclick import TripClickQlog, TripClickPartialDoc, LogItem from .base import DatasetIntegrationTest class TestTripclick(DatasetIntegrationTest): def test_docs(self): self._test_docs('tripclick', count=1523878, items={ 0: TitleUrlTextDoc('283040', re.compile(r'^V.*\.$', flags=48), re.compile(r'^http://www.ncb.*85551$', flags=48), re.compile('^BACKGROUND : .* Medical Society\\.\n$', flags=48)), 9: TitleUrlTextDoc('283070', re.compile(r'^N.*\.$', flags=48), re.compile(r'^http://www.ncb.*t_uids=16775235$', flags=48), re.compile('^BACKGROUND : Res.* Medical Society\\.\n$', flags=48)), 1523877: TitleUrlTextDoc('11701272', re.compile(r'^M.*m$', flags=48), re.compile(r'^https://www.ncbi.nlm.*58120938564.pdf$', flags=48), re.compile('^OBJECTIVE : To.* of Neurology\\.\n$', flags=48)), }) self._test_docs('tripclick/logs', count=5196956, items={ 0: TripClickPartialDoc('96657', re.compile(r'^Syst.*$', flags=48), re.compile(r'^https://www.journalslibrary.*hta1060/$', flags=48)), 9: TripClickPartialDoc('96666', re.compile(r'^Econom.*$', flags=48), re.compile(r'^https://www.journalslibrary..*hta3230/$', flags=48)), 5196955: TripClickPartialDoc('11707140', re.compile(r'^Basic Life.*$', flags=48), re.compile(r'^https://covid19evidence.*201021-151716$', flags=48)), }) def test_queries(self): self._test_queries('tripclick/train', count=685649, items={ 0: GenericQuery('8', re.compile(r'^a.*s$', flags=48)), 9: GenericQuery('136', re.compile(r'^c.*g$', flags=48)), 685648: GenericQuery('1647720', re.compile(r'^c.*e$', flags=48)), }) self._test_queries('tripclick/train/head', count=3529, items={ 0: GenericQuery('8', re.compile(r'^a.*s$', flags=48)), 9: GenericQuery('136', re.compile(r'^c.*g$', flags=48)), 3528: GenericQuery('1630245', re.compile(r'^I.*d$', flags=48)), }) self._test_queries('tripclick/train/head/dctr', count=3529, items={ 0: GenericQuery('8', re.compile(r'^a.*s$', flags=48)), 9: 
GenericQuery('136', re.compile(r'^c.*g$', flags=48)), 3528: GenericQuery('1630245', re.compile(r'^I.*d$', flags=48)), }) self._test_queries('tripclick/train/torso', count=105964, items={ 0: GenericQuery('5', re.compile(r'^p.*e$', flags=48)), 9: GenericQuery('43', re.compile(r'^p.*e$', flags=48)), 105963: GenericQuery('1647511', re.compile(r'^h.*s$', flags=48)), }) self._test_queries('tripclick/train/tail', count=576156, items={ 0: GenericQuery('1', re.compile(r'^c.*d$', flags=48)), 9: GenericQuery('65', re.compile(r'^s.*t$', flags=48)), 576155: GenericQuery('1647720', re.compile(r'^c.*e$', flags=48)), }) self._test_queries('tripclick/val', count=3525, items={ 0: GenericQuery('38', re.compile(r'^a.*e$', flags=48)), 9: GenericQuery('226', re.compile(r'^h.*t$', flags=48)), 3524: GenericQuery('1645595', re.compile(r'^h.*n$', flags=48)), }) self._test_queries('tripclick/val/head', count=1175, items={ 0: GenericQuery('38', re.compile(r'^a.*e$', flags=48)), 9: GenericQuery('226', re.compile(r'^h.*t$', flags=48)), 1174: GenericQuery('1630209', re.compile(r'^A.*e$', flags=48)), }) self._test_queries('tripclick/val/head/dctr', count=1175, items={ 0: GenericQuery('38', re.compile(r'^a.*e$', flags=48)), 9: GenericQuery('226', re.compile(r'^h.*t$', flags=48)), 1174: GenericQuery('1630209', re.compile(r'^A.*e$', flags=48)), }) self._test_queries('tripclick/val/torso', count=1175, items={ 0: GenericQuery('534', re.compile(r'^l.*n$', flags=48)), 9: GenericQuery('4773', re.compile(r'^h.*l$', flags=48)), 1174: GenericQuery('1626635', re.compile(r'^p.*f$', flags=48)), }) self._test_queries('tripclick/val/tail', count=1175, items={ 0: GenericQuery('1052', re.compile(r'^r.*y$', flags=48)), 9: GenericQuery('22440', re.compile(r'^g.*w$', flags=48)), 1174: GenericQuery('1645595', re.compile(r'^h.*n$', flags=48)), }) self._test_queries('tripclick/test', count=3525, items={ 0: GenericQuery('24', re.compile(r'^p.*g$', flags=48)), 9: GenericQuery('354', re.compile(r'^a.*e$', flags=48)), 3524: 
GenericQuery('1646719', re.compile(r'^p.*e$', flags=48)), }) self._test_queries('tripclick/test/head', count=1175, items={ 0: GenericQuery('24', re.compile(r'^p.*g$', flags=48)), 9: GenericQuery('354', re.compile(r'^a.*e$', flags=48)), 1174: GenericQuery('1610957', re.compile(r'^S.*l$', flags=48)), }) self._test_queries('tripclick/test/torso', count=1175, items={ 0: GenericQuery('152', re.compile(r'^v.*s$', flags=48)), 9: GenericQuery('2700', re.compile(r'^p.*g$', flags=48)), 1174: GenericQuery('1641005', re.compile(r'^h.*s$', flags=48)), }) self._test_queries('tripclick/test/tail', count=1175, items={ 0: GenericQuery('4752', re.compile(r'^h.*e$', flags=48)), 9: GenericQuery('15118', re.compile(r'^i.*n$', flags=48)), 1174: GenericQuery('1646719', re.compile(r'^p.*e$', flags=48)), }) def test_qlogs(self): self._test_qlogs('tripclick/logs', count=5317350, items={ 0: TripClickQlog(re.compile(r'3fjdej.{10}2yccq255', flags=48), re.compile(r'48a.{4}86f2', flags=48), 'community aed', re.compile(r'comm.*d', flags=48), datetime.datetime(2013, 1, 1, 1, 17, 26, 57000), (LogItem('981744', True),)), 9: TripClickQlog(re.compile(r'htgqag5.{10}qeult45', flags=48), re.compile(r'd23b.{4}539', flags=48), 'primary wound closure', re.compile(r'prim.*ure', flags=48), datetime.datetime(2013, 1, 1, 2, 15, 33, 930000), (LogItem('1185098', True),)), 19: TripClickQlog(re.compile(r'2qmdal45.{10}mbgrro', flags=48), re.compile(r'342.{4}6d4f', flags=48), 'status epilepticus treatment', re.compile(r'\(tit.*atment\)', flags=48), datetime.datetime(2013, 1, 1, 4, 50, 20, 603000), (LogItem('834890', True),)), 63: TripClickQlog(re.compile(r'c352x5mog.{10}bup45', flags=48), re.compile(r'e511.{4}6e9', flags=48), 'conjunctivitis medication', re.compile(r'conju.*ication', flags=48), datetime.datetime(2013, 1, 1, 7, 54, 5, 737000), (LogItem('929760', True),)), 5317349: TripClickQlog(re.compile(r'vcxwj5.{10}llhur3g0', flags=48), re.compile(r'921.{4}0c8d', flags=48), 'GORD in children', 
re.compile(r'\(GOR.*dren\)', flags=48), datetime.datetime(2020, 10, 28, 4, 56, 17, 943000), (LogItem('11203043', False), LogItem('11203042', True), LogItem('11203038', False), LogItem('11203041', False), LogItem('9459386', False), LogItem('9007494', False), LogItem('11172888', False), LogItem('9514712', False), LogItem('9159084', False), LogItem('11054069', False), LogItem('9338829', False), LogItem('9338609', False), LogItem('9007349', False), LogItem('9614037', False), LogItem('9184023', False), LogItem('10010303', False), LogItem('11054239', False), LogItem('9616794', False), LogItem('11185267', False), LogItem('11185318', False))), }) def test_qrels(self): self._test_qrels('tripclick/train', count=2705212, items={ 0: TrecQrel('8', '1398048', 1, '0'), 9: TrecQrel('8', '1431742', 1, '0'), 2705211: TrecQrel('1647720', '11698361', 1, '0'), }) self._test_qrels('tripclick/train/head', count=116821, items={ 0: TrecQrel('8', '1398048', 1, '0'), 9: TrecQrel('8', '1431742', 1, '0'), 116820: TrecQrel('1630245', '10818871', 1, '0'), }) self._test_qrels('tripclick/train/head/dctr', count=128420, items={ 0: TrecQrel('8', '1398048', 3, '0'), 9: TrecQrel('8', '5651514', 1, '0'), 128419: TrecQrel('1630245', '9448244', 0, '0'), }) self._test_qrels('tripclick/train/torso', count=966898, items={ 0: TrecQrel('5', '1099235', 1, '0'), 9: TrecQrel('15', '9028026', 0, '0'), 966897: TrecQrel('1647511', '11429892', 1, '0'), }) self._test_qrels('tripclick/train/tail', count=1621493, items={ 0: TrecQrel('1', '981744', 1, '0'), 9: TrecQrel('27', '1194092', 1, '0'), 1621492: TrecQrel('1647720', '11698361', 1, '0'), }) self._test_qrels('tripclick/val', count=82409, items={ 0: TrecQrel('38', '1390633', 1, '0'), 9: TrecQrel('38', '9137657', 0, '0'), 82408: TrecQrel('1645595', '9982749', 1, '0'), }) self._test_qrels('tripclick/val/head', count=64364, items={ 0: TrecQrel('38', '1390633', 1, '0'), 9: TrecQrel('38', '9137657', 0, '0'), 64363: TrecQrel('1630209', '11086242', 1, '0'), }) 
self._test_qrels('tripclick/val/head/dctr', count=66812, items={ 0: TrecQrel('38', '1390633', 2, '0'), 9: TrecQrel('38', '7858667', 0, '0'), 66811: TrecQrel('1630209', '9358372', 0, '0'), }) self._test_qrels('tripclick/val/torso', count=14133, items={ 0: TrecQrel('534', '1397165', 1, '0'), 9: TrecQrel('534', '5671894', 1, '0'), 14132: TrecQrel('1626635', '10258672', 1, '0'), }) self._test_qrels('tripclick/val/tail', count=3912, items={ 0: TrecQrel('1052', '951102', 1, '0'), 9: TrecQrel('9347', '296234', 1, '0'), 3911: TrecQrel('1645595', '9982749', 1, '0'), }) def test_scoreddocs(self): self._test_scoreddocs('tripclick/val', count=3503310, items={ 0: GenericScoredDoc('38', '869893', 10.582), 9: GenericScoredDoc('38', '484662', 9.7492), 3503309: GenericScoredDoc('1645595', '2058354', 6.3192), }) self._test_scoreddocs('tripclick/val/head', count=1166804, items={ 0: GenericScoredDoc('38', '869893', 10.582), 9: GenericScoredDoc('38', '484662', 9.7492), 1166803: GenericScoredDoc('1630209', '1336783', 10.0589), }) self._test_scoreddocs('tripclick/val/head/dctr', count=1166804, items={ 0: GenericScoredDoc('38', '869893', 10.582), 9: GenericScoredDoc('38', '484662', 9.7492), 1166803: GenericScoredDoc('1630209', '1336783', 10.0589), }) self._test_scoreddocs('tripclick/val/torso', count=1170314, items={ 0: GenericScoredDoc('534', '5671894', 10.4897), 9: GenericScoredDoc('534', '11100678', 9.9284), 1170313: GenericScoredDoc('1626635', '11074840', 4.848999), }) self._test_scoreddocs('tripclick/val/tail', count=1166192, items={ 0: GenericScoredDoc('1052', '9086112', 11.7205), 9: GenericScoredDoc('1052', '6298266', 10.1826), 1166191: GenericScoredDoc('1645595', '2058354', 6.3192), }) self._test_scoreddocs('tripclick/test', count=3486402, items={ 0: GenericScoredDoc('24', '11072695', 7.9575), 9: GenericScoredDoc('24', '9124053', 7.802), 3486401: GenericScoredDoc('1646719', '1047809', 4.342899), }) self._test_scoreddocs('tripclick/test/head', count=1159303, items={ 0: 
GenericScoredDoc('24', '11072695', 7.9575), 9: GenericScoredDoc('24', '9124053', 7.802), 1159302: GenericScoredDoc('1610957', '10780205', 9.3906), }) self._test_scoreddocs('tripclick/test/torso', count=1161972, items={ 0: GenericScoredDoc('152', '9083897', 21.2913), 9: GenericScoredDoc('152', '9379499', 20.041401), 1161971: GenericScoredDoc('1641005', '9003321', 9.9069), }) self._test_scoreddocs('tripclick/test/tail', count=1165127, items={ 0: GenericScoredDoc('4752', '1306524', 30.8922), 9: GenericScoredDoc('4752', '472516', 27.3608), 1165126: GenericScoredDoc('1646719', '1047809', 4.342899), }) def test_docpairs(self): self._test_docpairs('tripclick/train', count=23221224, items={ 0: GenericDocPair('338572', '1424623', '725225'), 9: GenericDocPair('1016988', '7785567', '5019636'), 23221223: GenericDocPair('3141', '9337445', '9337479'), }) self._test_docpairs('tripclick/train/hofstaetter-triples', count=10000000, items={ 0: GenericDocPair('1064190', '1435361', '9317735'), 9: GenericDocPair('1053769', '7796753', '10783812'), 9999999: GenericDocPair('1219267', '9260879', '1194457'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/tweets2013_ia.py ================================================ import re import unittest from ir_datasets.datasets.tweets2013_ia import TrecMb13Query, TrecMb14Query, TweetDoc from ir_datasets.formats import TrecQrel, TrecDoc from .base import DatasetIntegrationTest class TestTweets2013Ia(DatasetIntegrationTest): def test_docs(self): self._test_docs('tweets2013-ia', items={ 0: TweetDoc('297237850620035072', re.compile('RT @xoneedirecti.*yBirthdayHarry ♥', flags=16), '361751733', 'Fri Feb 01 07:00:00 +0000 2013', 'vi', None, '297235934536142848', re.compile(b'^\\{"created_at":"Fri Feb 01 07:00:00 \\+0000 
2013","id":297237850620035072,"id_str":"297237850620035072".{4165}92909891,"id_str":"592909891","indices":\\[3,19\\]\\}\\]\\},"favorited":false,"retweeted":false,"lang":"vi"\\}\\\r\n$', flags=16), 'application/json'), 9: TweetDoc('297237850628423681', re.compile('Today stats: 6 new followers and .*tp://t.co/Wkb8Rdcm', flags=16), '240617244', 'Fri Feb 01 07:00:00 +0000 2013', 'en', None, None, re.compile(b'^\\{"created_at":"Fri Feb 01 07:00:00 \\+0000 2013","id":297237850628423681,"id_str":"297237850628423681".{1957}\\}\\],"user_mentions":\\[\\]\\},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"\\}\\\r\n$', flags=16), 'application/json'), 1000: TweetDoc('297237951287537665', re.compile('ドバイでコンドームの宅配サー.*tp://t.co/VRviUIJ1', flags=16), '239760031', 'Fri Feb 01 07:00:24 +0000 2013', 'ja', None, None, re.compile(b'^\\{"created_at":"Fri Feb 01 07:00:24 \\+0000 2013","id":297237951287537665,"id_str":"297237951287537665".{1972}\\}\\],"user_mentions":\\[\\]\\},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"ja"\\}\\\r\n$', flags=16), 'application/json'), }) def test_queries(self): self._test_queries('tweets2013-ia/trec-mb-2013', count=60, items={ 0: TrecMb13Query('111', 'water shortages', 'Fri Mar 29 18:56:02 +0000 2013', '317711766815653888'), 9: TrecMb13Query('120', "Argentina's Inflation", 'Tue Mar 19 15:37:48 +0000 2013', '314038001112076290'), 59: TrecMb13Query('170', 'Tony Mendez', 'Sun Mar 31 14:12:52 +0000 2013', '318365281321881600'), }) self._test_queries('tweets2013-ia/trec-mb-2014', count=55, items={ 0: TrecMb14Query('171', 'Ron Weasley birthday', 'Sat Mar 02 10:43:45 EST 2013', '307878904759201794', "Find tweets regarding the birthday of fictional character Ron Weasley, Harry Potter's sidekick."), 9: TrecMb14Query('180', 'Sherlock Elementary BBC CBS', 'Sun Mar 31 10:21:28 EDT 2013', '318367445586939904', 'Find opinions on either the BBC "Sherlock" series or "Elementary" on CBS, or comparisons of the shows or 
characters.'), 54: TrecMb14Query('225', 'Barbara Walters, chicken pox', 'Tue Mar 12 13:19:59 EDT 2013', '311527001297137664', 'Find information on Barbara Walters having chicken pox and her subsequent\nreturn to the TV show "The View".'), }) def test_qrels(self): self._test_qrels('tweets2013-ia/trec-mb-2013', count=71279, items={ 0: TrecQrel('111', '297136541426397184', 0, 'Q0'), 9: TrecQrel('111', '299374475248537602', 0, 'Q0'), 71278: TrecQrel('170', '317942407385726976', 0, 'Q0'), }) self._test_qrels('tweets2013-ia/trec-mb-2014', count=57985, items={ 0: TrecQrel('171', '305851659194609664', 0, 'Q0'), 9: TrecQrel('171', '304392188215836672', 0, 'Q0'), 57984: TrecQrel('225', '299257357664387072', 0, 'Q0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/vaswani.py ================================================ import re import unittest from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel from .base import DatasetIntegrationTest class TestVaswani(DatasetIntegrationTest): def test_vaswani_docs(self): self._test_docs('vaswani', count=11429, items={ 0: GenericDoc('1', 'compact memories have flexible capacities a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described\n'), 9: GenericDoc('10', 'highspeed microwave switching of semiconductors part\n'), 11428: GenericDoc('11429', re.compile('^pattern detection and recognition both processes have been carried\nout on an ibm computer which was.{56} tested included the recognition process for\nreading handlettered sansserif alphanumeric characters\n$', flags=48)), }) def test_vaswani_queries(self): self._test_queries('vaswani', count=93, items={ 0: GenericQuery('1', 'MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS BY THE USE OF MICROWAVE TECHNIQUES\n'), 9: GenericQuery('10', 'METHODS OF CALCULATING INSTANTANEOUS POWER DISSIPATION IN REACTIVE CIRCUITS\n'), 92: GenericQuery('93', 'HIGH FREQUENCY 
OSCILLATORS USING TRANSISTORS THEORETICAL TREATMENT AND PRACTICAL CIRCUIT DETAILS\n'), }) def test_vaswani_qrels(self): self._test_qrels('vaswani', count=2083, items={ 0: TrecQrel('1', '1239', 1, '0'), 9: TrecQrel('1', '6824', 1, '0'), 2082: TrecQrel('93', '11318', 1, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/wapo.py ================================================ import re import unittest from ir_datasets.datasets.wapo import WapoDoc, WapoDocMedia, TrecBackgroundLinkingQuery from ir_datasets.formats import GenericQrel, GenericQuery, TrecQuery, TrecQrel from .base import DatasetIntegrationTest class TestWapo(DatasetIntegrationTest): def test_docs(self): self._test_docs('wapo/v2', count=595037, items={ 0: WapoDoc('b2e89334-33f9-11e1-825f-dabc29fd7071', 'https://www.washingtonpost.com/sports/colleges/danny-coale-jarrett-boykin-are-a-perfect-1-2-punch-for-virginia-tech/2011/12/31/gIQAAaW4SP_story.html', re.compile('^Danny Coale, Jarrett .* Virginia Tech$', flags=48), 'Mark Giannotto', 1325376562000, 'Colleges', re.compile('^Virginia Tech wide receiver Danny Coale \\(19\\) was lightly recruited out of Episcopal in Alexandria bu.{2838} on him and he’ll be there\\. 
I know when I look back, part of my Tech experience is going to be him\\.”$', flags=48), (re.compile(' Whenever a Virginia Tech offensive coach is asked how the most prolific receiving duo in school history came to be, inevitably the first road game in 2008 against North Carolina comes up.', 'Midway through the first quarter, Virginia Tech had to call two timeouts in a row because then-freshmen Jarrett Boykin and Danny Coale couldn’t seem to line up right, and “they had those big eyes out there looking around,” Kevin Sherman, their position coach, said recently.', 'Now that Boykin and Coale have only Tuesday’s Sugar Bowl remaining before leaving Virginia Tech with every major school record for a wide receiver, they’ve taken a different stance.', '“I still don’t think that was on us. Macho [Harris] was in the game and he lined up wrong,” said Boykin, as Coale sat next to him nodding in agreement.', 'Just add that to the list of slights these seniors have had to overcome.', 'Boykin has been the team’s leading receiver the past three seasons, using hands that need size XXXL gloves and a knack for out-maneuvering opposing cornerbacks in the air to set a single-season school record for receptions this year (57). He will end his career with more catches (180) and yards (2,854) than any other Hokies receiver.', 'Coale, an Episcopal High graduate, leads Virginia Tech with 785 receiving yards this year. He is right behind Boykin in the school record books and became the team’s starting punter by the end of this season. Coach Frank Beamer has frequently marveled how “Danny just always seems to be open.”', 'And yet neither warranted even honorable mention all-ACC status this year, a snub that quarterback Logan Thomas said made him “extremely upset” and left Beamer wondering about the media members who participated in the voting.', 'In retrospect, Boykin said he recognizes the lack of notoriety is partly due to Virginia Tech’s offensive philosophy. 
The Hokies have always been known for their rushing attack, and this year was no different. Running back David Wilson earned ACC player of the year honors during a year when Thomas set multiple records for a first-year quarterback.', '“There’s just some things that we were held back from being able to show,” Boykin said, “that we’re just as good as [South Carolina wide receiver] Alshon Jeffrey and [Oklahoma State wide receiver] Justin Blackmon. I feel like they’re great athletes, but at the same time we’re right up there with them.', '“It’s great playing wide receiver here because once we throw the ball, you have opportunities to get big chunks of yardage. What we can’t do is we’re not going to catch 100 balls for 1,500 yards and 22 touchdowns.”', 'The other issue is that neither has the sort of attention-grabbing personality or pedigree associated with big-time wide receivers these days.', 'Coale has graduated with a degree in finance and was named the ACC’s top scholar-athlete this year. He speaks in measured tones reminiscent of a CEO and has yet to join Facebook or Twitter. Boykin is so quiet around the team facility that Beamer said he sometimes doesn’t notice him until he’s making catches on the practice field or in games.', 'Coming out of high school, Coale was barely recruited. Before showing up to a camp in Blacksburg one summer, his only scholarship offer was from VMI, where his father is the head of strength and conditioning. Coale still jokes that when he spent his redshirt year (2007) on the scout team, former Virginia Tech wide receivers and future NFL wideouts Eddie Royal, Andre Davis and Josh Morgan “must have thought I was a walk-on. I prefer to just fly under the radar.”', 'But their accomplishments haven’t gone unnoticed now that the clock is ticking on their careers. Quarterbacks coach Mike O’Cain said Thomas’s comfort level during his record-setting first year under center is a direct reflection of Boykin and Coale. 
“Not only are they gonna run the right route with the right timing, you know they’re gonna catch the ball,” he said.', 'Years of lining up together has also created a special bond between the two, and it played out before the ACC championship game this year.', 'Boykin was supposed to deliver the pregame speech, but always reticent about public speaking, he was afraid he might stutter and not be taken seriously. He asked Coale to take his place.', '“I’ve been through his struggles, he’s been through mine,” Coale said. “He’s a guy that I know I can count on, whether it’s five years from now, I just know I can count on him and he’ll be there. I know when I look back, part of my Tech experience is going to be him.”'), (WapoDocMedia('image', 'https://img.washingtonpost.com/rw/2010-2019/WashingtonPost/2011/11/12/Sports/Images/Virginia_Tech_Georgia_Tech_Football_0cf3f.jpg', 'Virginia Tech wide receiver Danny Coale (19) was lightly recruited out of Episcopal in Alexandria but has teamed with Jarrett Boykin for a record-breaking receiving duo. (John Bazemore/AP)'),)), 9: WapoDoc('153127ee-341e-11e1-825f-dabc29fd7071', 'https://www.washingtonpost.com/sports/capitals/capitals-vs-blue-jackets-alex-ovechkin-scores-twice-as-washington-wins/2011/12/31/gIQA1deHTP_story.html', 'Capitals vs. Blue Jackets: Alex Ovechkin scores twice as Washington wins', 'Katie Carrera', 1325387427000, 'Capitals/NHL', re.compile("^Columbus goalie Steve Mason makes a save as Grant Clitsome \\(14\\) and Washington's Brooks Laich \\(21\\) l.{3476} the group into making sure it didn’t let the week of progress unravel in one night\\.\n\nCapitals note:$", flags=48), (' As the Washington Capitals headed to their dressing room after 40 minutes of play on Saturday night, some players slammed their sticks in the tunnel while others threw their heads back and stared at the ceiling. 
It looked as though the team’s 2011 might end with a whimper against the worst team in the NHL.', 'A furious third-period comeback instead launched Washington into 2012 with its first three-game winning streak since October.', 'Three goals in the span of 2 minutes 53 seconds erased a two-goal deficit and fueled the Capitals to a 4-2 win over the Columbus Blue Jackets at Nationwide Arena. It marked just the third time this season that Washington has rallied to win a game after trailing entering the third period.', 'The victory improved the Capitals to 5-2-1 in their past eight contests and gave them a much-needed two points to complete a run of three wins in four nights. Washington (20-15-2) holds 42 points but is sitting outside the Eastern Conference playoff picture in ninth place.', '“For a while here we were going win one, win two, lose a couple — we were going back and forth,” defenseman Dennis Wideman said. “We needed to string some together. The top teams are starting to pull away from us and we’re getting into that time of year where you need to crank it up and start running some wins together to try to get back up there were we need to be.”', 'Columbus, which has just 10 wins in 38 games, outworked the Capitals through the first two periods and carried a two-goal advantage into the final period of regulation. No one in the visitors’ dressing room was pleased with the effort. But rather than accept a subpar fate, Washington opted to throw everything it had at the Blue Jackets.', '“We had nothing to lose,” Coach Dale Hunter said. “We were down 2-0, it’s an all-out blitz and the guys did a great job skating, playing hard, never quit, and it’s a credit to them.”', 'Hunter wanted pressure from all areas, with defensemen pinching to help keep offensive play alive. Ovechkin began the onslaught when he recorded his first goal of the night 4:23 into the third. 
Though scored as unassisted, his one-timer from the left faceoff circle that broke Steve Mason’s shutout bid was set up by a slap pass from Wideman.', 'The defenseman set up another tally when he found Alexander Semin, whose wicked wrister at 6:48 evened the score at 2 during four-on-four play while the Capitals’ Dmitry Orlov and Grant Clitsome of Columbus were in the penalty box for matching roughing minors.', 'Twenty-eight seconds later Wideman added a goal of his own, his first in nine games. The blast of a slap shot, which also came during four-on-four play, deflected off a Blue Jacket’s stick to beat Mason (22 saves) and stood as the game-winner.', 'Ovechkin added his second of the night and 16th of the season as insurance with just less than nine minutes remaining in regulation with a slap shot on the power play. The star left wing has seven goals in the past eight games.', '“Everybody was upset how we playing [in the first two periods]. I don’t think we play at all our game,” Ovechkin said. “So we just play hard and again this kind of win that we need and all the momentum on our side.”', 'Until the final period, though, the contest was hardly under the Capitals’ control. The game featured choppy play with turnovers aplenty, an abundance of whistles and more than 34 minutes of scoreless action.', 'Early in the game, the Blue Jackets showed a willingness to win battles for the puck in corners and along the boards, forcing the issue against Washington, which looked beleaguered in its second game in as many nights.', 'As the first period wore on, Columbus (10-23-5) gained some momentum and in the final five minutes of the period peppered netminder Tomas Vokoun (35 saves) with shots from all angles and speeds. 
The veteran goaltender remained composed and helped Washington reach the intermission in a scoreless tie.', 'In the second, what was already a slow-moving contest regressed to more of a crawl as whistles for offsides, high sticks, icings or shots off the netting occurred with stunning frequency. With the play continuing in spurts, the first goal grew in importance, and Columbus got it when a turnover by Jason Chimera turned into a goal by John Moore at 14:47 of the second.', 'Samuel Pahlsson made it 2-0 with just 34.9 seconds remaining in the second, but that tally, which could have set Washington on its heels, galvanized the group into making sure it didn’t let the week of progress unravel in one night.', '“It’s a big win for us. Losing here would have erased the big effort against Buffalo and Rangers earlier this week,” Vokoun said. “We need these points. We’re not sitting in the second or first place where we can say, ‘Well we have a nice comfortable cushion.’ We’re actually outside the playoffs so we need every point we can get.”', 'Capitals note: Mike Green missed a 23rd consecutive contest with a strained right groin muscle. He made the trip with the team, however, and took part in an optional on-ice workout Saturday.'), (WapoDocMedia('image', 'https://img.washingtonpost.com/rw/2010-2019/WashingtonPost/2012/01/01/Production/Daily/Sports/Images/Capitals_Blue_Jackets_Hockey_0ed19.jpg', "Columbus goalie Steve Mason makes a save as Grant Clitsome (14) and Washington's Brooks Laich (21) look for the rebound. (Jay LaPrete/Associated Press)"), WapoDocMedia('image', 'https://img.washingtonpost.com/rw/2010-2019/WashingtonPost/2012/01/01/Production/Daily/Sports/Images/Capitals_Blue_Jackets_Hockey_09796.jpg', "Washington's Dmitry Orlov, and the Blue Jackets' R.J. Umberger (18) fight for a loose puck during the second period. 
(Jay LaPrete/Associated Press)"),)), 728625: WapoDoc('DIB2IVWJNVG5FNEDFUFAFZWLUU', '/opinions/2020/12/29/worst-things-trump-did-2020/', 'The 10 worst things Trump did in 2020', 'Marc Thiessen', None, 'Opinions', re.compile('^\n10\\. He pardoned war criminals\n9\\. He vetoed the bipartisan National Defense Authorization Act\\.\n8\\. He.{624} from Marc A\\. Thiessen:\n\n\nThe 10 worst things Trump did in 2018\nThe 10 best things Trump did in 2018$', flags=48), ('This week, I offer my annual lists of the 10 best and 10 worst things President Trump did this year. Since 2020 was such a horrible year, we’ll start with the worst things first:', '10. He pardoned war criminals. Trump showed a flagrant disregard of the rule of law by pardoning Blackwater contractors who massacred unarmed Iraqi civilians, including innocent women and children.', '9. He vetoed the bipartisan National Defense Authorization Act. Trump vetoed $741 billion in military spending and a 3 percent pay raise for our troops over an unrelated issue, and put Republicans who voted for it in the difficult position of having to choose whether to flip-flop or override his veto.', '8. He ordered the drawdown of nearly all U.S. forces in Afghanistan and Iraq. Trump was apparently talked out of a complete withdrawal, but reducing to 2,500 troops in each country makes no strategic sense. Despite an ongoing terrorist threat, we will have fewer troops in Afghanistan or Iraq than we do in Spain.', '7. He put millions in limbo by threatening to veto coronavirus relief. After Democrats refused multiple GOP offers since July, Congress finally approved an aid package just as much pandemic relief was expiring. But Trump refused to sign the bill for almost a week — forcing millions of Americans to spend Christmas wondering whether they would be left to fend for themselves during the worst of the pandemic.', '6. He failed to ban travel from Europe in January. Trump announced a travel ban on Jan. 31 on non-U.S. 
residents who had recently been in mainland China, saving countless lives. But he did not shut down travel from Europe until March 11 — almost six weeks later — because of objections from his economic advisers. The outbreak in New York was seeded by travelers from Italy, and New York then seeded the rest of the country, becoming the primary source of new infections across the United States.', '5. His jarring fights with reporters during coronavirus briefings alienated rather than united us. Trump proudly compared his press briefings to a Mike Tyson boxing match, but frightened Americans didn’t want a boxing match; they wanted information and reassurance. In mid-March, 50.6 percent approved of Trump’s handling of the pandemic, but by April, he lost the American people — and never recovered.', '4. His reluctance to embrace masks cost lives. His refusal to require masks at his Tulsa rally, the maskless superspreader event at the White House to announce Amy Coney Barrett’s Supreme Court nomination, and the scene of him dramatically removing his mask on the White House balcony after returning from Walter Reed all became symbols of his covid response failures.', '3. He is failing to distribute more than half the available doses of vaccine. His administration is undermining the success of Operation Warp Speed by distributing only about 18 million doses this year when about 40 million will be available — leaving about 22 million Americans without any immunity during the deadliest period since the pandemic began.', '2. He lost a winnable election and then refused to accept the results — or his own responsibility for losing. Trump lost because he alienated millions who approved of his policies but were tired of chaos. His mocking of Joe Biden’s cognitive struggles offended seniors, and their support for Trump declined by five points in Arizona and 11 points in Georgia compared with 2016. And after winning suburban voters by two points in 2016, he lost them by 10 this year. 
If he had performed with these groups the way he did four years ago, no amount of real or imagined fraud could have deprived him of a second term.', '1. He discussed imposing martial law at an Oval Office meeting. The suggestion by Michael Flynn that Trump declare martial law and use the military to re-run the election in swing states is insane. That Trump took it seriously enough to discuss it in the Oval Office is shameful, as are his calls for elected Republicans to overturn the results.', 'Finally, one of the worst things Trump did is not on the list because the results are not yet in: He has barely lifted a finger in Georgia to save Republican control of the Senate. He is so focused on overturning the presidential election that he could very well hand Democrats control of the Senate on Jan. 5 — and with it, unchecked power to reverse his achievements and enact a radical agenda. If that happens, Trump will leave the White House in infamy.', 'That is the worst list of worsts I have compiled in four years. But 2020 also saw some of the greatest accomplishments of Trump’s presidency. Does the good outweigh the bad? Here’s my next column, reviewing the 10 best things Trump did in 2020.', 'Watch Opinions videos: ', 'Read more from Marc A. 
Thiessen:', 'The 10 worst things Trump did in 2019', 'The 10 best things Trump did in 2019', 'The 10 worst things Trump did in 2018', 'The 10 best things Trump did in 2018', 'The 10 worst things Trump did in his first year in office', 'The 10 best things Trump did in his first year in office'), ()), }) def test_queries(self): self._test_queries('wapo/v2/trec-core-2018', count=50, items={ 0: TrecQuery('321', 'Women in Parliaments', 'Pertinent documents will reflect the fact that women continue to be poorly represented in parliaments across the world, and the gap in political power between the sexes is very wide, particularly in the Third World.', 'Pertinent documents relating to this issue will discuss the lack of representation by women, the countries that mandate the inclusion of a certain percentage of women in their legislatures, decreases if any in female representation in legislatures, and those countries in which there is no representation of women.'), 9: TrecQuery('378', 'euro opposition', 'Identify documents that discuss opposition to the use of the euro, the European currency.', 'A relevant document should include the countries or individuals who oppose the use of the euro and the reason(s) for their opposition to its use.'), 49: TrecQuery('825', 'ethanol and food prices', 'Does diversion of U.S. 
corn crops into ethanol for fuel increase food prices?', 'Identify documents that discuss the impact of growing corn with the intention of using it for ethanol fuel on food prices in the U.S.'), }) self._test_queries('wapo/v2/trec-news-2018', count=50, items={ 0: TrecBackgroundLinkingQuery('321', '9171debc316e5e2782e0d2404ca7d09d', 'https://www.washingtonpost.com/news/worldviews/wp/2016/09/01/women-are-half-of-the-world-but-only-22-percent-of-its-parliaments/'), 9: TrecBackgroundLinkingQuery('378', '3c5be31e-24ab-11e5-b621-b55e495e9b78', 'https://www.washingtonpost.com/world/europe/to-greeks-german-offers-of-help-sound-more-like-a-threat/2015/07/07/3c5be31e-24ab-11e5-b621-b55e495e9b78_story.html'), 49: TrecBackgroundLinkingQuery('825', 'a1c41a70-35c7-11e3-8a0e-4e2cf80831fc', 'https://www.washingtonpost.com/business/economy/cellulosic-ethanol-once-the-way-of-the-future-is-off-to-a-delayed-boisterous-start/2013/11/08/a1c41a70-35c7-11e3-8a0e-4e2cf80831fc_story.html'), }) self._test_queries('wapo/v2/trec-news-2019', count=60, items={ 0: TrecBackgroundLinkingQuery('826', '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa', 'https://www.washingtonpost.com/sports/nationals/the-minor-leagues-life-in-pro-baseballs-shadowy-corner/2016/08/26/96ab542e-6a07-11e6-ba32-5a4bf5aad4fa_story.html'), 9: TrecBackgroundLinkingQuery('835', 'c0c4e2d0-628f-11e7-a4f7-af34fc1d9d39', 'https://www.washingtonpost.com/local/social-issues/a-healthy-mystery-over-attending-houses-of-worship/2017/07/07/c0c4e2d0-628f-11e7-a4f7-af34fc1d9d39_story.html'), 59: TrecBackgroundLinkingQuery('885', '5ae44bfd66a49bcad7b55b29b55d63b6', 'https://www.washingtonpost.com/news/capital-weather-gang/wp/2017/07/14/sun-erupts-to-mark-another-bastille-day-aurora-possible-in-new-england-sunday-night/'), }) self._test_queries('wapo/v3/trec-news-2020', count=50, items={ 0: TrecBackgroundLinkingQuery('886', 'AEQZNZSVT5BGPPUTTJO7SNMOLE', 
'https://www.washingtonpost.com/politics/2019/06/05/trump-says-transgender-troops-cant-serve-because-troops-cant-take-any-drugs-hes-wrong-many-ways/'), 9: TrecBackgroundLinkingQuery('895', '615f0d53ac8f1e05c51bfebf4fdaf0e5', 'https://www.washingtonpost.comhttps://www.washingtonpost.com/news/to-your-health/wp/2015/04/09/sabra-pulls-30000-cases-of-hummus-off-store-shelves-due-to-listeria-fears/'), 49: TrecBackgroundLinkingQuery('935', 'CCUJNXOJNFEJFBL57GD27EHMWI', 'https://www.washingtonpost.com/news/to-your-health/wp/2018/05/30/this-mock-pandemic-killed-150-million-people-next-time-it-might-not-be-a-drill/'), }) def test_qrels(self): self._test_qrels('wapo/v2/trec-core-2018', count=26233, items={ 0: TrecQrel('321', '004c6120d0aa69da29cc045da0562168', 0, '0'), 9: TrecQrel('321', '01664d72845d37c958a504b9b4085883', 0, '0'), 26232: TrecQrel('825', 'ff3a25b0-0ba4-11e4-8341-b8072b1e7348', 0, '0'), }) self._test_qrels('wapo/v2/trec-news-2018', count=8508, items={ 0: TrecQrel('321', '00f57310e5c8ec7833d6756ba637332e', 16, '0'), 9: TrecQrel('321', '09b3167f0d1aa5cfa8be932bb704d75a', 8, '0'), 8507: TrecQrel('825', 'f66b624ba8689d704872fa776fb52860', 0, '0'), }) self._test_qrels('wapo/v2/trec-news-2019', count=15655, items={ 0: TrecQrel('826', '0154349511cd8c49ab862d6cb0d8f6a8', 2, '0'), 9: TrecQrel('826', '054be3904bde907f71d684b268e2273d', 0, '0'), 15654: TrecQrel('885', 'fde80cb0-b4f0-11e2-bbf2-a6f9e9d79e19', 0, '0'), }) self._test_qrels('wapo/v3/trec-news-2020', count=17764, items={ 0: TrecQrel('886', '00183d98-741b-11e5-8248-98e0f5a2e830', 0, '0'), 9: TrecQrel('886', '03c3c222-0e01-11e4-8c9a-923ecc0c7d23', 0, '0'), 17763: TrecQrel('935', 'ff0a760128ecdbcc096cafc8cd553255', 0, '0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/wikiclir.py ================================================ import re import unittest from ir_datasets.datasets.wikiclir import WikiClirDoc, WikiClirQuery from 
ir_datasets.formats import TrecQrel from .base import DatasetIntegrationTest class TestWikiclir(DatasetIntegrationTest): def test_docs(self): self._test_docs('wikiclir/ar', count=535118, items={ 0: WikiClirDoc('7', 'ماء', re.compile('^ الماء هو سائل شفاف لا لون لهُ ولا رائحة، ويوجد في الكرة الأرضية في المسطّحات المائيّة من الجداول وا.{959}يّة في المناطق القطبيّة، في حين تتواجد 0\\.3 % من الماء العذب في الأنهار والبحيرات وفي الغلاف الجوّي \\.$', flags=48)), 9: WikiClirDoc('90', 'عنكبوت', re.compile('^ الرُتَيْلاوات رتبة من صف العنكبيات، وهي أكبر رتبة في هذا الصف، إذا تشمل أكثر من 40,000 نوع في 3700 .{1001}نها تقضي معظم وقتها في اصطياد الحشرات والفتك بها \\. فلولاها لتكاثرت الحشرات وأتت على الأخضر واليابس \\.$', flags=48)), 535117: WikiClirDoc('3769734', 'ريجينا سنيفر', re.compile("^ ريجينا سنيفرهي كاتبة ولدت في لبنان في عام 1962 \\. كتبت العديد من الكتب، ونشر آخرها في عام 2013، '' ب.{341} الحرب '' في لبنان، كتاب ترجم إلى اللغة العربية وحرره الفارابي في يوليو / تموز 2008 وقبله جورج قرم \\.$", flags=48)), }) self._test_docs('wikiclir/ca', count=548722, items={ 0: WikiClirDoc('1', 'Àbac', re.compile("^àbac l'àbac \\( del llatí `` abăcus '' , i grec άβαξ\\-ακος , que significa `` taula '' \\) és una eina pe.{962}e napier i permeten llegir directament el resultat de la multiplicació sense fer sumes intermèdies \\.$", flags=48)), 9: WikiClirDoc('18', 'Aeròbic', re.compile("^ laeròbic és una modalitat de gimnàstica sueca amb acompanyament musical que consisteix en una sèrie.{931} la combinació d'exercicis aeròbics amb tonificació dels músculs , també anomenat `` body power \\. ''$", flags=48)), 548721: WikiClirDoc('1514683', 'Bepink-Cogeas', re.compile('^ el bepink\\-cogeas \\( codi uci : bpk \\) és un equip ciclista femení italià \\. 
creat al 2012 , té categor.{367}\\- web oficial \\- plantilles i resultats a cyclebase\\.nl \\- plantilles i resultats a procyclingstats\\.com$', flags=48)), }) self._test_docs('wikiclir/zh', count=951480, items={ 0: WikiClirDoc('13', '数学', re.compile('^ 数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科,从某种角度看属于形式科学的一种。数学透过抽象化和逻辑推理的使用,由计数、计算、量度和对物体形状及运动的观察而产生。数学家们拓展这些概念.{939}问 ” )。 史前的人类就已尝试用自然的法则来衡量物质的多少、时间的长短等抽象的数量关系,比如时间单位有日、季节和年等。算术(加减乘除)也自然而然地产生了。古代的石碑及泥版亦证实了当时已有几何的知识。$', flags=48)), 9: WikiClirDoc('53', '经济学', re.compile('^ 经济学是一门对产品和服务的生产、分配以及消费进行研究的社会科学。西方语言中的 “ 经济学 ” 一词源于古希腊的。起初这一领域被称为政治经济学,但19世纪经济学家採用简短的「经济学」一词来代表「经济科.{881}过度广泛,而且无法将分析的范围侷限在对于市场的研究上。然而,自从1960年代起,由于理性选择理论和其引发的赛局理论不断将经济学的研究领域扩张,这个定义已经获得广泛认同,尽管仍有一些对此定义的批评存在。$', flags=48)), 951479: WikiClirDoc('5795145', '族群', re.compile('^ 族群(),是指一群人,他们认为彼此共享了相同的祖先、血缘、外貌、历史、文化、习俗、语言、地域、宗教、生活习惯与国家体验等,因此形成一个共同的群体。为区分我族及「他者」的分类方式之一。族群含义在20世.{689}或方言分类,如客家人、闽南人。 宗教层面 \\. \\- 按宗教信仰对人群分类,如信仰伊斯兰教的群体穆斯林。 参见 \\. \\- 国民(nation) \\- 人种(race) \\- 民系 \\- 氏族 \\- 部落 \\- 原住民$', flags=48)), }) self._test_docs('wikiclir/cs', count=386906, items={ 0: WikiClirDoc('10', 'Astronomie', re.compile('^ astronomie , řecky αστρονομία z άστρον \\( astron \\) hvězda a νόμος \\( nomos \\) zákon , česky též hvězdá.{845} , například planet ve sluneční soustavě \\. základem nebeské mechaniky jsou práce keplera a newtona \\.$', flags=48)), 9: WikiClirDoc('21', 'Matematika', re.compile("^ matematika \\( z řeckého \\( `` mathematikós '' \\) = `` milující poznání '' ; \\( `` máthema '' \\) = `` věd.{1089}kem v reálném světě \\. některé obory čisté matematiky se nacházejí na pomezí s logikou či filozofií \\.$", flags=48)), 386905: WikiClirDoc('1319204', 'Helena Rubinsteinová', re.compile("^ helena rubinsteinová , rodným jménem chaja rubinsteinová \\( 25\\. prosince 1872 , krakov – 1\\. dubna 19.{1121}aci `` helena rubinstein foundation '' , kterou založila \\. 
byla sestřenicí filozofa martina bubera \\.$", flags=48)), }) self._test_docs('wikiclir/nl', count=1908260, items={ 0: WikiClirDoc('1', 'Albert Speer', re.compile('^ berthold konrad hermann albert speer \\( mannheim , 19 maart 1905 – londen , 1 september 1981 \\) was e.{1195}ald in 1927 bleef speer nog meerdere jaren , als tessenows assistent , aan de hogeschool verbonden \\.$', flags=48)), 9: WikiClirDoc('13', 'Astronomie', re.compile('^ astronomie of sterrenkunde is de wetenschap die zich bezighoudt met de observatie en de studie van .{949}eten bijvoorbeeld zijn meestal ontleend aan amateurastronomen die deze komeet als eerste waarnamen \\.$', flags=48)), 1908259: WikiClirDoc('4848272', 'Karen Briggs (judoka)', re.compile('^karen briggs \\( judoka \\) karen valerie briggs \\( kingston upon hull , 11 april 1963 \\) is een voormalig.{884}land \\( – 48 kg \\) \\- – 1991 praag , tsjecho\\-slowakije \\( – 48 kg \\) \\- – 1981 madrid , spanje \\( – 48 kg \\)$', flags=48)), }) self._test_docs('wikiclir/fi', count=418677, items={ 0: WikiClirDoc('1', 'Amsterdam', re.compile('^ amsterdam on alankomaiden pääkaupunki \\. amsterdam on väkiluvultaan alankomaiden suurin kaupunki , h.{1173}voi , kun se pystyi viemään yhä useampaa tuotetta muualle eurooppaan vapaasti , esimerkiksi olutta \\.$', flags=48)), 9: WikiClirDoc('14', 'Aleksis Kivi', re.compile('^ aleksis kivi \\( oikealta nimeltään alexis stenvall \\) \\( 10\\. lokakuuta 1834 nurmijärvi – 31\\. joulukuut.{1117}johan stenvall oli merimies \\. kirjailijan oma isä erik stenvall oli asunut lapsuutensa helsingissä \\.$', flags=48)), 418676: WikiClirDoc('1401493', 'Jordan Rowley', ' jordan rowley ( s. 3. 
huhtikuuta 1990 edmonton ) on jääkiekkoilija , joka pelaa lahden pelicansissa .'), }) self._test_docs('wikiclir/fr', count=1894397, items={ 0: WikiClirDoc('3', 'Antoine Meillet', re.compile("^ paul jules antoine meillet , né le à moulins \\( allier \\) et mort le à châteaumeillant \\( cher \\) , est.{884}a au linguiste auguste carrière à la tête de la chaire d'arménien à l'école des langues orientales \\.$", flags=48)), 9: WikiClirDoc('19', 'Algorithme', re.compile("^ un algorithme est une suite finie et non ambiguë d ’ opérations ou d'instructions permettant de rés.{1688}nécessaire pour amener un algorithme à son terme , en fonction de la quantité de données à traiter \\.$", flags=48)), 1894396: WikiClirDoc('11055655', 'Elisabeth Maxwell', re.compile("^ elisabeth `` betty '' maxwell , née meynard , née le et morte le , est une historienne d'origine fr.{866}mpire de la presse \\. elle donne naissance à 9 enfants , mais deux d ’ entre eux meurent en bas\\-âge \\.$", flags=48)), }) self._test_docs('wikiclir/de', count=2091278, items={ 0: WikiClirDoc('1', 'Alan Smithee', re.compile("^ alan smithee steht als pseudonym für einen fiktiven regisseur , der filme verantwortet , bei denen .{1223}ariante `` alan smithee '' war das anagramm `` the alias men '' vermutlich kein entstehungsgrund \\) \\.$", flags=48)), 9: WikiClirDoc('17', 'Liste von Autoren/K', ' kh . - yasmina khadra ( * 1955 )'), 2091277: WikiClirDoc('10015849', 'Soli (BiH)', re.compile('^soli \\( bih \\) soli war ein bosnisches gebiet \\( oblast \\) und eine gespanschaft im mittelalter \\. 
das ze.{1109} teil des sandžaks zvornik \\( zvornički sandžak \\) und des kadiluk srebrenik \\( srebrenički kadiluk \\) \\.$', flags=48)), }) self._test_docs('wikiclir/it', count=1347011, items={ 0: WikiClirDoc('2', 'Armonium', re.compile("^ l'armonium o armonio \\( in francese , `` harmonium '' \\) è un tipo di organo costituito da una tastie.{1119}quella più alta e , rispettivamente , svolgono l'azione verso l'ottava bassa e verso l'ottava alta \\.$", flags=48)), 9: WikiClirDoc('20', 'Abbie Hoffman', re.compile('^ di origini ebraiche , dotato di una personalità sardonica e vulcanica , di orientamento anarchico e.{937}ers \\( degli attori diventati attivisti sociali \\) , distribuendo cibo gratis e organizzando alloggi \\.$', flags=48)), 1347010: WikiClirDoc('6494686', 'Superflat', re.compile("^ il `` superflat '' è un movimento artistico postmoderno , influenzato dai manga e dagli anime , fon.{959}e boy : the arts of japan ’ s exploding subculture \\. new york : japan society \\. isbn 0\\-913304\\-57\\-3 \\.$", flags=48)), }) self._test_docs('wikiclir/ja', count=1071292, items={ 0: WikiClirDoc('5', 'アンパサンド', re.compile("^ アンパサンド \\( , \\& \\) とは「…と…」を意味する記号である。ラテン語の の合字で、trebuchet msフォントでは、と表示され `` et '' の合字であることが容易にわかる。amper.{874}グル \\) (アンド)は、浜崎あゆみが2003年に発売した4曲入りマキシシングル。 \\- \\& \\( 一青窈のアルバム \\) (アンド)は、一青窈が2005年に発売したアルバム、及び同アルバムに収録された楽曲。$", flags=48)), 9: WikiClirDoc('43', 'コケ植物', re.compile('^ コケ植物(コケしょくぶつ、)とは、陸上植物かつ非維管束植物であるような植物の総称、もしくはそこに含まれる植物のこと。コケ類(コケるい)や蘚苔類(せんたいるい)、蘚苔植物(せんたいしょくぶつ)などとも.{861}糸状の原糸体(げんしたい、protonema)というものを形成する。原糸体は葉緑体をもち、基質表面に伸びた後、その上に植物体が発達を始め配偶体となる。なお、一部に生涯にわたって原糸体を持つものがある。$', flags=48)), 1071291: WikiClirDoc('3641139', '若杉明', re.compile('^ 若杉明(わかすぎ あきら、1929年11月19日\\- \\) は、日本の会計学者、横浜国立大学名誉教授。 横須賀市出身。1958年東京大学大学院経済学会計学博士課程満期退学。68年「実現概念の展開 その会.{814} \\& aの財務・会計戦略』編著 ビジネス教育出版社 1989 \\- 『ソフト化社会と会計』編著 ビジネス教育出版社 1989 \\- 『リストラクチャリングの財務・会計戦略』編 ビジネス教育出版社 1991$', flags=48)), }) self._test_docs('wikiclir/ko', count=394177, items={ 0: 
WikiClirDoc('5', '지미 카터', re.compile("^ 제임스 얼 `` 지미 '' 카터 주니어 \\( , 1924년 10월 1일 \\~ \\) 는 민주당 출신 미국 39번째 대통령 \\( 1977년 \\~ 1981년 \\) 이다 \\. 지미 카터는 조지아 주.{1114}신 6,000명을 감축하는 데 그쳤다 \\. 또한 박정희 정권의 인권 문제 등과의 논란으로 불협화음을 냈으나 , 1979년 6월 하순 , 대한민국을 방문하여 관계가 다소 회복되었다 \\.$", flags=48)), 9: WikiClirDoc('31', '음계', re.compile('^ 음계 \\( 音階 \\) 는 음악에서 음높이 \\( pitch \\) 순서로 된 음의 집합을 말한다 \\. 악곡을 주로 구성하는 음을 나타낸 것이며 음계의 종류에 따라 곡의 분위기가 달라진다 \\. .{888} 變徵 \\) \\-올림화 \\( fa \\) ·치\\-솔·우\\-라·변궁 \\( 變宮 \\) \\-시로 7음계를 많이 쓴다 \\. 한국 전통 음악에서는 5음계 외에도 3음계 또는 악계통에서는 7음계 등이 쓰인다 \\.$', flags=48)), 394176: WikiClirDoc('1824675', '안세호', re.compile('^ 안세호 \\( 1981년 2월 17일 \\~ \\) 는 대한민국의 배우이다 \\. 학력 \\. \\- 대진대학교 연극영화학부 출연작 \\. 영화 \\. \\- 《골든슬럼버》 \\( 2017년 \\) \\- 《군함도》 \\( .{445}링》 \\( 2005년 \\) \\- 찌라시청년 역 드라마 \\. \\- 《나의 판타스틱한 장례식》 \\( 2015년 , sbs \\) \\- 동수네 작업반 반장 역 \\- 《삼총사》 \\( 2014년 , tvn \\)$', flags=48)), }) self._test_docs('wikiclir/no', count=471420, items={ 0: WikiClirDoc('2', 'Akershus', re.compile("^ akershus \\( fra norrønt `` akr '' , åker , og `` hús '' , borg eller kastell \\) er et norsk fylke , s.{1272}lsen fylke \\. i 1948 ble aker herred overført fra akershus til å bli en del av oslo \\( by og fylke \\) \\.$", flags=48)), 9: WikiClirDoc('18', 'Atalanta BC', re.compile('^ atalanta bergamasca calcio er en italiensk fotballklubb \\. den ble grunnlagt i 1907 i byen bergamo i.{383} 1978–79 \\) \\- glenn strömberg \\( 1984–85 \\) \\- filippo inzaghi \\( 1996–97 \\) \\- christian vieri \\( 2006–07 \\)$', flags=48)), 471419: WikiClirDoc('1521098', 'VM i vektløfting 1910', ' vm i vektløfting 1910 ( verdensmesterskapet i vektløfting ) ble arrangert i düsseldorf og wien i to forskjellige turneringer i 1910 .'), }) self._test_docs('wikiclir/nn', count=133290, items={ 0: WikiClirDoc('1', 'Hovudside', ' __ingainnhaldsliste__ __ingabolkredigering__'), 9: WikiClirDoc('28', 'Jødedommen', re.compile('^ jødedommen er den religiøse kulturen åt det jødiske folket \\. 
han er ein av dei først dokumenterte m.{1106} utøvinga av desse lovane og boda slik dei blir tolka av dei ulike antikke og moderne autoritetane \\.$', flags=48)), 133289: WikiClirDoc('341641', 'Harry Danielsen', re.compile('^ harry danielsen var ein norsk skulemann og politikar frå rødøy i nordland \\. han representerte nordl.{883} då medlem i forbrukar\\- og administrasjonskomitéen \\. sommaren 1987 melde danielsen seg ut av høgre \\.$', flags=48)), }) self._test_docs('wikiclir/pl', count=1234316, items={ 0: WikiClirDoc('2', 'AWK', re.compile('^ awk – interpretowany język programowania , którego główną funkcją jest wyszukiwanie i przetwarzanie.{895}dice_2 \\) i wartości , które mają być udostępnione w predefiniowanych zmiennych codice_3 i codice_4 \\.$', flags=48)), 9: WikiClirDoc('15', 'AmigaOS', re.compile('^ amigaos – system operacyjny opracowany przez firmę commodore international dla produkowanych przez .{942}ta implementacja systemu amigaos pod nazwą aros \\. dostępna jest ona między innymi na platformę x86 \\.$', flags=48)), 1234315: WikiClirDoc('4059443', 'Sóweczka ekwadorska', re.compile("^ sóweczka ekwadorska \\( `` glaucidium nubicola '' \\) – gatunek małego ptaka z rodziny puszczykowatych .{1006}żej spokrewnione są z dwuplamistymi \\( `` g\\. gnoma '' \\) i kostarykańskimi \\( `` g\\. 
costaricanum '' \\) \\.$", flags=48)), }) self._test_docs('wikiclir/pt', count=973057, items={ 0: WikiClirDoc('220', 'Astronomia', re.compile("^ astronomia é uma ciência natural que estuda corpos celestes \\( como estrelas , planetas , cometas , .{963}vium '' que , junto com o `` trivium '' , compunha a metodologia de ensino das sete artes liberais \\.$", flags=48)), 9: WikiClirDoc('235', 'Lista de padrões de arquivo gráfico', re.compile('^ \\- amiga interchange file format \\( iff \\) \\- adobe photoshop image \\( psd \\) \\- compuserv graphics interc.{293}f/tiff \\) \\- truevision targa \\( tga \\) \\- windows and os/2 bitmap \\( bmp/dib \\) \\- zsoft paintbrush \\( pcx \\)$', flags=48)), 973056: WikiClirDoc('5499216', 'Chaudhry Muhammad Ali', re.compile('^ chaudhry mohammad ali \\( punjabi , urdu : چوہدری محمد علی\u200e ; 15 de julho de 1905 – 2 de dezembro de .{189}e estado em 1958\\. ao longo da sua carreira , foi também ministro das finanças e ministro da defesa \\.$', flags=48)), }) self._test_docs('wikiclir/ro', count=376655, items={ 0: WikiClirDoc('1', 'Rocarta', re.compile('^ rocarta este o enciclopedie în format electronic care conține articole legate de românia , republic.{1032}imilare tipărite , precum și albume de imagini în care s\\-au investit mai mulți sau mai puțini bani \\.$', flags=48)), 9: WikiClirDoc('24', 'Romania (dezambiguizare)', re.compile('^romania \\( dezambiguizare \\) romania , în această grafie , se poate referi la : \\- capul , mina de baux.{948}ui roman de răsărit în secolele vi și vii , când restul italiei trecuse sub stăpânirea lombarzilor \\.$', flags=48)), 376654: WikiClirDoc('2013894', 'Rezonanță (chimie)', re.compile('^rezonanță \\( chimie \\) în chimie , rezonanța sau mezomeria face referire la oscilarea structurii chimi.{805}r , și nu prin poziția nucleelor \\. vezi și \\. 
\\- aromaticitate \\- tautomerie \\- delocalizare electronică$', flags=48)), }) self._test_docs('wikiclir/ru', count=1413945, items={ 0: WikiClirDoc('7', 'Литва', re.compile('^литва литва́ \\( \\) , официальное название — лито́вская респу́блика \\( \\) — государство , географически р.{867}олм аукштояс \\( \\) \\( или аукштасис калнас \\( \\) \\) в юго\\-восточной части страны , в 23,5 км от вильнюса \\.$', flags=48)), 9: WikiClirDoc('27', 'Киевская Русь', re.compile("^киевская русь ки́евская русь , древнеру́сское госуда́рство , дре́вняя русь \\( ' , ' , , др\\.\\-сканд \\. `.{909}ической дезинтеграции , что впоследствии сыграло важную роль в процессе объединения русских земель \\.$", flags=48)), 1413944: WikiClirDoc('7070375', 'Перекрёстки (телесериал, 1994)', re.compile('^перекрёстки \\( телесериал , 1994 \\) перекрёстки \\( \\) — мексиканский 68 серийный телесериал 1994 года те.{840}chir \\- orlando soles \\- isabel andrade \\- celia álvarez de soles \\- héctor cruz lara \\- reynaldo álvarez$', flags=48)), }) self._test_docs('wikiclir/en-simple', count=127089, items={ 0: WikiClirDoc('1', 'April', re.compile('^ april is the fourth month of the year , and comes between march and may \\. it is one of four months .{1212}mmediately after that , april finishes on the same day of the week as january of the previous year \\.$', flags=48)), 9: WikiClirDoc('18', 'Andouille', re.compile('^ andouille is a type of pork sausage \\. it is spicy \\( hot in taste \\) and smoked \\. there are different.{560}ane for a maximum of seven or eight hours , at about 175 degrees fahrenheit \\( 80 degrees celsius \\) \\.$', flags=48)), 127088: WikiClirDoc('594702', 'Digital video', re.compile('^ digital video is a representation of moving visual images in the form of encoded \\. 
this is in contr.{831}include hdmi , displayport , digital visual interface \\( dvi \\) and serial digital interface \\( sdi \\) \\.$', flags=48)), }) self._test_docs('wikiclir/es', count=1302958, items={ 0: WikiClirDoc('7', 'Andorra', re.compile('^ andorra , oficialmente principado de andorra \\( \\) , es un pequeño país soberano del suroeste de euro.{831}a oficial es el catalán que convive con el español y en menor medida con el francés y el portugués \\.$', flags=48)), 9: WikiClirDoc('24', 'Arquitectura', re.compile("^ la arquitectura es el arte y la técnica de proyectar , diseñar , construir y modificar el hábitat h.{1568}ángulos y llevada a término por una mente y una inteligencia culta '' '' \\( del lib \\. i , cap \\. i \\) \\.$", flags=48)), 1302957: WikiClirDoc('8045476', 'Inés Márquez Moreno', re.compile('^ poetisa cuencana nacida el 7 de junio de 1916 , hija del dr\\. ricardo márquez tapia y de la sra \\. ro.{1166}o vega , humberto mata , enrique noboa arízaga , rigoberto cordero león y jacinto cordero espinoza \\.$', flags=48)), }) self._test_docs('wikiclir/sw', count=37079, items={ 0: WikiClirDoc('2', 'Akiolojia', re.compile("^ akiolojia \\( kutoka kiyunani αρχαίος = `` zamani '' na λόγος = `` neno , usemi '' \\) ni somo linalohu.{1003}i ya kiroma , lakini mji uliharibika kabisa na kufunikwa na majivu ya volkeno vesuvio mwaka 79 b\\.k \\.$", flags=48)), 9: WikiClirDoc('33', 'Lugha asilia', re.compile("^ 1\\. 
lugha asilia ni lugha ambayo ilikua kama sehemu ya utamaduni wa umma fulani ambao watu wake wana.{131} kuzungumzwa na watu , na lugha za kompyuta na za kuandaa programu zinaitwa `` lugha za kuundwa '' \\.$", flags=48)), 37078: WikiClirDoc('92114', 'Kaptura', re.compile('^ kaptura ni vazi lililovaliwa na wanaume na wanawake juu ya eneo la pelvic yao , wakizunguka kiuno n.{272}joto au katika mazingira ambapo faraja na mtiririko wa hewa ni muhimu zaidi kuliko ulinzi wa miguu \\.$', flags=48)), }) self._test_docs('wikiclir/sv', count=3785412, items={ 0: WikiClirDoc('1', 'Amager', re.compile('^ amager är en dansk ö i öresund \\. öns norra och västra delar tillhör köpenhamn , medan övriga delar .{1153}i många köpenhamnsbors ögon , men inställningen håller på att ändras i takt med stigande huspriser \\.$', flags=48)), 9: WikiClirDoc('12', '1 april', re.compile('^ 1 april är den 91 : a dagen på året i den gregorianska kalendern \\( 92 : a under skottår \\) \\. det åte.{923}en fransk abbot från 1100\\-talet , på dagens datum före 1747 , då det utgick till förmån för harald \\.$', flags=48)), 3785411: WikiClirDoc('8048978', 'Surçina', ' surçina ( albanska : surçina , serbiska : svrčina ) är en by i kosovo . den ligger i kommunen ferizaj . enligt den senaste folkräkningen år 2011 fanns det 222 invånare .'), }) self._test_docs('wikiclir/tl', count=79008, items={ 0: WikiClirDoc('5', 'Wikipedia', re.compile('^ ang wikipedia ay isang ensiklopedya na may basehang wiki at may malayang nilalaman \\. ito ay tinataw.{845}l , at apache \\) \\. 
ang mga kalahok sa wikipediang sumusunod , at pinagtitibay , ang ilang patakaran \\.$', flags=48)), 9: WikiClirDoc('603', 'Astronomiya', re.compile("^ ang dalubtalaan \\( astronomiya \\) ay isang agham na kinapapalooban ng pagmamasid at pagpapaliwanag ng.{963}' = `` astron '' \\+ `` nomos '' , na mayroong literal na kahulugang `` '' batas ng mga bituin '' '' \\.$", flags=48)), 79007: WikiClirDoc('267691', 'Nao Iwadate', ' si nao iwadate ( ipinaganak agosto 17 , 1988 ) ay isang manlalaro ng putbol sa hapon .'), }) self._test_docs('wikiclir/tr', count=295593, items={ 0: WikiClirDoc('10', 'Cengiz Han', re.compile("^ cengiz han \\( `` cenghis khan '' , `` çinggis haan '' ya da doğum adıyla temuçin \\( anlamı : demirci .{1648}a çağırmış ve moğolca için uygur alfabesini uyarlatarak bunu çocuklarına da öğretmesini istemiştir \\.$", flags=48)), 9: WikiClirDoc('40', 'Beşiktaş JK', re.compile("^ beşiktaş jimnastik kulübü , 1903 yılında istanbul'da kurulan spor kulübüdür \\. bereket jimnastik kul.{864}ndan biridir \\. 
armasında türk bayrağı amblemi taşıma hakkını elde etmiş az sayıda takımdan biridir \\.$", flags=48)), 295592: WikiClirDoc('2268203', 'Prachatice İlçesi', re.compile("^prachatice ilçesi prachatice ilçesi , çek cumhuriyeti'nin güney bohemya bölgesinde bulunan ilçedir \\..{707}zí \\- volary \\- vrbice \\- záblatí \\- zábrdí \\- zálezly \\- zbytiny \\- zdíkov \\- žárovná \\- želnava \\- žernovice$", flags=48)), }) self._test_docs('wikiclir/uk', count=704903, items={ 0: WikiClirDoc('3', 'Головна сторінка', 'головна сторінка'), 9: WikiClirDoc('592', 'Біологія', re.compile('^біологія біоло́гія \\( — життя , — слово ; наука \\) — система наук , що вивчає життя в усіх його проява.{1135}авляють самостійні дисципліни — анатомія , фізіологія , гістологія , біохімія , мікробіологія тощо \\.$', flags=48)), 704902: WikiClirDoc('2485891', 'Хліб, любов і фантазія', re.compile("^хліб , любов і фантазія « хліб .*, що не любить чужої жалості , розриває її .$", flags=48)), }) self._test_docs('wikiclir/vi', count=1392152, items={ 0: WikiClirDoc('4', 'Internet Society', re.compile("^ internet society hay isoc là một tổ chức quốc tế hoạt động phi lợi nhuận , phi chính phủ và bao gồm.{820}d the internet society '' \\- về internet engineering task force và isoc , bài của vint cerf 18/6/1995$", flags=48)), 9: WikiClirDoc('56', 'Lào', re.compile("^ lào \\( , , `` lāo '' \\) , tên chính thức là nước cộng hoà dân chủ nhân dân lào , \\( tiếng lào : ສາທາລະ.{922} kết quả là chấm dứt chế độ quân chủ , phong trào pathet lào theo chủ nghĩa cộng sản lên nắm quyền \\.$", flags=48)), 1392151: WikiClirDoc('6111969', 'Sơn lục đậu', re.compile('^ còn gọi là vọng giang nam , cốt khí mồng , dương giác đậu , giang nam đậu , thạch quyết minh , dã b.{875} ta dùng toàn bộ cây , hay chỉ lá , hái hạt về phơi khô \\. 
ở việt nam người ta chưa chú ý khai thác \\.$', flags=48)), }) def test_queries(self): self._test_queries('wikiclir/ar', count=324489, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 324488: WikiClirQuery('54964051', 'Tal Afar offensive (2017)', 'the is an ongoing announced on 20 august 2017 by iraqi prime minister haider al-abadi in order to liberate the region from the islamic state of iraq and the levant (isil).'), }) self._test_queries('wikiclir/ca', count=339586, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 339585: WikiClirQuery('54965687', 'Karl Heinrich Gräffe', '(1799-1873) was a german mathematician, who was professor at university of zurich.'), }) self._test_queries('wikiclir/zh', count=463273, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 463272: WikiClirQuery('54967133', 'United Nations Security Council Resolution 2371', 'the unanimously adopted on august 5, 2017, with approval of all the five permanent members and the ten non-permanent members in response to north korea’s july 2017 missile tests.'), }) self._test_queries('wikiclir/cs', count=233553, items={ 0: WikiClirQuery('12',
        # NOTE(review): test_queries spot-checks each wikiclir/<lang> subset at item
        # indices 0, 9, and count-1 (fixture data; inserted mid-call, legal inside parens).
'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('334', 'International Atomic Time', "(tai, from the french name ) is a high-precision coordinate standard based on the notional passage of proper on earth's geoid."), 233552: WikiClirQuery('54961893', 'Vincenzo Legrenzio Ciampi', '(piacenza, 2 april 1719 – venice, 30 march 1762) was an italian composer.'), }) self._test_queries('wikiclir/nl', count=687718, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 687717: WikiClirQuery('54967572', 'SV Marken', 'sportvereniging (dutch for "sport club marken", commonly shortened to cv marken, or just marken) is an association football club from marken, netherlands.'), }) self._test_queries('wikiclir/fi', count=273819, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 273818: WikiClirQuery('54966570', 'Nadezhda Babkina', 'georgieva (; born 19 march, 1950, chyorny yar, astrakhan oblast, soviet union) is а soviet and russian folk and pop singer.'), }) self._test_queries('wikiclir/fr', count=1089179, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in 
film.'), 1089178: WikiClirQuery('54967313', 'Lilly Wood and The Prick au Trianon', 'is a 2013 french musical movie directed by benjamin lemaire.'), }) self._test_queries('wikiclir/de', count=938217, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 938216: WikiClirQuery('54967235', 'Journal of Risk and Uncertainty', 'the is a bimonthly peer-reviewed academic covering the study of analysis and decision-making under uncertainty.'), }) self._test_queries('wikiclir/it', count=808605, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 808604: WikiClirQuery('54967555', '1999 Merano Open – Doubles', 'lucas arnold ker and jaime oncins win the title by defeating marc-kevin goellner and eric taino 6–4, 7–6 in the final.'), }) self._test_queries('wikiclir/ja', count=426431, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 426430: WikiClirQuery('54966134', "Pu'an Signal Station", '() is a railway on the taiwan railways administration (tra) south-link line located in daren township, taitung county, taiwan.'), }) self._test_queries('wikiclir/ko', count=224855, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in 
film.'), 224854: WikiClirQuery('54965501', 'Lee Yoo-jin (actor)', '(hangul: ; born april 6, 1992) is a south korean actor.'), }) self._test_queries('wikiclir/no', count=299897, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 299896: WikiClirQuery('54967547', 'Wassana Panyapuek', '(born 14 december 1968) is a thai sprinter.'), }) self._test_queries('wikiclir/nn', count=99493, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('339', 'Ayn Rand', "(; born alisa zinov'yevna rosenbaum, ; – march 6, 1982) was a russian-american novelist, philosopher, playwright, and screenwriter."), 99492: WikiClirQuery('54952283', 'Lekamøya', 'is a mountain in the municipality of leka in nord-trøndelag, norway.'), }) self._test_queries('wikiclir/pl', count=693656, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 693655: WikiClirQuery('54966439', 'Top fermentation', 'or high is a brewing method for beer whereby the yeast floats on of the wort.'), }) self._test_queries('wikiclir/pt', count=611732, items={ 0: WikiClirQuery('12', 
'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 611731: WikiClirQuery('54964827', 'Monalysa Alcântara', '(born 26 january 1999) is a brazilian model and beauty pageant titleholder who won miss brasil 2017.'), }) self._test_queries('wikiclir/ro', count=199264, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 199263: WikiClirQuery('54965687', 'Karl Heinrich Gräffe', '(1799-1873) was a german mathematician, who was professor at university of zurich.'), }) self._test_queries('wikiclir/ru', count=664924, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 664923: WikiClirQuery('54966570', 'Nadezhda Babkina', 'georgieva (; born 19 march, 1950, chyorny yar, astrakhan oblast, soviet union) is а soviet and russian folk and pop singer.'), }) self._test_queries('wikiclir/en-simple', count=114572, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: 
WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 114571: WikiClirQuery('54964009', 'Pyotr Deynekin', 'stepanovich (14 december 1937 – 19 august 2017) was a russian military general.'), }) self._test_queries('wikiclir/es', count=781642, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 781641: WikiClirQuery('54966770', 'Selene Johnson', '(february 20, 1876-december 11, 1960) was an american stage and silent film actress born in philadelphia, pennsylvania (usa) as knapp johnson.'), }) self._test_queries('wikiclir/sw', count=22860, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('580', 'Astronomer', 'an is a scientist in the field of astronomy who concentrates their studies on a specific question or field outside the scope of earth.'), 22859: WikiClirQuery('54716724', 'Tirax language', 'is an oceanic spoken in north east malakula, vanuatu.'), }) self._test_queries('wikiclir/sv', count=639073, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('316', 'Academy Award for Best Production Design', 'the recognizes achievement for art direction in film.'), 
639072: WikiClirQuery('54963595', 'Sirkka Selja', '(sirkka-liisa tulonen; 20 march 1920 – 17 august 2017) was a finnish poet and writer.'), }) self._test_queries('wikiclir/tl', count=48930, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('358', 'Algeria', "( '; , '; ), officially the people's democratic republic of algeria, is a sovereign state in north africa on the mediterranean coast."), 48929: WikiClirQuery('54959191', 'Miho Yoshioka (tarento)', 'she was born from higashiōsaka, osaka prefecture.'), }) self._test_queries('wikiclir/tr', count=185388, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 185387: WikiClirQuery('54965031', 'Himmet Karadağ', '(born 1974, denizli, turkey) is a turkish bureaucrat and chairman of borsa istanbul the sole exchange entity of turkey.'), }) self._test_queries('wikiclir/uk', count=348222, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 348221: WikiClirQuery('54966570', 'Nadezhda Babkina', 'georgieva (; born 19 march, 1950, chyorny yar, astrakhan oblast, soviet union) is а 
soviet and russian folk and pop singer.'), }) self._test_queries('wikiclir/vi', count=354312, items={ 0: WikiClirQuery('12', 'Anarchism', 'is a political philosophy that advocates self-governed societies based on voluntary institutions.'), 9: WikiClirQuery('324', 'Academy Awards', 'the , now known officially as the oscars, is a set of twenty-four for artistic and technical merit in the american film industry, given annually by the of motion picture arts and sciences (ampas), to recognize excellence in cinematic achievements as assessed by the voting membership.'), 354311: WikiClirQuery('54960571', 'Dictyocaryum lamarckianum', 'is a species of flowering plant in the arecaceae family.'), }) def test_qrels(self): self._test_qrels('wikiclir/ar', count=519269, items={ 0: TrecQrel('12', '23571', 2, 'Q0'), 9: TrecQrel('12', '804785', 1, 'Q0'), 519268: TrecQrel('54964051', '3769457', 2, 'Q0'), }) self._test_qrels('wikiclir/ca', count=965233, items={ 0: TrecQrel('12', '15902', 2, 'Q0'), 9: TrecQrel('12', '1010721', 1, 'Q0'), 965232: TrecQrel('54965687', '1423451', 2, 'Q0'), }) self._test_qrels('wikiclir/zh', count=926130, items={ 0: TrecQrel('12', '87200', 2, 'Q0'), 9: TrecQrel('12', '16304', 1, 'Q0'), 926129: TrecQrel('54967133', '5795012', 2, 'Q0'), }) self._test_qrels('wikiclir/cs', count=954370, items={ 0: TrecQrel('12', '12682', 2, 'Q0'), 9: TrecQrel('12', '430366', 1, 'Q0'), 954369: TrecQrel('54961893', '2646', 1, 'Q0'), }) self._test_qrels('wikiclir/nl', count=2334644, items={ 0: TrecQrel('12', '11036', 2, 'Q0'), 9: TrecQrel('12', '134021', 1, 'Q0'), 2334643: TrecQrel('54967572', '2716534', 2, 'Q0'), }) self._test_qrels('wikiclir/fi', count=939613, items={ 0: TrecQrel('12', '7556', 2, 'Q0'), 9: TrecQrel('12', '1101970', 1, 'Q0'), 939612: TrecQrel('54966570', '529972', 2, 'Q0'), }) self._test_qrels('wikiclir/fr', count=5137366, items={ 0: TrecQrel('12', '178', 2, 'Q0'), 9: TrecQrel('12', '1312543', 1, 'Q0'), 5137365: TrecQrel('54967313', '6378662', 1, 'Q0'), }) 
        # NOTE(review): test_qrels spot-checks each subset at indices 0, 9, and count-1;
        # the third TrecQrel field holds the judged values seen here (1 or 2), the fourth
        # is the literal 'Q0' — presumably the TREC iteration column (confirm in formats.trec).
self._test_qrels('wikiclir/de', count=5550454, items={ 0: TrecQrel('12', '24409', 2, 'Q0'), 9: TrecQrel('12', '3103271', 1, 'Q0'), 5550453: TrecQrel('54967235', '7427899', 1, 'Q0'), }) self._test_qrels('wikiclir/it', count=3443633, items={ 0: TrecQrel('12', '22305', 2, 'Q0'), 9: TrecQrel('12', '14627', 1, 'Q0'), 3443632: TrecQrel('54967555', '3455512', 2, 'Q0'), }) self._test_qrels('wikiclir/ja', count=3338667, items={ 0: TrecQrel('12', '1430709', 2, 'Q0'), 9: TrecQrel('12', '2963727', 1, 'Q0'), 3338666: TrecQrel('54966134', '1664146', 1, 'Q0'), }) self._test_qrels('wikiclir/ko', count=568205, items={ 0: TrecQrel('12', '10071', 2, 'Q0'), 9: TrecQrel('12', '86969', 1, 'Q0'), 568204: TrecQrel('54965501', '1824430', 1, 'Q0'), }) self._test_qrels('wikiclir/no', count=963514, items={ 0: TrecQrel('12', '31', 2, 'Q0'), 9: TrecQrel('12', '285079', 1, 'Q0'), 963513: TrecQrel('54967547', '1387292', 2, 'Q0'), }) self._test_qrels('wikiclir/nn', count=250141, items={ 0: TrecQrel('12', '10770', 2, 'Q0'), 9: TrecQrel('12', '130318', 1, 'Q0'), 250140: TrecQrel('54952283', '2757', 1, 'Q0'), }) self._test_qrels('wikiclir/pl', count=2471360, items={ 0: TrecQrel('12', '25', 2, 'Q0'), 9: TrecQrel('12', '14226', 1, 'Q0'), 2471359: TrecQrel('54966439', '1937710', 1, 'Q0'), }) self._test_qrels('wikiclir/pt', count=1741889, items={ 0: TrecQrel('12', '230', 2, 'Q0'), 9: TrecQrel('12', '2121768', 1, 'Q0'), 1741888: TrecQrel('54964827', '1311522', 1, 'Q0'), }) self._test_qrels('wikiclir/ro', count=451180, items={ 0: TrecQrel('12', '23210', 2, 'Q0'), 9: TrecQrel('12', '226810', 1, 'Q0'), 451179: TrecQrel('54965687', '1736377', 2, 'Q0'), }) self._test_qrels('wikiclir/ru', count=2321384, items={ 0: TrecQrel('12', '3021', 2, 'Q0'), 9: TrecQrel('12', '2051069', 1, 'Q0'), 2321383: TrecQrel('54966570', '3117631', 1, 'Q0'), }) self._test_qrels('wikiclir/en-simple', count=250380, items={ 0: TrecQrel('12', '4807', 2, 'Q0'), 9: TrecQrel('25', '46790', 1, 'Q0'), 250379: TrecQrel('54964009', '594669', 2, 
'Q0'), }) self._test_qrels('wikiclir/es', count=2894807, items={ 0: TrecQrel('12', '2190809', 2, 'Q0'), 9: TrecQrel('12', '221716', 1, 'Q0'), 2894806: TrecQrel('54966770', '8045048', 2, 'Q0'), }) self._test_qrels('wikiclir/sw', count=57924, items={ 0: TrecQrel('12', '16420', 2, 'Q0'), 9: TrecQrel('303', '6834', 1, 'Q0'), 57923: TrecQrel('54716724', '74685', 2, 'Q0'), }) self._test_qrels('wikiclir/sv', count=2069453, items={ 0: TrecQrel('12', '149', 2, 'Q0'), 9: TrecQrel('12', '79772', 1, 'Q0'), 2069452: TrecQrel('54963595', '263597', 1, 'Q0'), }) self._test_qrels('wikiclir/tl', count=72359, items={ 0: TrecQrel('12', '87382', 2, 'Q0'), 9: TrecQrel('305', '202908', 1, 'Q0'), 72358: TrecQrel('54959191', '155814', 2, 'Q0'), }) self._test_qrels('wikiclir/tr', count=380651, items={ 0: TrecQrel('12', '21889', 2, 'Q0'), 9: TrecQrel('12', '54359', 1, 'Q0'), 380650: TrecQrel('54965031', '2098262', 2, 'Q0'), }) self._test_qrels('wikiclir/uk', count=913358, items={ 0: TrecQrel('12', '12101', 2, 'Q0'), 9: TrecQrel('12', '1370301', 1, 'Q0'), 913357: TrecQrel('54966570', '2004654', 1, 'Q0'), }) self._test_qrels('wikiclir/vi', count=611355, items={ 0: TrecQrel('12', '307178', 2, 'Q0'), 9: TrecQrel('303', '33804', 2, 'Q0'), 611354: TrecQrel('54960571', '2174311', 2, 'Q0'), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/integration/wikir.py ================================================ import re import unittest from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel, GenericScoredDoc from .base import DatasetIntegrationTest class TestWikir(DatasetIntegrationTest): def test_docs(self): self._test_docs('wikir/en1k', count=369721, items={ 0: GenericDoc('1781133', re.compile('^it was used in landing craft during world war ii and is used today in private boats and training fac.{814}gine cooling is via liquid in a water jacket in a boat cool external water is pumped into the engine$', flags=48)), 9: 
        # NOTE(review): wikir docs are GenericDoc(doc_id, text); long fixture texts are
        # matched with re.compile patterns using flags=48 (re.DOTALL | re.UNICODE), and each
        # subset is spot-checked at indices 0, 9, and count-1.
GenericDoc('931408', re.compile('^they moved south to england to cannock staffordshire where they formed balaam and the angel initiall.{894}es trains and automobiles their presence was largely ignored by the time that their 1993 album prime$', flags=48)), 369720: GenericDoc('1230943', re.compile('^geetha jeevan born 6 may 1970 is the current mla of thoothukudi constituency in the 15th tamil nadu .{995}etha jeevan was defeated by s t chellapandian of the aiadmk party with about 27000 votes in contrast$', flags=48)), }) self._test_docs('wikir/en59k', count=2454785, items={ 0: GenericDoc('0', re.compile('^these institutions are often described as stateless societies although several authors have defined .{1140}the word anarchism appears in english from 1642 as anarchisme and the word anarchy from 1539 various$', flags=48)), 9: GenericDoc('9', re.compile('^given annually by the academy of motion picture arts and sciences ampas the awards are an internatio.{1001}demy awards the 91st academy awards ceremony honoring the best films of 2018 was held on february 24$', flags=48)), 2454784: GenericDoc('2456663', re.compile('^he began his career in gay erotic art in 1978 as illustrator and cover artist for barazoku the first.{1018}o appear disconnected from reality on february 18 2003 kimura died at the age of 56 from a pulmonary$', flags=48)), }) self._test_docs('wikir/en78k', count=2456637, items={ 0: GenericDoc('0', re.compile('^These institutions are often described as stateless societies although several authors have defined .{41155}nthesis anarchists and others of preserving tacitly statist authoritarian or bureaucratic tendencies$', flags=48)), 9: GenericDoc('9', re.compile('^Given annually by the Academy of Motion Picture Arts and Sciences AMPAS the awards are an internatio.{35352}an language it is used generically to refer to any award or award ceremony regardless of which field$', flags=48)), 2456636: GenericDoc('2456663', re.compile('^He began his career in gay 
erotic art in 1978 as illustrator and cover artist for Barazoku the first.{1192}\-published in 1997 was published shortly after his death His collected works are held by Studio Kaiz$', flags=48)), }) self._test_docs('wikir/ens78k', count=2456637, items={ 0: GenericDoc('0', re.compile('^These institutions are often described as stateless societies although several authors have defined .{41155}nthesis anarchists and others of preserving tacitly statist authoritarian or bureaucratic tendencies$', flags=48)), 9: GenericDoc('9', re.compile('^Given annually by the Academy of Motion Picture Arts and Sciences AMPAS the awards are an internatio.{35352}an language it is used generically to refer to any award or award ceremony regardless of which field$', flags=48)), 2456636: GenericDoc('2456663', re.compile('^He began his career in gay erotic art in 1978 as illustrator and cover artist for Barazoku the first.{1192}\-published in 1997 was published shortly after his death His collected works are held by Studio Kaiz$', flags=48)), }) self._test_docs('wikir/fr14k', count=736616, items={ 0: GenericDoc('0', re.compile('^il est aussi philologue d origine bourbonnaise fils d un notaire de châteaumeillant cher il fait ses.{910}rmé toute une génération de linguistes français parmi lesquels émile benveniste marcel cohen georges$', flags=48)), 9: GenericDoc('9', re.compile('^ce dernier constituait à l époque de l antiquité un point de passage important sur la route de la so.{949}ndi lieu de la défaite des armées britanniques et rejoint en 1921 la société des nations en 1979 les$', flags=48)), 736615: GenericDoc('739227', re.compile('^elle co anime le podcast red scare en français peur rouge fille d acrobates nekrasova émigre avec se.{901} émission reçoit des personnalités du milieu de la culture new yorkais nekrasova se dit partisane du$', flags=48)), }) self._test_docs('wikir/es13k', count=645901, items={ 0: GenericDoc('0', re.compile('^su territorio está organizado en siete 
parroquias con una población total de 78 282 habitantes su ca.{976}e dictaba que se convocaría al somatén formado por los cabezas de familia con nacionalidad andorrana$', flags=48)), 9: GenericDoc('9', re.compile('^matute fue una de las voces más personales de la literatura española del siglo y es considerada por .{942}en la que presenta influencias de heidi 1880 como el amor por la naturaleza y la relación de la niña$', flags=48)), 645900: GenericDoc('648294', re.compile('^desde joven se trasladó a panamá con el fin de ejercer el comercio y también la abogacía en ambas ac.{977}entarlas nuevamente a la asamblea lo acogen igual que la bandera mediante la ley 64 de junio de 1904$', flags=48)), }) self._test_docs('wikir/it16k', count=503012, items={ 0: GenericDoc('0', re.compile('^nel primo caso i mantici pompano l aria attraverso una camera del vento o serbatoio réservoir in fra.{1034}la bisogna attendere la metà del secolo con l affermarsi delle prime compagnie e dei primi marchi di$', flags=48)), 9: GenericDoc('9', re.compile('^le sue esperienze nei primi anni cinquanta come studente della brandeis university nel massachusetts.{1140}bisti che cominciarono a raccoglierli freneticamente ovviamente hoffman puntava a mettere in risalto$', flags=48)), 503011: GenericDoc('509379', re.compile('^si sposò con giulia de vito da cui ebbe cinque figli luca lorenzo giuseppe giustina angela e frances.{1056}nello stato nel pieno rispetto delle leggi e della giustizia giovanni antonio summonte sosteneva che$', flags=48)), }) def test_queries(self): self._test_queries('wikir/en1k/training', count=1444, items={ 0: GenericQuery('123839', 'yanni'), 9: GenericQuery('563603', 'trinidad and tobago'), 1443: GenericQuery('341793', 'gloucestershire county cricket club'), }) self._test_queries('wikir/en1k/validation', count=100, items={ 0: GenericQuery('1402535', 'irish sea'), 9: GenericQuery('8858', 'north america'), 99: GenericQuery('30711', '1992 summer olympics'), }) 
self._test_queries('wikir/en1k/test', count=100, items={ 0: GenericQuery('158491', 'southern methodist university'), 9: GenericQuery('104086', 'bulacan'), 99: GenericQuery('712704', 'west indies'), }) self._test_queries('wikir/en59k/training', count=57251, items={ 0: GenericQuery('453502', 'ruggero deodato'), 9: GenericQuery('36255', 'karakum desert'), 57250: GenericQuery('182474', 'university system of georgia'), }) self._test_queries('wikir/en59k/validation', count=1000, items={ 0: GenericQuery('2230207', '2017 welsh local elections'), 9: GenericQuery('1325', 'byzantium'), 999: GenericQuery('1314794', 'cestoda'), }) self._test_queries('wikir/en59k/test', count=1000, items={ 0: GenericQuery('1981012', 'canadian folklore'), 9: GenericQuery('271014', 'sentinel range'), 999: GenericQuery('73548', 'lanai'), }) self._test_queries('wikir/en78k/training', count=62904, items={ 0: GenericQuery('368996', 'Germersheim'), 9: GenericQuery('31044', 'Camp style'), 62903: GenericQuery('426311', 'Rajshahi District'), }) self._test_queries('wikir/en78k/validation', count=7862, items={ 0: GenericQuery('1411873', 'Auraiya district'), 9: GenericQuery('1459284', '2010 AFL season'), 7861: GenericQuery('532819', 'San Carlos canton'), }) self._test_queries('wikir/en78k/test', count=7862, items={ 0: GenericQuery('25182', 'Maria Callas'), 9: GenericQuery('124328', "1991 FIFA Women's World Cup"), 7861: GenericQuery('382632', 'Davis Mountains'), }) self._test_queries('wikir/ens78k/training', count=62904, items={ 0: GenericQuery('368996', 'Germersheim is a town in the German state of Rhineland-Palatinate'), 9: GenericQuery('31044', 'Camp is an aesthetic style and sensibility that regards something'), 62903: GenericQuery('426311', 'Rajshahi District is a district in mid-western Bangladesh'), }) self._test_queries('wikir/ens78k/validation', count=7862, items={ 0: GenericQuery('1411873', 'Auraiya district is one of the districts of Uttar Pradesh'), 9: GenericQuery('1459284', 'The 2010 Australian 
Football League season commenced on 25 March'), 7861: GenericQuery('532819', 'San Carlos is the 10th canton in the province of'), }) self._test_queries('wikir/ens78k/test', count=7862, items={ 0: GenericQuery('25182', 'Maria Callas Commendatore OMRI December 2 1923 – September 16'), 9: GenericQuery('124328', "The 1991 FIFA Women's World Cup was the inaugural FIFA"), 7861: GenericQuery('382632', 'The Davis Mountains originally known as Limpia Mountains are a'), }) self._test_queries('wikir/fr14k/training', count=11341, items={ 0: GenericQuery('390701', 'trait biologique'), 9: GenericQuery('200590', 'penza'), 11340: GenericQuery('57294', 'rio uruguay'), }) self._test_queries('wikir/fr14k/validation', count=1400, items={ 0: GenericQuery('82385', 'sagonne'), 9: GenericQuery('7235', 'vecteur'), 1399: GenericQuery('57832', 'la panne'), }) self._test_queries('wikir/fr14k/test', count=1400, items={ 0: GenericQuery('13067', 'achille'), 9: GenericQuery('30895', 'mexico tenochtitlan'), 1399: GenericQuery('367952', 'carpentras'), }) self._test_queries('wikir/es13k/training', count=11202, items={ 0: GenericQuery('207679', 'penguin random house grupo editorial'), 9: GenericQuery('9645', 'nave espacial'), 11201: GenericQuery('50392', 'la massana'), }) self._test_queries('wikir/es13k/validation', count=1300, items={ 0: GenericQuery('191608', 'general hospital'), 9: GenericQuery('22785', 'mannheim'), 1299: GenericQuery('231526', 'hindustan aeronautics limited'), }) self._test_queries('wikir/es13k/test', count=1300, items={ 0: GenericQuery('3088', 'isla grande de tierra del fuego'), 9: GenericQuery('86459', 'k pop'), 1299: GenericQuery('123345', 'la puebla de almoradiel'), }) self._test_queries('wikir/it16k/training', count=13418, items={ 0: GenericQuery('16956', 'corte penale internazionale'), 9: GenericQuery('27135', 'antibes'), 13417: GenericQuery('196572', 'manuele ii paleologo'), }) self._test_queries('wikir/it16k/validation', count=1600, items={ 0: GenericQuery('7528', 
'bortigali'), 9: GenericQuery('70550', 'buda'), 1599: GenericQuery('27094', 'integrated development environment'), }) self._test_queries('wikir/it16k/test', count=1600, items={ 0: GenericQuery('492243', 'sistema di lancio riutilizzabile'), 9: GenericQuery('57632', 'devon'), 1599: GenericQuery('5285', 'arenaria'), }) def test_qrels(self): self._test_qrels('wikir/en1k/training', count=47699, items={ 0: TrecQrel('123839', '123839', 2, '0'), 9: TrecQrel('188629', '1440095', 1, '0'), 47698: TrecQrel('341793', '1672870', 1, '0'), }) self._test_qrels('wikir/en1k/validation', count=4979, items={ 0: TrecQrel('1402535', '1402535', 2, '0'), 9: TrecQrel('1402535', '6123', 1, '0'), 4978: TrecQrel('30711', '1719628', 1, '0'), }) self._test_qrels('wikir/en1k/test', count=4435, items={ 0: TrecQrel('158491', '158491', 2, '0'), 9: TrecQrel('5728', '5728', 2, '0'), 4434: TrecQrel('712704', '1577576', 1, '0'), }) self._test_qrels('wikir/en59k/training', count=2443383, items={ 0: TrecQrel('453502', '453502', 2, '0'), 9: TrecQrel('453502', '2228391', 1, '0'), 2443382: TrecQrel('182474', '1971100', 1, '0'), }) self._test_qrels('wikir/en59k/validation', count=68905, items={ 0: TrecQrel('2230207', '2230207', 2, '0'), 9: TrecQrel('105188', '174985', 1, '0'), 68904: TrecQrel('1314794', '2308683', 1, '0'), }) self._test_qrels('wikir/en59k/test', count=104715, items={ 0: TrecQrel('1981012', '1981012', 2, '0'), 9: TrecQrel('1164242', '788', 1, '0'), 104714: TrecQrel('73548', '2377038', 1, '0'), }) self._test_qrels('wikir/en78k/training', count=2435257, items={ 0: TrecQrel('368996', '368996', 2, '0'), 9: TrecQrel('5737', '11828', 1, '0'), 2435256: TrecQrel('426311', '2315108', 1, '0'), }) self._test_qrels('wikir/en78k/validation', count=271874, items={ 0: TrecQrel('1411873', '1411873', 2, '0'), 9: TrecQrel('1944076', '1944076', 2, '0'), 271873: TrecQrel('532819', '2440624', 1, '0'), }) self._test_qrels('wikir/en78k/test', count=353060, items={ 0: TrecQrel('25182', '25182', 2, '0'), 9: 
TrecQrel('174105', '918850', 1, '0'), 353059: TrecQrel('382632', '1309518', 1, '0'), }) self._test_qrels('wikir/ens78k/training', count=2435257, items={ 0: TrecQrel('368996', '368996', 2, '0'), 9: TrecQrel('5737', '11828', 1, '0'), 2435256: TrecQrel('426311', '2315108', 1, '0'), }) self._test_qrels('wikir/ens78k/validation', count=271874, items={ 0: TrecQrel('1411873', '1411873', 2, '0'), 9: TrecQrel('1944076', '1944076', 2, '0'), 271873: TrecQrel('532819', '2440624', 1, '0'), }) self._test_qrels('wikir/ens78k/test', count=353060, items={ 0: TrecQrel('25182', '25182', 2, '0'), 9: TrecQrel('174105', '918850', 1, '0'), 353059: TrecQrel('382632', '1309518', 1, '0'), }) self._test_qrels('wikir/fr14k/training', count=609240, items={ 0: TrecQrel('390701', '390701', 2, '0'), 9: TrecQrel('289251', '173579', 1, '0'), 609239: TrecQrel('57294', '282203', 1, '0'), }) self._test_qrels('wikir/fr14k/validation', count=81255, items={ 0: TrecQrel('82385', '82385', 2, '0'), 9: TrecQrel('80623', '591242', 1, '0'), 81254: TrecQrel('57832', '659742', 1, '0'), }) self._test_qrels('wikir/fr14k/test', count=55647, items={ 0: TrecQrel('13067', '13067', 2, '0'), 9: TrecQrel('1839', '210397', 1, '0'), 55646: TrecQrel('367952', '711334', 1, '0'), }) self._test_qrels('wikir/es13k/training', count=477212, items={ 0: TrecQrel('207679', '207679', 2, '0'), 9: TrecQrel('207679', '598495', 1, '0'), 477211: TrecQrel('50392', '615133', 1, '0'), }) self._test_qrels('wikir/es13k/validation', count=58757, items={ 0: TrecQrel('191608', '191608', 2, '0'), 9: TrecQrel('191608', '554255', 1, '0'), 58756: TrecQrel('231526', '276804', 1, '0'), }) self._test_qrels('wikir/es13k/test', count=71339, items={ 0: TrecQrel('3088', '3088', 2, '0'), 9: TrecQrel('3088', '253926', 1, '0'), 71338: TrecQrel('123345', '559669', 1, '0'), }) self._test_qrels('wikir/it16k/training', count=381920, items={ 0: TrecQrel('16956', '16956', 2, '0'), 9: TrecQrel('8993', '8993', 2, '0'), 381919: TrecQrel('196572', '136444', 1, '0'), }) 
self._test_qrels('wikir/it16k/validation', count=45003, items={ 0: TrecQrel('7528', '7528', 2, '0'), 9: TrecQrel('7528', '420171', 1, '0'), 45002: TrecQrel('27094', '493604', 1, '0'), }) self._test_qrels('wikir/it16k/test', count=49338, items={ 0: TrecQrel('492243', '492243', 2, '0'), 9: TrecQrel('492243', '493306', 1, '0'), 49337: TrecQrel('5285', '408735', 1, '0'), }) def test_scoreddocs(self): self._test_scoreddocs('wikir/en1k/training', count=144400, items={ 0: GenericScoredDoc('123839', '806300', 20.720094194011075), 9: GenericScoredDoc('123839', '1901730', 10.324072628860163), 144399: GenericScoredDoc('341793', '441259', 14.099367141266189), }) self._test_scoreddocs('wikir/en1k/validation', count=10000, items={ 0: GenericScoredDoc('1402535', '681497', 14.974678196110478), 9: GenericScoredDoc('1402535', '245557', 12.087820628131816), 9999: GenericScoredDoc('30711', '1705862', 8.521229143068965), }) self._test_scoreddocs('wikir/en1k/test', count=10000, items={ 0: GenericScoredDoc('158491', '625257', 15.660703104969318), 9: GenericScoredDoc('158491', '13801', 14.515017321771746), 9999: GenericScoredDoc('712704', '140985', 4.887942090200023), }) self._test_scoreddocs('wikir/en59k/training', count=5725100, items={ 0: GenericScoredDoc('453502', '2228391', 29.22122689830602), 9: GenericScoredDoc('453502', '1594621', 19.78462456588529), 5725099: GenericScoredDoc('182474', '669638', 12.514610996375488), }) self._test_scoreddocs('wikir/en59k/validation', count=100000, items={ 0: GenericScoredDoc('2230207', '961529', 18.460208054253798), 9: GenericScoredDoc('2230207', '2284579', 16.93401066213046), 99999: GenericScoredDoc('1314794', '818625', 0.0), }) self._test_scoreddocs('wikir/en59k/test', count=100000, items={ 0: GenericScoredDoc('1981012', '1968399', 13.390851551324499), 9: GenericScoredDoc('1981012', '821056', 8.81720912528983), 99999: GenericScoredDoc('73548', '818549', 0.0), }) self._test_scoreddocs('wikir/en78k/training', count=6284800, items={ 0: 
GenericScoredDoc('368996', '1651819', 23.14194046616975), 9: GenericScoredDoc('368996', '618593', 17.432096830331467), 6284799: GenericScoredDoc('426311', '97253', 15.529782072936454), }) self._test_scoreddocs('wikir/en78k/validation', count=785700, items={ 0: GenericScoredDoc('1411873', '1579044', 29.78805667499762), 9: GenericScoredDoc('1411873', '1411879', 19.593174845080704), 785699: GenericScoredDoc('532819', '456647', 14.645128248017446), }) self._test_scoreddocs('wikir/en78k/test', count=785600, items={ 0: GenericScoredDoc('25182', '1413822', 26.770190544879753), 9: GenericScoredDoc('25182', '567382', 21.329586618874135), 785599: GenericScoredDoc('382632', '933740', 11.712924197712407), }) self._test_scoreddocs('wikir/ens78k/training', count=6289800, items={ 0: GenericScoredDoc('368996', '368996', 42.5142628002974), 9: GenericScoredDoc('368996', '1628082', 33.412777578029235), 6289799: GenericScoredDoc('426311', '852080', 26.43338402309732), }) self._test_scoreddocs('wikir/ens78k/validation', count=786100, items={ 0: GenericScoredDoc('1411873', '1579044', 48.87893849879893), 9: GenericScoredDoc('1411873', '2301035', 34.36216854665406), 786099: GenericScoredDoc('532819', '678583', 17.237808584137724), }) self._test_scoreddocs('wikir/ens78k/test', count=786100, items={ 0: GenericScoredDoc('25182', '25182', 29.514812932099993), 9: GenericScoredDoc('25182', '887295', 23.03166853881259), 786099: GenericScoredDoc('382632', '1788341', 17.95820049117319), }) self._test_scoreddocs('wikir/fr14k/training', count=1134100, items={ 0: GenericScoredDoc('390701', '357730', 12.84854585287806), 9: GenericScoredDoc('390701', '358783', 11.805423649565245), 1134099: GenericScoredDoc('57294', '431312', 11.597854170542954), }) self._test_scoreddocs('wikir/fr14k/validation', count=140000, items={ 0: GenericScoredDoc('82385', '208929', 19.553876476904705), 9: GenericScoredDoc('82385', '246297', 0.0), 139999: GenericScoredDoc('57832', '246388', 0.0), }) 
self._test_scoreddocs('wikir/fr14k/test', count=140000, items={ 0: GenericScoredDoc('13067', '71891', 24.77999788377413), 9: GenericScoredDoc('13067', '246295', 0.0), 139999: GenericScoredDoc('367952', '246382', 0.0), }) self._test_scoreddocs('wikir/es13k/training', count=1120200, items={ 0: GenericScoredDoc('207679', '304542', 37.93233712840836), 9: GenericScoredDoc('207679', '30053', 24.34821303424021), 1120199: GenericScoredDoc('50392', '215709', 0.0), }) self._test_scoreddocs('wikir/es13k/validation', count=130000, items={ 0: GenericScoredDoc('191608', '442864', 13.718837032435605), 9: GenericScoredDoc('191608', '554969', 12.100914887895527), 129999: GenericScoredDoc('231526', '582771', 9.14032945504454), }) self._test_scoreddocs('wikir/es13k/test', count=130000, items={ 0: GenericScoredDoc('3088', '568822', 16.179207111846104), 9: GenericScoredDoc('3088', '58331', 12.618831564218615), 129999: GenericScoredDoc('123345', '215761', 0.0), }) self._test_scoreddocs('wikir/it16k/training', count=1341800, items={ 0: GenericScoredDoc('16956', '108934', 19.476623542001622), 9: GenericScoredDoc('16956', '470735', 11.086113055208344), 1341799: GenericScoredDoc('196572', '312933', 8.677228312993646), }) self._test_scoreddocs('wikir/it16k/validation', count=160000, items={ 0: GenericScoredDoc('7528', '509379', 0.0), 9: GenericScoredDoc('7528', '169121', 0.0), 159999: GenericScoredDoc('27094', '186973', 9.223293949280931), }) self._test_scoreddocs('wikir/it16k/test', count=160000, items={ 0: GenericScoredDoc('492243', '13937', 22.972013737556047), 9: GenericScoredDoc('492243', '380824', 10.295899790233975), 159999: GenericScoredDoc('5285', '169149', 0.0), }) if __name__ == '__main__': unittest.main() ================================================ FILE: test/metadata.py ================================================ import unittest import ir_datasets class TestMetadata(unittest.TestCase): def test_all_metadata_available(self): for dsid in ir_datasets.registry._registered: 
# test/metadata.py (continued): body of TestMetadata.test_all_metadata_available — runs
# _test_ds for every registered dataset id. The commented-out test_clirmatrix_metadata_available
# below enumerates every clirmatrix language pairing; it is deliberately disabled (kept for
# reference). _test_ds asserts, under subTest(dsid), that dataset.metadata() contains an entry
# with a 'count' for every EntityType the dataset reports via has().
self._test_ds(dsid) # def test_clirmatrix_metadata_available(self): # LANGS = ('af', 'als', 'am', 'an', 'ar', 'arz', 'ast', 'az', 'azb', 'ba', 'bar', 'be', 'bg', 'bn', 'bpy', 'br', 'bs', 'bug', 'ca', 'cdo', 'ce', 'ceb', 'ckb', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'he', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ilo', 'io', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'li', 'lmo', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'my', 'mzn', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc', 'or', 'os', 'pa', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'ro', 'ru', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', 'tg', 'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vec', 'vi', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zh') # MULTI8_LANGS = {'ar', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'zh'} # for doc_lang in LANGS: # self._test_ds(f'clirmatrix/{doc_lang}') # for query_lang in LANGS: # if query_lang == doc_lang: # continue # for split in ['train', 'dev', 'test1', 'test2']: # self._test_ds(f'clirmatrix/{doc_lang}/bi139-base/{query_lang}/{split}') # self._test_ds(f'clirmatrix/{doc_lang}/bi139-full/{query_lang}/{split}') # if query_lang in MULTI8_LANGS and doc_lang in MULTI8_LANGS: # self._test_ds(f'clirmatrix/{doc_lang}/multi8/{query_lang}/{split}') def _test_ds(self, dsid): with self.subTest(dsid): dataset = ir_datasets.load(dsid) metadata = dataset.metadata() for etype in ir_datasets.EntityType: if dataset.has(etype): self.assertTrue(etype.value in metadata, f"{dsid} missing {etype.value} metadata") self.assertTrue('count' in metadata[etype.value], f"{dsid} missing {etype.value} metadata") if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_defaulttext.py 
# test/test_defaulttext.py: template_instance recursively builds a dummy value for a
# NamedTuple-like class (via _fields/__annotations__), List/Tuple typing aliases, Enum
# members, and str, so that default_text() can be smoke-tested without real data.
# NOTE(review): the bare 'except:' fallback in template_instance also swallows
# SystemExit/KeyboardInterrupt — prefer 'except Exception:'.
# NOTE(review): 'test_all_defualttext' and '_test_defaulttet' are typos for
# 'defaulttext'; unittest discovery only keys on the 'test_' prefix, so renaming them
# would be a safe cleanup.
# After this file, test/util/docs/__init__.py (empty) and test/util/docs/data.py begin;
# FakeDocs.__init__ continues on the next chunk line.
================================================ import unittest import ir_datasets def template_instance(Cls): if hasattr(Cls, '_fields'): return Cls(*[template_instance(v) for k, v in Cls.__annotations__.items()]) elif hasattr(Cls, '_name') and Cls._name == 'List': return [template_instance(Cls.__args__[0]) for _ in range(3)] elif hasattr(Cls, '_name') and Cls._name == 'Tuple': return tuple([template_instance(Cls.__args__[0]) for _ in range(3)]) elif hasattr(Cls, '__members__'): return Cls(next(iter(Cls.__members__.values()))) elif Cls is str: return 'test string' else: try: return Cls() except: return None class TestMetadata(unittest.TestCase): def test_all_defualttext(self): for dsid in ir_datasets.registry._registered: self._test_defaulttet(dsid) def _test_defaulttet(self, dsid): with self.subTest(dsid): dataset = ir_datasets.load(dsid) if dataset.has_docs(): Cls = dataset.docs_cls() instance = template_instance(Cls) if hasattr(instance, 'default_text'): instance.default_text() # test it doesn't raise an error else: print(dsid, 'missing doc default_text') if dataset.has_queries(): Cls = dataset.queries_cls() instance = template_instance(Cls) if hasattr(instance, 'default_text'): instance.default_text() # test it doesn't raise an error else: print(dsid, 'missing query default_text') if __name__ == '__main__': unittest.main() ================================================ FILE: test/util/docs/__init__.py ================================================ ================================================ FILE: test/util/docs/data.py ================================================ import random import string from ir_datasets.formats.base import BaseDocs, GenericDoc from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS, Docstore class OtherDoc: def __init__(self, id: str, text: str): self.id = id self.text = text class FakeDocs(BaseDocs): def __init__(self, n_docs: int, namespace = 'test', lang='en', docs_cls=GenericDoc): self._docs = [ docs_cls( f'{ix:05d}', 
# test/util/docs/data.py (tail): FakeDocs generates n_docs docs with zero-padded ids and
# 16 random lowercase letters as text (random.sample is unseeded, so text differs per run —
# tests here only compare doc ids and round-trip through the store, so this is OK).
# FakeDocstore.get_many filters docs_iter() by id set.
# NOTE(review): 'docs_namepace' is a typo for 'docs_namespace' — as spelled it never
# overrides the BaseDocs accessor, so the constructor's namespace argument is effectively
# dead; TODO confirm against BaseDocs before renaming.
# Afterwards, test/util/docs/test_multiple.py exercises PrefixedDocs/PrefixedDocsSpec
# merging. NOTE(review): in test_multiple_prefixes, 'for spec in spec:' rebinds the name
# of the list being iterated — it works, but the shadowing is easy to misread.
''.join(random.sample(string.ascii_lowercase, 16)) ) for ix in range(n_docs) ] self._docs_cls = docs_cls self._namespace = namespace self._lang = lang def docs_count(self): return len(self._docs) def docs_iter(self): return self._docs def docs_cls(self): return self._docs_cls def docs_lang(self): return self._lang def docs_namepace(self): return self._namespace def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS) -> Docstore: return FakeDocstore(self) class FakeDocstore(Docstore): def __init__(self, docs: FakeDocs): self._docs = docs def get_many(self, doc_ids, field=None): doc_ids = set(doc_ids) return {doc.doc_id: doc for doc in self._docs.docs_iter() if doc.doc_id in doc_ids} ================================================ FILE: test/util/docs/test_multiple.py ================================================ import pytest from ir_datasets.formats.base import GenericDoc from ir_datasets.util.docs.multiple import PrefixedDocs, PrefixedDocsSpec from .data import FakeDocs, OtherDoc def test_multiple_prefixes(): docs_1 = FakeDocs(5) docs_2 = FakeDocs(3) spec = [ PrefixedDocsSpec("D1-", docs_1), PrefixedDocsSpec("D2-", docs_2) ] all_docs = PrefixedDocs( None, *spec ) assert all_docs.docs_cls() == GenericDoc assert all_docs.docs_lang() == 'en' assert all_docs.docs_count() == 8 all_store = all_docs.docs_store() assert set(all_store.get_many(["D1-00001", "D1-00004", "D2-00002"]).values()) == set( GenericDoc(f"D1-{doc.doc_id}", doc.text) for doc in docs_1.docs_store().get_many(["00001", "00004"]).values() ) | set( GenericDoc(f"D2-{doc.doc_id}", doc.text) for doc in docs_2.docs_store().get_many(["00002"]).values() ) assert [doc.doc_id for doc in all_docs.docs_iter()] == [ "D1-00000", "D1-00001", "D1-00002", "D1-00003", "D1-00004", "D2-00000", "D2-00001", "D2-00002" ] # Check that the doc IDs are the same set_1 = set() for spec in spec: set_1.update(f"{spec.prefix}{doc.doc_id}" for doc in spec.docs.docs_iter()) assert set_1 == set(doc.doc_id for doc in 
all_docs.docs_iter()) with pytest.raises(AttributeError): PrefixedDocs( None, PrefixedDocsSpec("D1-", FakeDocs(5)), PrefixedDocsSpec("D2-", FakeDocs(3, docs_cls=OtherDoc)) ).docs_cls() blank = PrefixedDocs( None, PrefixedDocsSpec("D1-", FakeDocs(5)), PrefixedDocsSpec("D2-", FakeDocs(3, lang='fr', namespace='other')) ) assert blank.docs_lang() is None assert blank.docs_namespace() is None def test_multiple_prefixes_inlined(): """Test support for already prefixed collections""" docs_1 = FakeDocs(5) docs_2 = FakeDocs(3) spec = [ PrefixedDocsSpec("D1-", PrefixedDocs(None, PrefixedDocsSpec("D1-", docs_1)), True), PrefixedDocsSpec("D2-", PrefixedDocs(None, PrefixedDocsSpec("D2-", docs_2)), True) ] all_docs = PrefixedDocs( None, *spec ) assert all_docs.docs_cls() == GenericDoc assert all_docs.docs_lang() == 'en' assert all_docs.docs_count() == 8 assert [doc.doc_id for doc in all_docs.docs_iter()] == [ "D1-00000", "D1-00001", "D1-00002", "D1-00003", "D1-00004", "D2-00000", "D2-00001", "D2-00002" ] all_store = all_docs.docs_store() assert set(all_store.get_many(["D1-00001", "D1-00004", "D2-00002"]).values()) == set( GenericDoc(f"D1-{doc.doc_id}", doc.text) for doc in docs_1.docs_store().get_many(["00001", "00004"]).values() ) | set( GenericDoc(f"D2-{doc.doc_id}", doc.text) for doc in docs_2.docs_store().get_many(["00002"]).values() ) ================================================ FILE: test/util/docs/test_subset.py ================================================ from ir_datasets.util.docs.subset import DocsSubset, Dupes from .data import FakeDocs class SimpleDupes(Dupes): def __init__(self, doc_ids): self.doc_ids = doc_ids def test_subset_simple(): docs = FakeDocs(5) dupe_ids = set(doc.doc_id for ix, doc in zip(range(docs.docs_count()), docs.docs_iter()) if ix in [1, 3]) dupes = SimpleDupes(dupe_ids) docs_subset = DocsSubset("__fake_name__", docs, dupes) assert [ doc.doc_id for doc in docs_subset.docs_iter() ] == [ doc.doc_id for doc in docs.docs_iter() if doc.doc_id not 
in dupe_ids ] ================================================ FILE: test/util.py ================================================ import unittest import ir_datasets class TestUtil(unittest.TestCase): def test_apply_sub_slice(self): ass = ir_datasets.util.apply_sub_slice self.assertEqual(ass(slice(0, 100), slice(0, 10)), slice(0, 10)) self.assertEqual(ass(slice(0, 10), slice(0, 100)), slice(0, 10)) self.assertEqual(ass(slice(10, 100), slice(0, 10)), slice(10, 20)) self.assertEqual(ass(slice(0, 10), slice(10, 100)), slice(10, 10)) self.assertEqual(ass(slice(0, 100), slice(0, None)), slice(0, 100)) self.assertEqual(ass(slice(0, 100), slice(0, -1)), slice(0, 99)) self.assertEqual(ass(slice(0, 100), slice(0, -2)), slice(0, 98)) self.assertEqual(ass(slice(0, 100), slice(1, -2)), slice(1, 98)) self.assertEqual(ass(slice(0, 100), slice(-1, None)), slice(99, 100)) self.assertEqual(ass(slice(0, 100), slice(-2, None)), slice(98, 100)) self.assertEqual(ass(slice(0, 100), slice(-2, -1)), slice(98, 99)) self.assertEqual(ass(slice(0, 100), slice(0/3, 1/3)), slice(0, 33)) self.assertEqual(ass(slice(0, 100), slice(1/3, 2/3)), slice(33, 66)) self.assertEqual(ass(slice(0, 100), slice(2/3, 3/3)), slice(66, 100)) def test_corpus_id(self): # typical self.assertEqual(ir_datasets.corpus_id("msmarco-document/trec-dl-2019/judged"), "msmarco-document") # identity self.assertEqual(ir_datasets.corpus_id("msmarco-document"), "msmarco-document") # wikir doesn't support docs self.assertEqual(ir_datasets.corpus_id("wikir/en1k/test"), "wikir/en1k") # clueweb09 supports docs, but clueweb09/catb is a different subset self.assertEqual(ir_datasets.corpus_id("clueweb09/catb/trec-web-2009"), "clueweb09/catb") self.assertEqual(ir_datasets.corpus_id("clueweb09/catb"), "clueweb09/catb") self.assertEqual(ir_datasets.corpus_id("clueweb09"), "clueweb09") # clirmatrix uses matching patterns self.assertEqual(ir_datasets.corpus_id("clirmatrix/en"), "clirmatrix/en") 
# test/util.py (continued): test_corpus_id's final clirmatrix case, then
# test_html_find_charset, which feeds raw HTML bytes to
# ir_datasets.util.html_parsing.find_charset; the test body is truncated at this
# chunk boundary. Note test_apply_sub_slice above also exercises fractional
# (float) sub-slices like slice(1/3, 2/3), which apply_sub_slice maps to
# proportional integer bounds of the outer slice.
self.assertEqual(ir_datasets.corpus_id("clirmatrix/en/bi139-full/de/train"), "clirmatrix/en") def test_html_find_charset(self): self.assertEqual(ir_datasets.util.html_parsing.find_charset(b'\n Resultados Copa Universidad de Chile | FECH\n \n\n