Repository: allenai/ir_datasets
Branch: master
Commit: ae24b5302c56
Files: 269
Total size: 3.1 MB
Directory structure:
gitextract_2j6ggfs5/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── dataset-addition.md
│ │ ├── documentation.md
│ │ └── feature_request.md
│ └── workflows/
│ ├── deploy.yml
│ └── test.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples/
│ ├── adding_datasets.ipynb
│ ├── clirmatrix_example.py
│ ├── ir_datasets.ipynb
│ └── ir_datasets_cli.ipynb
├── ir_datasets/
│ ├── __init__.py
│ ├── __main__.py
│ ├── commands/
│ │ ├── __init__.py
│ │ ├── build_c4_checkpoints.py
│ │ ├── build_clueweb_warc_indexes.py
│ │ ├── build_download_cache.py
│ │ ├── clean.py
│ │ ├── doc_fifos.py
│ │ ├── export.py
│ │ ├── generate_metadata.py
│ │ ├── list.py
│ │ └── lookup.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── antique.py
│ │ ├── aol_ia.py
│ │ ├── aquaint.py
│ │ ├── argsme.py
│ │ ├── base.py
│ │ ├── beir.py
│ │ ├── c4.py
│ │ ├── car.py
│ │ ├── clinicaltrials.py
│ │ ├── clirmatrix.py
│ │ ├── clueweb09.py
│ │ ├── clueweb12.py
│ │ ├── codec.py
│ │ ├── codesearchnet.py
│ │ ├── cord19.py
│ │ ├── cranfield.py
│ │ ├── csl.py
│ │ ├── disks45.py
│ │ ├── dpr_w100.py
│ │ ├── gov.py
│ │ ├── gov2.py
│ │ ├── hc4.py
│ │ ├── highwire.py
│ │ ├── istella22.py
│ │ ├── kilt.py
│ │ ├── lotte.py
│ │ ├── medline.py
│ │ ├── miracl.py
│ │ ├── mmarco.py
│ │ ├── mr_tydi.py
│ │ ├── msmarco_document.py
│ │ ├── msmarco_document_v2.py
│ │ ├── msmarco_passage.py
│ │ ├── msmarco_passage_v2.py
│ │ ├── msmarco_qna.py
│ │ ├── nano_beir.py
│ │ ├── natural_questions.py
│ │ ├── neuclir.py
│ │ ├── neumarco.py
│ │ ├── nfcorpus.py
│ │ ├── nyt.py
│ │ ├── pmc.py
│ │ ├── sara.py
│ │ ├── touche.py
│ │ ├── touche_image.py
│ │ ├── trec_arabic.py
│ │ ├── trec_cast.py
│ │ ├── trec_fair.py
│ │ ├── trec_mandarin.py
│ │ ├── trec_robust04.py
│ │ ├── trec_spanish.py
│ │ ├── trec_tot.py
│ │ ├── trec_tot_2025.py
│ │ ├── tripclick.py
│ │ ├── tweets2013_ia.py
│ │ ├── vaswani.py
│ │ ├── wapo.py
│ │ ├── wikiclir.py
│ │ └── wikir.py
│ ├── docs/
│ │ ├── antique.yaml
│ │ ├── aol-ia.yaml
│ │ ├── aquaint.yaml
│ │ ├── argsme.yaml
│ │ ├── beir.yaml
│ │ ├── bibliography.bib
│ │ ├── c4.yaml
│ │ ├── car.yaml
│ │ ├── clinicaltrials.yaml
│ │ ├── clirmatrix.yaml
│ │ ├── clueweb09.yaml
│ │ ├── clueweb12.yaml
│ │ ├── codec.yaml
│ │ ├── codesearchnet.yaml
│ │ ├── cord19.yaml
│ │ ├── cranfield.yaml
│ │ ├── csl.yaml
│ │ ├── disks45.yaml
│ │ ├── dpr-w100.yaml
│ │ ├── gov.yaml
│ │ ├── gov2.yaml
│ │ ├── hc4.yaml
│ │ ├── highwire.yaml
│ │ ├── istella22.yaml
│ │ ├── kilt.yaml
│ │ ├── lotte.yaml
│ │ ├── medline.yaml
│ │ ├── miracl.yaml
│ │ ├── mmarco.yaml
│ │ ├── mr-tydi.yaml
│ │ ├── msmarco-document-v2.yaml
│ │ ├── msmarco-document.yaml
│ │ ├── msmarco-passage-v2.yaml
│ │ ├── msmarco-passage.yaml
│ │ ├── msmarco-qna.yaml
│ │ ├── nano-beir.yaml
│ │ ├── natural-questions.yaml
│ │ ├── neuclir.yaml
│ │ ├── neumarco.yaml
│ │ ├── nfcorpus.yaml
│ │ ├── nyt.yaml
│ │ ├── pmc.yaml
│ │ ├── sara.yaml
│ │ ├── touche-image.yaml
│ │ ├── touche.yaml
│ │ ├── trec-arabic.yaml
│ │ ├── trec-cast.yaml
│ │ ├── trec-fair.yaml
│ │ ├── trec-mandarin.yaml
│ │ ├── trec-robust04.yaml
│ │ ├── trec-spanish.yaml
│ │ ├── trec-tot-2025.yaml
│ │ ├── trec-tot.yaml
│ │ ├── tripclick.yaml
│ │ ├── tweets2013-ia.yaml
│ │ ├── vaswani.yaml
│ │ ├── wapo.yaml
│ │ ├── wikiclir.yaml
│ │ └── wikir.yaml
│ ├── etc/
│ │ ├── downloads.json
│ │ └── metadata.json
│ ├── formats/
│ │ ├── __init__.py
│ │ ├── argsme.py
│ │ ├── base.py
│ │ ├── clirmatrix.py
│ │ ├── csv_fmt.py
│ │ ├── extracted_cc.py
│ │ ├── jsonl.py
│ │ ├── ntcir.py
│ │ ├── touche.py
│ │ ├── touche_image.py
│ │ ├── trec.py
│ │ ├── tsv.py
│ │ └── webarc.py
│ ├── indices/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── cache_docstore.py
│ │ ├── clueweb_warc.py
│ │ ├── indexed_tsv_docstore.py
│ │ ├── lz4_pickle.py
│ │ ├── numpy_sorted_index.py
│ │ └── zpickle_docstore.py
│ ├── lazy_libs.py
│ ├── log.py
│ ├── util/
│ │ ├── __init__.py
│ │ ├── docs/
│ │ │ ├── __init__.py
│ │ │ ├── lazy.py
│ │ │ ├── multiple.py
│ │ │ └── subset.py
│ │ ├── download.py
│ │ ├── fileio.py
│ │ ├── hash.py
│ │ ├── html_parsing.py
│ │ ├── metadata.py
│ │ └── registry.py
│ └── wrappers/
│ ├── __init__.py
│ └── html_extractor.py
├── pyproject.toml
├── requirements-test.txt
├── requirements.txt
└── test/
├── __init__.py
├── downloads.py
├── dummy/
│ ├── docs.tsv
│ ├── qrels
│ └── queries.tsv
├── formats/
│ ├── __init__.py
│ ├── test_trec.py
│ └── test_tsv.py
├── indices/
│ ├── __init__.py
│ ├── lz4_pickle.py
│ └── numpy_sorted.py
├── integration/
│ ├── __init__.py
│ ├── antique.py
│ ├── aol_ia.py
│ ├── aquaint.py
│ ├── argsme.py
│ ├── base.py
│ ├── beir.py
│ ├── c4.py
│ ├── car.py
│ ├── clinicaltrials.py
│ ├── clirmatrix.py
│ ├── clueweb09.py
│ ├── clueweb12.py
│ ├── codec.py
│ ├── codesearchnet.py
│ ├── cord19.py
│ ├── cranfield.py
│ ├── csl.py
│ ├── disks45.py
│ ├── dpr_w100.py
│ ├── dummy.py
│ ├── gov.py
│ ├── gov2.py
│ ├── hc4.py
│ ├── highwire.py
│ ├── istella22.py
│ ├── kilt.py
│ ├── lotte.py
│ ├── medline.py
│ ├── miracl.py
│ ├── mmarco.py
│ ├── mr_tydi.py
│ ├── msmarco_document.py
│ ├── msmarco_document_v2.py
│ ├── msmarco_passage.py
│ ├── msmarco_passage_v2.py
│ ├── msmarco_qna.py
│ ├── nano_beir.py
│ ├── natural_questions.py
│ ├── neuclir.py
│ ├── neumarco.py
│ ├── nfcorpus.py
│ ├── nyt.py
│ ├── pmc.py
│ ├── sara.py
│ ├── touche.py
│ ├── touche_image.py
│ ├── trec_arabic.py
│ ├── trec_cast.py
│ ├── trec_fair.py
│ ├── trec_mandarin.py
│ ├── trec_robust04.py
│ ├── trec_spanish.py
│ ├── trec_tot.py
│ ├── trec_tot_2024.py
│ ├── trec_tot_2025/
│ │ ├── test_docs_iter.py
│ │ ├── test_docs_store.py
│ │ ├── test_qrel_iter.py
│ │ └── test_queries_iter.py
│ ├── tripclick.py
│ ├── tweets2013_ia.py
│ ├── vaswani.py
│ ├── wapo.py
│ ├── wikiclir.py
│ └── wikir.py
├── metadata.py
├── test_defaulttext.py
├── util/
│ └── docs/
│ ├── __init__.py
│ ├── data.py
│ ├── test_multiple.py
│ └── test_subset.py
└── util.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Errors in behavior or functionality
title: ''
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Affected dataset(s)**
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Additional context**
Add any other context about the problem here.
================================================
FILE: .github/ISSUE_TEMPLATE/dataset-addition.md
================================================
---
name: Dataset Addition
about: Propose adding a new dataset, collection of related datasets, or feature to
existing dataset
title: ''
labels: add-dataset
assignees: ''
---
**Dataset Information:**
**Links to Resources:**
**Dataset ID(s) & supported entities:**
-
**Checklist**
Mark each task once completed. All should be checked prior to merging a new dataset.
- [ ] Dataset definition (in `ir_datasets/datasets/[topid].py`)
- [ ] Tests (in `tests/integration/[topid].py`)
- [ ] Metadata generated (using `ir_datasets generate_metadata` command, should appear in `ir_datasets/etc/metadata.json`)
- [ ] Documentation (in `ir_datasets/etc/[topid].yaml`)
- [ ] Documentation generated in https://github.com/seanmacavaney/ir-datasets.com/
- [ ] Downloadable content (in `ir_datasets/etc/downloads.json`)
- [ ] Download verification action (in `.github/workflows/verify_downloads.yml`). Only one needed per `topid`.
- [ ] Any small public files from NIST (or other potentially troublesome files) mirrored in https://github.com/seanmacavaney/irds-mirror/. Mirrored status properly reflected in `downloads.json`.
**Additional comments/concerns/ideas/etc.**
================================================
FILE: .github/ISSUE_TEMPLATE/documentation.md
================================================
---
name: Documentation
about: Additions to or improvements to the documentation
title: ''
labels: documentation
assignees: ''
---
**Dataset(s)**
**Describe the proposed change**
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
================================================
FILE: .github/workflows/deploy.yml
================================================
name: deploy
on:
release:
types: [created]
jobs:
pypi:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: install-deps
run: |
python -m pip install --upgrade pip
pip install build setuptools wheel twine
- name: build
run: |
python -m build
- name: upload
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
twine upload dist/*
================================================
FILE: .github/workflows/test.yml
================================================
name: test
on:
push: {branches: [master]} # pushes to master
pull_request: {} # all PRs
jobs:
pytest:
strategy:
matrix:
python-version: ['3.10', '3.12']
os: ['ubuntu-latest', 'windows-latest', 'macos-latest']
runs-on: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Dependencies
run: |
pip install --upgrade -r requirements.txt -r requirements-test.txt
pip install -e '.[all]'
- name: Unit Test
if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'
run: |
pip install pytest
pytest test/util.py test/metadata.py test/integration/dummy.py test/integration/vaswani.py test/formats/ test/test_defaulttext.py
- name: Unit Test (Windows)
if: matrix.os == 'windows-latest'
shell: cmd
run: |
pip install pytest
pytest test\util.py test\metadata.py test\integration\dummy.py test\integration\vaswani.py test\formats\ test\test_defaulttext.py
env:
PATH: 'C:/Program Files/zlib/bin/'
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.DS_Store
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
recursive-include ir_datasets *.yaml
recursive-include ir_datasets *.bib
recursive-include ir_datasets *.json
================================================
FILE: README.md
================================================
# ir_datasets
`ir_datasets` is a python package that provides a common interface to many IR ad-hoc ranking
benchmarks, training datasets, etc.
The package takes care of downloading datasets (including documents, queries, relevance judgments,
etc.) when available from public sources. Instructions on how to obtain datasets are provided when
they are not publicly available.
`ir_datasets` provides a common iterator format to allow them to be easily used in python. It
attempts to provide the data in an unaltered form (i.e., keeping all fields and markup), while
handling differences in file formats, encoding, etc. Adapters provide extra functionality, e.g., to
allow quick lookups of documents by ID.
A command line interface is also available.
You can find a list of datasets and their features [here](https://ir-datasets.com/).
Want a new dataset, added functionality, or a bug fixed? Feel free to post an issue or make a pull request!
## Getting Started
For a quick start with the Python API, check out our Colab tutorials:
[Python](https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets.ipynb)
[Command Line](https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets_cli.ipynb)
Install via pip:
```
pip install ir_datasets
```
If you want the main branch, you install as such:
```
pip install git+https://github.com/allenai/ir_datasets.git
```
If you want to run an editable version locally:
```
$ git clone https://github.com/allenai/ir_datasets
$ cd ir_datasets
$ pip install -e .
```
Tested with python versions 3.7, 3.8, 3.9, and 3.10. (Minimum python version is 3.7.)
## Features
**Python and Command Line Interfaces**. Access datasets both through a simple Python API and
via the command line.
```python
import ir_datasets
dataset = ir_datasets.load('msmarco-passage/train')
# Documents
for doc in dataset.docs_iter():
print(doc)
# GenericDoc(doc_id='0', text='The presence of communication amid scientific minds was equa...
# GenericDoc(doc_id='1', text='The Manhattan Project and its atomic bomb helped bring an en...
# ...
```
```bash
ir_datasets export msmarco-passage/train docs | head -n2
0 The presence of communication amid scientific minds was equally important to the success of the Manh...
1 The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peacefu...
```
**Automatically downloads source files** (when available). Will download and verify the source
files for queries, documents, qrels, etc. when they are publicly available, as they are needed.
A CI build checks weekly to ensure that all the downloadable content is available and correct:
[](https://github.com/seanmacavaney/ir-datasets.com/actions/workflows/verify_downloads.yml).
We mirror some troublesome files on [mirror.ir-datasets.com](https://mirror.ir-datasets.com/), and
automatically switch to the mirror when the original source is not available.
```python
import ir_datasets
dataset = ir_datasets.load('msmarco-passage/train')
for doc in dataset.docs_iter(): # Will download and extract MS-MARCO's collection.tar.gz the first time
...
for query in dataset.queries_iter(): # Will download and extract MS-MARCO's queries.tar.gz the first time
...
```
**Instructions for dataset access** (when not publicly available). Provides instructions on how
to get a copy of the data when it is not publicly available online (e.g., when it requires a
data usage agreement).
```python
import ir_datasets
dataset = ir_datasets.load('trec-arabic')
for doc in dataset.docs_iter():
...
# Provides the following instructions:
# The dataset is based on the Arabic Newswire corpus. It is available from the LDC via:
# To proceed, symlink the source file here: [gives path]
```
**Support for datasets big and small**. By using iterators, supports large datasets that may
not fit into system memory, such as ClueWeb.
```python
import ir_datasets
dataset = ir_datasets.load('clueweb09')
for doc in dataset.docs_iter():
... # will iterate through all ~1B documents
```
**Fixes known dataset issues**. For instance, automatically corrects the document UTF-8 encoding
problem in the MS-MARCO passage collection.
```python
import ir_datasets
dataset = ir_datasets.load('msmarco-passage')
docstore = dataset.docs_store()
docstore.get('243').text
# "John Maynard Keynes, 1st Baron Keynes, CB, FBA (/ˈkeɪnz/ KAYNZ; 5 June 1883 – 21 April [SNIP]"
# Naïve UTF-8 decoding yields double-encoding artifacts like:
# "John Maynard Keynes, 1st Baron Keynes, CB, FBA (/Ë\x88keɪnz/ KAYNZ; 5 June 1883 â\x80\x93 21 April [SNIP]"
# ~~~~~~ ~~ ~~~~~~~~~
```
**Fast Random Document Access.** Builds data structures that allow fast and efficient lookup of
document content. For large datasets, such as ClueWeb, uses
[checkpoint files](https://ir-datasets.com/clueweb_warc_checkpoints.md) to load documents from
source 40x faster than normal. Results are cached for even faster subsequent accesses.
```python
import ir_datasets
dataset = ir_datasets.load('clueweb12')
docstore = dataset.docs_store()
docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516'])
# {'clueweb12-0000tw-05-00014': ..., 'clueweb12-0000tw-05-12119': ..., 'clueweb12-0106wb-18-19516': ...}
```
**Fancy Iter Slicing.** Sometimes it's helpful to be able to select ranges of data (e.g., for processing
document collections in parallel on multiple devices). Efficient implementations of slicing operations
allow for much faster dataset partitioning than using `itertools.islice`.
```python
import ir_datasets
dataset = ir_datasets.load('clueweb12')
dataset.docs_iter()[500:1000] # normal slicing behavior
# WarcDoc(doc_id='clueweb12-0000tw-00-00502', ...), WarcDoc(doc_id='clueweb12-0000tw-00-00503', ...), ...
dataset.docs_iter()[-10:-8] # includes negative indexing
# WarcDoc(doc_id='clueweb12-1914wb-28-24245', ...), WarcDoc(doc_id='clueweb12-1914wb-28-24246', ...)
dataset.docs_iter()[::100] # includes support for skip (only positive values)
# WarcDoc(doc_id='clueweb12-0000tw-00-00000', ...), WarcDoc(doc_id='clueweb12-0000tw-00-00100', ...), ...
dataset.docs_iter()[1/3:2/3] # supports proportional slicing (this takes the middle third of the collection)
# WarcDoc(doc_id='clueweb12-0605wb-28-12714', ...), WarcDoc(doc_id='clueweb12-0605wb-28-12715', ...), ...
```
## Datasets
Available datasets include:
- [ANTIQUE](https://ir-datasets.com/antique.html)
- [AQUAINT](https://ir-datasets.com/aquaint.html)
- [BEIR (benchmark suite)](https://ir-datasets.com/beir.html)
- [TREC CAR](https://ir-datasets.com/car.html)
- [C4](https://ir-datasets.com/c4.html)
- [ClueWeb09](https://ir-datasets.com/clueweb09.html)
- [ClueWeb12](https://ir-datasets.com/clueweb12.html)
- [CLIRMatrix](https://ir-datasets.com/clirmatrix.html)
- [CodeSearchNet](https://ir-datasets.com/codesearchnet.html)
- [CORD-19](https://ir-datasets.com/cord19.html)
- [DPR Wiki100](https://ir-datasets.com/dpr-w100.html)
- [GOV](https://ir-datasets.com/gov.html)
- [GOV2](https://ir-datasets.com/gov2.html)
- [HC4](https://ir-datasets.com/hc4.html)
- [Highwire (TREC Genomics 2006-07)](https://ir-datasets.com/highwire.html)
- [Medline](https://ir-datasets.com/medline.html)
- [MSMARCO (document)](https://ir-datasets.com/msmarco-document.html)
- [MSMARCO (passage)](https://ir-datasets.com/msmarco-passage.html)
- [MSMARCO (QnA)](https://ir-datasets.com/msmarco-qna.html)
- [Natural Questions](https://ir-datasets.com/natural-questions.html)
- [NFCorpus (NutritionFacts)](https://ir-datasets.com/nfcorpus.html)
- [NYT](https://ir-datasets.com/nyt.html)
- [PubMed Central (TREC CDS)](https://ir-datasets.com/pmc.html)
- [TREC Arabic](https://ir-datasets.com/trec-arabic.html)
- [TREC Fair Ranking 2021](https://ir-datasets.com/trec-fair-2021.html)
- [TREC Mandarin](https://ir-datasets.com/trec-mandarin.html)
- [TREC Robust 2004](https://ir-datasets.com/trec-robust04.html)
- [TREC Spanish](https://ir-datasets.com/trec-spanish.html)
- [TripClick](https://ir-datasets.com/tripclick.html)
- [Tweets 2013 (Internet Archive)](https://ir-datasets.com/tweets2013-ia.html)
- [Vaswani](https://ir-datasets.com/vaswani.html)
- [Washington Post](https://ir-datasets.com/wapo.html)
- [WikIR](https://ir-datasets.com/wikir.html)
There are "subsets" under each dataset. For instance, `clueweb12/b13/trec-misinfo-2019` provides the
queries and judgments from the [2019 TREC misinformation track](https://trec.nist.gov/data/misinfo2019.html),
and `msmarco-document/orcas` provides the [ORCAS dataset](https://microsoft.github.io/msmarco/ORCAS). They
tend to be organized with the document collection at the top level.
See the ir_datasets docs ([ir-datasets.com](https://ir-datasets.com/)) for details about each
dataset, its available subsets, and what data they provide.
## Environment variables
- `IR_DATASETS_HOME`: Home directory for ir_datasets data (default `~/.ir_datasets/`). Contains directories
for each top-level dataset.
- `IR_DATASETS_TMP`: Temporary working directory (default `/tmp/ir_datasets/`).
- `IR_DATASETS_DL_TIMEOUT`: Download stream read timeout, in seconds (default `15`). If no data is received
within this duration, the connection will be assumed to be dead, and another download may be attempted.
- `IR_DATASETS_DL_TRIES`: Default number of download attempts before exception is thrown (default `3`).
When the server accepts Range requests, uses them. Otherwise, will download the entire file again.
- `IR_DATASETS_DL_DISABLE_PBAR`: Set to `true` to disable the progress bar for downloads. Useful in settings
where an interactive console is not available.
- `IR_DATASETS_DL_SKIP_SSL`: Set to `true` to disable checking SSL certificates when downloading files.
Useful as a short-term solution when SSL certificates expire or are otherwise invalid. Note that this
does not disable hash verification of the downloaded content.
- `IR_DATASETS_SKIP_DISK_FREE`: Set to `true` to disable checks for enough free space on disk before
downloading content or otherwise creating large files.
- `IR_DATASETS_SMALL_FILE_SIZE`: The size of files that are considered "small", in bytes. Instructions for
linking small files rather than downloading them are not shown. Defaults to 5000000 (5MB).
## Citing
When using datasets provided by this package, be sure to properly cite them. Bibtex for each dataset
can be found on the [datasets documentation page](https://ir-datasets.com/).
If you use this tool, please cite [our SIGIR resource paper](https://arxiv.org/pdf/2103.02280.pdf):
```
@inproceedings{macavaney:sigir2021-irds,
author = {MacAvaney, Sean and Yates, Andrew and Feldman, Sergey and Downey, Doug and Cohan, Arman and Goharian, Nazli},
title = {Simplified Data Wrangling with ir_datasets},
year = {2021},
booktitle = {SIGIR}
}
```
## Credits
Contributors to this repository:
- Sean MacAvaney (University of Glasgow)
- Shuo Sun (Johns Hopkins University)
- Thomas Jänich (University of Glasgow)
- Jan Heinrich Reimer (Martin Luther University Halle-Wittenberg)
- Maik Fröbe (Martin Luther University Halle-Wittenberg)
- Eugene Yang (Johns Hopkins University)
- Augustin Godinot (NAVERLABS Europe, ENS Paris-Saclay)
================================================
FILE: examples/adding_datasets.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ir_datasets - Adding Datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This tutorial covers the process for adding a new dataset to the `ir_datasets` package.\n",
"\n",
"This tutorial is for datasets that are inteded to be added to the main package. For an example of an extension, see [this example extension](https://github.com/seanmacavaney/dummy-irds-ext).\n",
"\n",
"Before starting, we recommend [opening an issue](https://github.com/allenai/ir_datasets/issues/new/choose) so various decisions about how to support the dataset can be discussed.\n",
"\n",
"There are four files involved in adding a dataset to the `ir_datasets` package:\n",
" - `ir_datasets/datasets/[dataset-id].py` - Contains the definition of the dataset and any specialized code for handling it.\n",
" - `ir_datasets/etc/downloads.json` - Contains information about how to download and verify dataset source files.\n",
" - `ir_datasets/docs/[dataset-id].yaml` - Contains documentation of the dataset.\n",
" - `test/integration/[dataset-id].py` - Contains automated tests to ensure the dataset is processed as expected.\n",
" \n",
"We will now show examples of each of these files for a toy dataset called `dummy`, with files hosted here: https://github.com/seanmacavaney/dummy-irds-ext/tree/master/data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File: `ir_datasets/datasets/dummy.py`\n",
"\n",
"```python\n",
"import ir_datasets\n",
"from ir_datasets.formats import TsvDocs, TsvQueries, TrecQrels\n",
"\n",
"# A unique identifier for this dataset. This should match the file name (with \"-\" instead of \"_\")\n",
"NAME = 'dummy'\n",
"\n",
"# What do the relevance levels in qrels mean?\n",
"QREL_DEFS = {\n",
" 1: 'relevant',\n",
" 0: 'not relevant',\n",
"}\n",
"\n",
"# This message is shown to the user before downloads are started\n",
"DUA = 'Please confirm that you agree to the data usage agreement at '\n",
"\n",
"# An initialization function is used to keep the namespace clean\n",
"def _init():\n",
" # The directory where this dataset's data files will be stored\n",
" base_path = ir_datasets.util.home_path() / NAME\n",
" \n",
" # Load an object that is used for providing the documentation\n",
" documentation = YamlDocumentation(f'docs/{NAME}.yaml')\n",
" \n",
" # A reference to the downloads file, under the key \"dummy\". (DLC stands for DownLoadable Content)\n",
" dlc = DownloadConfig.context(NAME, base_path, dua=DUA)\n",
" \n",
" # How to process the documents. Since they are in a typical TSV format, we'll use TsvDocs.\n",
" # Note that other dataset formats may require you to write a custom docs handler (BaseDocs).\n",
" # Note that this doesn't process the documents now; it just defines how they are processed.\n",
" docs = TsvDocs(dlc['docs'], namespace=NAME, lang='en')\n",
" \n",
" # How to process the queries. Similar to the documents, you may need to write a custom\n",
" # queries handler (BaseQueries).\n",
" queries = TsvQueries(dlc['queries'], namespace=NAME, lang='en')\n",
" \n",
" # Qrels: The qrels file is in the TREC format, so we'll use TrecQrels to process them\n",
" qrels = TrecQrels(dlc['qrels'], QREL_DEFS)\n",
" \n",
" # Package the docs, queries, qrels, and documentation into a Dataset object\n",
" dataset = Dataset(docs, queries, qrels, documentation('_'))\n",
" \n",
" # Register the dataset in ir_datasets\n",
" ir_datasets.registry.register(NAME, dataset)\n",
" \n",
" return dataset # used for exposing dataset to the namespace\n",
"\n",
"dataset = _init()\n",
"```\n",
"\n",
"Note that you also need to add this file to `ir_datasets/datasets/__init__.py`:\n",
"\n",
"```python\n",
"from . import dummy\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File: `ir_datasets/etc/downloads.json`\n",
"\n",
"(add lines like these to the file)\n",
"\n",
"```json\n",
"\"dummy\": {\n",
" \"docs\": {\n",
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv\",\n",
" \"expected_md5\": \"c7bb5a1a3a07d51de50e8414245c2be4\",\n",
" \"cache_path\": \"docs.tsv\"\n",
" },\n",
" \"queries\": {\n",
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv\",\n",
" \"expected_md5\": \"08ba86d990cbe6890f727946346964db\",\n",
" \"cache_path\": \"queries.tsv\"\n",
" },\n",
" \"qrels\": {\n",
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels\",\n",
" \"expected_md5\": \"79ed359fe0afa0f67eb39f468d162920\",\n",
" \"cache_path\": \"qrels\"\n",
" }\n",
"}\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File: `ir_datasets/docs/dummy.yaml`\n",
"\n",
"```yaml\n",
"_: # matches documentation key above\n",
" pretty_name: 'Dummy' # a more human-readable way to present this dataset than the dataset-id\n",
" desc: '\n",
"\n",
"HTML-encoded and human-readable information about this dataset.\n",
"Include a brief description of the dataset.\n",
"Be sure to include important decisions made when processing it.\n",
"Also, link to more information, e.g. websites, papers, etc.\n",
"
\n",
"' \n",
" bibtex: |\n",
" @misc{dummy,\n",
" title={Dummy: a made-up dataset},\n",
" year={2021}\n",
" }\n",
"```\n",
"\n",
"To generate the HTML documentation files, run `python -m ir_datasets documentation`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File: `test/integration/dummy.py`\n",
"\n",
"```python\n",
"from ir_datasets.formats import GenericQuery, GenericDoc, TrecQrel\n",
"from .base import DatasetIntegrationTest\n",
"\n",
"class TestDummy(DatasetIntegrationTest):\n",
" def test_docs(self):\n",
" # Test that the dataset 'dummy' has 15 documents, and test the specific docs at indices 0, 9, and 14\n",
" self._test_docs('dummy', count=15, items={\n",
" 0: GenericDoc('T1', 'CUT, CAP AND BALANCE. TAXED ENOUGH ALREADY!'),\n",
" 9: GenericDoc('T10', 'Perhaps this is the kind of thinking we need in Washington ...'),\n",
" 14: GenericDoc('T15', \"I've been visiting Trump Int'l Golf Links Scotland and the course will be unmatched anywhere in the world. Spectacular!\"),\n",
" })\n",
"\n",
" def test_queries(self):\n",
" # Test that the dataset 'dummy' has 4 queries, and test the specific queries at indices 0 and 3\n",
" self._test_queries('dummy', count=4, items={\n",
" 0: GenericQuery('1', 'republican party'),\n",
" 3: GenericQuery('4', 'media'),\n",
" })\n",
"\n",
" def test_qrels(self):\n",
" # Test that the dataset 'dummy' has 60 qrels, and test the specific qrels at indices 0, 9, and 59\n",
" self._test_qrels('dummy', count=60, items={\n",
" 0: TrecQrel('1', 'T1', 0, '0'),\n",
" 9: TrecQrel('1', 'T10', 0, '0'),\n",
" 59: TrecQrel('4', 'T15', 0, '0'),\n",
" })\n",
"```\n",
"\n",
"Note that within a DatasetIntegrationTest, you can use `self._build_test_docs('dummy')`, `self._build_test_queries('dummy')`, `self._build_test_qrels('dummy')` to generate sample test cases. But be sure to check that the tests they generate are properly processed, and feel free to add additional test cases, especially to test dataset-specific handlers."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: examples/clirmatrix_example.py
================================================
"""Example usage of the CLIRMatrix datasets in ir_datasets.

Dataset name pattern:
    clirmatrix/[query language code]/[dataset]/[doc language code]/[split]

options:
--------
dataset: bi139-base/bi139-full/multi8
supported query/doc language codes:
bi139-base/bi139-full: ['af', 'als', 'am', 'an', 'ar', 'arz', 'ast', 'az', 'azb', 'ba', 'bar', 'be', 'bg', 'bn', 'bpy', 'br', 'bs', 'bug', 'ca', 'cdo', 'ce', 'ceb', 'ckb', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'he', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ilo', 'io', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'li', 'lmo', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'my', 'mzn', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc', 'or', 'os', 'pa', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'ro', 'ru', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', 'tg', 'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vec', 'vi', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zh']
multi8: ['ar', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'zh']
split: train/dev/test1/test2
"""
import ir_datasets

# Reference python notebook:
# https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets.ipynb#scrollTo=n7mY16MRH0hx


def _show_first(dataset_id):
    """Load `dataset_id` and print the document of its first qrel and its first query.

    NOTE: the first call for a given dataset triggers a download of the
    underlying CLIRMatrix files, which may take a while.
    """
    dataset = ir_datasets.load(dataset_id)
    docstore = dataset.docs_store()
    # Print the document referenced by the first qrel entry.
    for qrel in dataset.qrels_iter():
        print(docstore.get(qrel.doc_id))
        break
    # Print the first query.
    for query in dataset.queries_iter():
        print(query)
        break


# examples
_show_first("clirmatrix/en/bi139-base/zh/test1")
_show_first("clirmatrix/en/multi8/zh/train")
_show_first("clirmatrix/an/bi139-full/zh/dev")
================================================
FILE: examples/ir_datasets.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "ir-datasets.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "snL2s_xoHpph"
},
"source": [
"# ir_datasets - Tutorial"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "n7mY16MRH0hx"
},
"source": [
"## Getting Started\n",
"\n",
"We'll start out by installing the package. The package is available on pypi,\n",
"so you can install it with your favorite package manager."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "M_6mg0PbHaFD",
"outputId": "0764869d-bb51-4a9e-edb2-35c9cf56a876"
},
"source": [
"!pip install ir_datasets"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: ir_datasets in /usr/local/lib/python3.6/dist-packages (0.2.0)\n",
"Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (5.4.1)\n",
"Requirement already satisfied: trec-car-tools>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (2.5.3)\n",
"Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.1.3)\n",
"Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (2.23.0)\n",
"Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (1.19.5)\n",
"Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.6.2)\n",
"Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.41.1)\n",
"Requirement already satisfied: lz4>=3.1.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (3.1.3)\n",
"Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.6.3)\n",
"Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.2.5)\n",
"Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.2.3)\n",
"Requirement already satisfied: typing>=3.6.2 in /usr/local/lib/python3.6/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (3.7.4.3)\n",
"Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (1.0.0)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (1.24.3)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (2020.12.5)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (2.10)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DH_aBA7hIDZ4"
},
"source": [
"You can now load up your favorite dataset. You can find the full listing of datasets [here](https://ir-datasets.com/all.html). Here's an example for `cord19/trec-covid`:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "dFIuPyqdHVQ0"
},
"source": [
"import ir_datasets\n",
"dataset = ir_datasets.load('cord19/trec-covid')"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ILomHf8CIdOf"
},
"source": [
"## Documents\n",
"\n",
"`doc` entities map a `doc_id` to one or more text fields.\n",
"\n",
"Let's see how many documents are in this collection. The first time you run this command, it will need to download and process the collection, which may take some time:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "v3rCW-JUHpFz",
"outputId": "c2cba6ee-3f55-4369-de41-17972b570ad8"
},
"source": [
"dataset.docs_count()"
],
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"192509"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bd2f31HzI2s5"
},
"source": [
"Now let's see some docments. You can iterate through the documents in the collection using `docs_iter`. Since there's so many, we'll just look at the top 10:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "odfCkvALHXzz",
"outputId": "3f4241e6-7828-4fc1-d18b-9610b7874eec"
},
"source": [
"for doc in dataset.docs_iter()[:10]:\n",
" print(doc)"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. 
CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.')\n",
"Cord19Doc(doc_id='02tnwd4m', title='Nitric oxide: a pro-inflammatory mediator in lung disease?', doi='10.1186/rr14', date='2000-08-15', abstract='Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung.')\n",
"Cord19Doc(doc_id='ejv2xln0', title='Surfactant protein-D and pulmonary host defense', doi='10.1186/rr19', date='2000-08-25', abstract='Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.')\n",
"Cord19Doc(doc_id='2b73a28n', title='Role of endothelin-1 in lung disease', doi='10.1186/rr44', date='2001-02-22', abstract='Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.')\n",
"Cord19Doc(doc_id='9785vg6d', title='Gene expression in epithelial cells in response to pneumovirus infection', doi='10.1186/rr61', date='2001-05-11', abstract='Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.')\n",
"Cord19Doc(doc_id='zjufx4fo', title='Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis', doi='10.1093/emboj/20.24.7220', date='2001-12-17', abstract='Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to different sequences (‘bodies’) derived from the 3′ end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader–body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.')\n",
"Cord19Doc(doc_id='5yhe786e', title='Debate: Transfusing to normal haemoglobin levels will not improve outcome', doi='10.1186/cc987', date='2001-03-08', abstract='Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.')\n",
"Cord19Doc(doc_id='8zchiykl', title='The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001', doi='10.1186/cc1013', date='2001-05-02', abstract=\"The 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\")\n",
"Cord19Doc(doc_id='8qnrcgnk', title='Heme oxygenase-1 and carbon monoxide in pulmonary medicine', doi='10.1186/1465-9921-4-7', date='2003-08-07', abstract='Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.')\n",
"Cord19Doc(doc_id='jg13scgo', title='Technical Description of RODS: A Real-time Public Health Surveillance System', doi='10.1197/jamia.m1345', date='2003-09-01', abstract='This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states—Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lUB0AUfWJESJ"
},
"source": [
"You can see each document is represented as a `Cord19Doc`, which is a `namedtuple`. Named tuples are a light-weight data structure that consists of a pre-defined sequence of named fields.\n",
"\n",
"If you want more information aobut what document fields are available in this collection, you can\n",
"[check the documentation](https://ir-datasets.com/cord19.html#cord19) or inspect the dataset's `docs_cls()`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Cej2STMCI_eh",
"outputId": "55e06f14-390f-4dce-9ba2-576a33c50b6c"
},
"source": [
"dataset.docs_cls()"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ir_datasets.datasets.cord19.Cord19Doc"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "p2BxO7-vJWg7",
"outputId": "ea763b20-59e6-4d97-b28b-0f7bbabb2f65"
},
"source": [
"dataset.docs_cls()._fields"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('doc_id', 'title', 'doi', 'date', 'abstract')"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2ORa7nztJXyq",
"outputId": "a2e836be-ebe4-4f71-f272-2536961ef271"
},
"source": [
"dataset.docs_cls().__annotations__"
],
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"OrderedDict([('doc_id', str),\n",
" ('title', str),\n",
" ('doi', str),\n",
" ('date', str),\n",
" ('abstract', str)])"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jwcOyKP5Juct"
},
"source": [
"Did you notice the `[:10]` above? We can do all sorts of fancy slicing on document iterators. Here, we select every other document from the top 10:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VDYPMpOVJZmM",
"outputId": "402d5201-ceb9-4bcb-a985-e0052d650994"
},
"source": [
"for doc in dataset.docs_iter()[:10:2]:\n",
" print(doc)"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. 
CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.')\n",
"Cord19Doc(doc_id='ejv2xln0', title='Surfactant protein-D and pulmonary host defense', doi='10.1186/rr19', date='2000-08-25', abstract='Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.')\n",
"Cord19Doc(doc_id='9785vg6d', title='Gene expression in epithelial cells in response to pneumovirus infection', doi='10.1186/rr61', date='2001-05-11', abstract='Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.')\n",
"Cord19Doc(doc_id='5yhe786e', title='Debate: Transfusing to normal haemoglobin levels will not improve outcome', doi='10.1186/cc987', date='2001-03-08', abstract='Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.')\n",
"Cord19Doc(doc_id='8qnrcgnk', title='Heme oxygenase-1 and carbon monoxide in pulmonary medicine', doi='10.1186/1465-9921-4-7', date='2003-08-07', abstract='Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iizcVwqmJ-TW"
},
"source": [
"Or the last 10 documents:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iVOvbOGOJ4A6",
"outputId": "ca1daed3-3394-472f-8bbd-60a3e2faf0a3"
},
"source": [
"for doc in dataset.docs_iter()[-10:]:\n",
" print(doc)"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"Cord19Doc(doc_id='7e8r61e7', title='Can Pediatric COVID-19 Testing Sensitivity Be Improved With Sequential Tests?', doi='10.1213/ane.0000000000004982', date='2020-05-26', abstract='')\n",
"Cord19Doc(doc_id='r3ud8t8w', title='rAre graphene and graphene-derived products capable of preventing COVID-19 infection?', doi='10.1016/j.mehy.2020.110031', date='2020-06-24', abstract=\"The Severe Acute Respiratory Syndrome CoronaVirus 2 (SARS-CoV-2) causes the new coronavirus disease 2019 (COVID-19). This disease is a severe respiratory tract infection that spread rapidly around the world. In this pandemic situation, the researchers' effort is to understand the targets of the virus, mechanism of their cause, and transmission from animal to human and vice-versa. Therefore, to support COVID-19 research and development, we have proposed approaches based on graphene and graphene-derived nanomaterials against COVID-19.\")\n",
"Cord19Doc(doc_id='6jittbis', title='Heterogeneity and plasticity of porcine alveolar macrophage and pulmonary interstitial macrophage isolated from healthy pigs in vitro', doi='10.1242/bio.046342', date='2019-10-15', abstract='This study investigated the heterogeneity and plasticity of porcine alveolar macrophages (PAM) and pulmonary interstitial macrophages (IM) isolated from healthy pigs, including phenotype, function and gene expression. Dynamic changes of nitric oxide (NO) levels secreted by PAM and IM with stimulation of different doses of lipopolysaccharide (LPS) were investigated by Griess method, and the viability of the PAM and IM cells was investigated by MTT assay. Flow cytometry, fluorescence quantitative PCR and ELISA techniques were used to measure cell phenotype, gene expression and cytokine secretion, respectively. The PAM and IM cells in normal healthy pigs showed heterogeneity with 95.42±1.51% and 31.99±5.84% of CD163+ macrophage, respectively. The NO level in IM was significantly higher versus PAM after LPS treatment. Consistently, the ratio of Arg I/iNOS in IM was much lower than that in PAM, suggesting that the PAM belong to M2 macrophages and the IM belong to M1 macrophages. The PAM and IM cells in normal healthy pigs also showed plasticity. The Arg I/iNOS ratio and TIMP1/MMP12 ratio were significantly decreased in LPS- or LPS+IFNγ-treated PAM and IM, suggesting that cells were polarized towards M1 macrophages under LPS or LPS+IFNγ stimulation. On the contrary, IL-4 and IL-13 stimulation on PAM and IM lead to M2 polarization. A similar result was found in IL-1β gene expression and TNFα secretion. In conclusion, porcine macrophages have shown heterogeneity and plasticity on polarization under the stimulation of LPS, IFNγ, IL-4 and IL-13.')\n",
"Cord19Doc(doc_id='kaku49xd', title='Review of Current Advances in Serologic Testing for COVID-19', doi='10.1093/ajcp/aqaa112', date='2020-06-25', abstract='OBJECTIVES: To examine and summarize the current literature on serologic methods for the detection of antibodies to severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). METHODS: A literature review was performed using searches in databases including PubMed, medRxiv, and bioRxiv. Thirty-two peer-reviewed papers and 23 preprints were examined. RESULTS: The studies included lateral flow immunoassay, enzyme-linked immunosorbent assay, chemiluminescence immunoassay, and neutralizing antibody assays. The use of all major SARS-CoV-2 antigens was demonstrated to have diagnostic value. Assays measuring total antibody reactivity had the highest sensitivity. In addition, all the methods provided opportunities to characterize the humoral immune response by isotype. The combined use of IgM and IgG detection resulted in a higher sensitivity than that observed when detecting either isotype alone. Although IgA was rarely studied, it was also demonstrated to be a sensitive marker of infection, and levels correlated with disease severity and neutralizing activity. CONCLUSIONS: The use of serologic testing, in conjunction with reverse transcription polymerase chain reaction testing, was demonstrated to significantly increase the sensitivity of detection of patients infected with SARS-CoV-2. There was conflicting evidence regarding whether antibody titers correlated with clinical severity. However, preliminary investigations indicated some immunoassays may be a surrogate for the prediction of neutralizing antibody titers and the selection of recovered patients for convalescent serum donation.')\n",
"Cord19Doc(doc_id='ni94qi4r', title='Liver tests abnormalities in COVID-19: trick or treat?', doi='10.1016/j.jhep.2020.05.033', date='2020-05-27', abstract='')\n",
"Cord19Doc(doc_id='z4ro6lmh', title='Rapid radiological improvement of COVID-19 pneumonia after treatment with tocilizumab', doi='10.1007/s15010-020-01449-w', date='2020-06-15', abstract='')\n",
"Cord19Doc(doc_id='hi8k8wvb', title='SARS E protein in phospholipid bilayers: an anomalous X-ray reflectivity study', doi='10.1016/j.physb.2004.11.015', date='2005-02-28', abstract='Abstract We report on an anomalous X-ray reflectivity study to locate a labelled residue of a membrane protein with respect to the lipid bilayer. From such experiments, important constraints on the protein or peptide conformation can be derived. Specifically, our aim is to localize an iodine-labelled phenylalanine in the SARS E protein, incorporated in DMPC phospholipid bilayers, which are deposited in the form of thick multilamellar stacks on silicon surfaces. Here, we discuss the experimental aspects and the difficulties associated with the Fourier synthesis analysis that gives the electron density profile of the membranes.')\n",
"Cord19Doc(doc_id='ma3ndg41', title='Italian Society of Interventional Cardiology (GISE) position paper for Cath lab‐specific preparedness recommendations for healthcare providers in case of suspected, probable or confirmed cases of COVID‐19', doi='10.1002/ccd.28888', date='2020-04-11', abstract='COVID‐19 pandemic raised the issue to guarantee the proper level of care to patients with acute cardiovascular diseases and concomitant suspected or confirmed COVID‐19 and, in the meantime safety and protection of healthcare providers. The aim of this position paper is to provide standards to healthcare facilities and healthcare providers on infection prevention and control measures during the management of suspected and confirmed cases of 2019‐nCoV infection accessing in cath‐lab. The document represents the view of the Italian Society of Interventional Cardiology (GISE), and it is based on recommendations from the main World and European Health Organizations (WHO, and ECDC) as well as from the Italian Society of Anesthesia, Analgesia, Resuscitation and Intensive Care (SIAARTI).')\n",
"Cord19Doc(doc_id='wh10285j', title=\"Nimble, Together: A Training Program's Response to the COVID-19 Pandemic\", doi='10.1097/sla.0000000000003994', date='2020-04-29', abstract='')\n",
"Cord19Doc(doc_id='pnl9th2c', title='Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', doi='10.1016/j.ejvs.2020.04.040', date='2020-05-12', abstract='')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nm8hUpVWKOwM"
},
"source": [
"You can also select by percentages, e.g., `[:1/3]` selects the first third, `[1/3:2/3]` selects the second third, and `[2/3:]` selects the final third. This is handy when splitting document processing across processes, machines, or GPUs.\n",
"\n",
"These slices are smart: they avoid processing each document in the collection and jump to the right position in the source files to process."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kZjhG-5XKqPR"
},
"source": [
"Now let's say you know a document's ID and want to find its text. You can use `docs_store()` to accomplish this."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hXp1nxooJ6uP",
"outputId": "fc52d452-754c-42fd-ae3f-2f37364c4462"
},
"source": [
"docstore = dataset.docs_store()\n",
"docstore.get('3wuh6k6g')"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining')"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "m7IN1f9_LMS1"
},
"source": [
"Or, a list of IDs. Maybe you're re-ranking these documents."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oXvKBt6-LPtS",
"outputId": "733320e0-2762-44ba-ce4c-226295d7878d"
},
"source": [
"docstore.get_many(['ax6v6ham', '44l5q07k', '8xm0kacj', '3wuh6k6g', 'fiievwy7'])"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'3wuh6k6g': Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining'),\n",
" '44l5q07k': Cord19Doc(doc_id='44l5q07k', title='Rôle des animaux vertébrés dans l’épidémiologie des zoonoses', doi='10.1016/s1773-035x(15)30110-6', date='2015-05-31', abstract='Résumé Les zoonoses, distinguées ici des maladies humaines d’origine animale, représentent un ensemble d’entités pathologiques dont les agents responsables circulent régulièrement entre l’espèce humaine et de nombreuses espèces de vertébrés. L’analyse de divers exemples, quelles que soient les voies de transmission et les causes favorisantes de la contamination, met en avant une régulière rareté du passage direct du réservoir animal vers l’espèce humaine, à opposer à la diversité et surtout à la gravité possible des évolutions sanitaires ultérieures possibles, parfois liées à des comportements humains peu adaptés. D’un point de vue pratique, il semblerait plus pertinent de se pencher d’abord sur un meilleur contrôle de la diffusion des agents pathogènes au sein des populations humaines que d’agir a priori sur le réservoir animal, qu’il soit domestique ou sauvage, source potentielle de ces agents pathogènes mais selon des modalités difficiles à anticiper. La relation entre biodiversité et santé est abordée à l’occasion de la discussion. Summary Zoonoses, to be distinguished here from human diseases with an animal origin, represent a large quantity of pathological entities the corresponding pathogens of which are regularly shared between human beings and many different vertebrates species. Working on selected examples and situations, whatever the contamination routes and the facilitating reasons, puts into light a real rarity of a direct transmission from the animal reservoir to human beings. On the opposite, the diversity and the possible severity of some of the sanitary consequences, quite often in relation to human behaviours, must be stressed. 
On a practical point of view, it seems more adapted to try first to improve the control of pathogens diffusion within human populations after a contamination than to try to start to work first on the reservoir, be it domestic or wild, the potential source of these pathogens, but following routes quite difficult to anticipate. The relationship between biodiversity and health is discussed.'),\n",
" '8xm0kacj': Cord19Doc(doc_id='8xm0kacj', title='Host range of SARS-CoV-2 and implications for public health', doi='10.1016/s2666-5247(20)30069-0', date='2020-06-18', abstract=''),\n",
" 'ax6v6ham': Cord19Doc(doc_id='ax6v6ham', title='Close relationship between SARS-coronavirus and group 2 coronavirus.', doi='', date='2006', abstract='The sudden appearance and potential lethality of severe acute respiratory syndrome (SARS)-associated coronavirus (SARS-CoV) in humans has resulted in a focusing of new attention on the determination of both its origins and evolution. The relationship existing between SARS-CoV and other groups of coronaviruses was determined via analyses of phylogenetic trees and comparative genomic analyses of the coronavirus genes: polymerase (Orf1ab), spike (S), envelope (E), membrane (M) and nucleocapsid (N). Although the coronaviruses are traditionally classed into 3 groups, with SARS-CoV forming a 4th group, the phylogenetic position and origins of SARS-CoV remain a matter of some controversy. Thus, we conducted extensive phylogenetic analyses of the genes common to all coronavirus groups, using the Neighbor-joining, Maximum-likelihood, and Bayesian methods. Our data evidenced largely identical topology for all of the obtained phylogenetic trees, thus supporting the hypothesis that the relationship existing between SARS-CoV and group 2 coronavirus is a monophyletic one. Additional comparative genomic studies, including sequence similarity and protein secondary structure analyses, suggested that SARS-CoV may bear a closer relationship with group 2 than with the other coronavirus groups. Although our data strongly suggest that group 2 coronaviruses are most closely related with SARS-CoV, further and more detailed analyses may provide us with an increased amount of information regarding the origins and evolution of the coronaviruses, most notably SARS-CoV.'),\n",
" 'fiievwy7': Cord19Doc(doc_id='fiievwy7', title='SARS-CoV-2 will continue to circulate in the human population: an opinion from the point of view of the virus-host relationship', doi='10.1007/s00011-020-01352-y', date='2020-04-30', abstract='At the population level, the virus-host relationship is not set up to end with the complete elimination of either or both. Pathogen-resistant individuals will always remain in the host population. In turn, the virus can never completely eliminate the host population, because evolutionarily such an event is a dead end for the virus as an obligate intracellular parasite. A certain existential balance exists in the virus-host relationship. Against this backdrop, viral epidemics and pandemics only become manifest and egregious to human beings when tens and hundreds of thousands of people die and the question emerges what caused the high mortality peaks on the death chart. The answer seems clear; the emerging strain of the virus is new to the host population, and new mutations of the virus and natural selection will lead to a survival of only genetically resistant individuals in a host population. The dangers inherent to a novel virus are due to new features generally inthe molecular structure of proteins, which enable the virus to infect the cells of the host organism more intensively, dramatically challenging host immunity, and thus be transmitted more readily in the host population. In this article, we will concentrate on the facts currently available about severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which has caused COVID-19 (coronavirus disease 2019) pandemic and try to predict its development and consequences based on the virus-host relationship. 
In fact, only two scenarios will occur simultaneously in the very near future: people who are genetically resistant to the virus will get sick, recover, and develop immunity, while people who are sensitive to the virus will need drugs and vaccines, which will have to be researched and developed if they are to recover. If the pandemic does not stop, in a few decades it is anticipated that SARS-CoV-2 will become as safe as the four non-severe acute respiratory syndrome human coronaviruses (HCoV-NL63, HCoV-HKU1, HCoV-OC43, and HCoV-229E) currently circulating but causing low mortality in the human population.')}"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "d_oB4z4ALcO0"
},
"source": [
"If you don't care about the order they are returned in, you can use `get_many_iter()`. This avoids keeping all the results in memory, and reads them in the order in which they appear on disk."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uBIivxhyK2kz",
"outputId": "3423a7cd-e9ca-448c-e46b-98a057153601"
},
"source": [
"for doc in docstore.get_many_iter(['ax6v6ham', '44l5q07k', '8xm0kacj', '3wuh6k6g', 'fiievwy7']):\n",
" print(doc)"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining')\n",
"Cord19Doc(doc_id='ax6v6ham', title='Close relationship between SARS-coronavirus and group 2 coronavirus.', doi='', date='2006', abstract='The sudden appearance and potential lethality of severe acute respiratory syndrome (SARS)-associated coronavirus (SARS-CoV) in humans has resulted in a focusing of new attention on the determination of both its origins and evolution. The relationship existing between SARS-CoV and other groups of coronaviruses was determined via analyses of phylogenetic trees and comparative genomic analyses of the coronavirus genes: polymerase (Orf1ab), spike (S), envelope (E), membrane (M) and nucleocapsid (N). Although the coronaviruses are traditionally classed into 3 groups, with SARS-CoV forming a 4th group, the phylogenetic position and origins of SARS-CoV remain a matter of some controversy. Thus, we conducted extensive phylogenetic analyses of the genes common to all coronavirus groups, using the Neighbor-joining, Maximum-likelihood, and Bayesian methods. Our data evidenced largely identical topology for all of the obtained phylogenetic trees, thus supporting the hypothesis that the relationship existing between SARS-CoV and group 2 coronavirus is a monophyletic one. Additional comparative genomic studies, including sequence similarity and protein secondary structure analyses, suggested that SARS-CoV may bear a closer relationship with group 2 than with the other coronavirus groups. Although our data strongly suggest that group 2 coronaviruses are most closely related with SARS-CoV, further and more detailed analyses may provide us with an increased amount of information regarding the origins and evolution of the coronaviruses, most notably SARS-CoV.')\n",
"Cord19Doc(doc_id='fiievwy7', title='SARS-CoV-2 will continue to circulate in the human population: an opinion from the point of view of the virus-host relationship', doi='10.1007/s00011-020-01352-y', date='2020-04-30', abstract='At the population level, the virus-host relationship is not set up to end with the complete elimination of either or both. Pathogen-resistant individuals will always remain in the host population. In turn, the virus can never completely eliminate the host population, because evolutionarily such an event is a dead end for the virus as an obligate intracellular parasite. A certain existential balance exists in the virus-host relationship. Against this backdrop, viral epidemics and pandemics only become manifest and egregious to human beings when tens and hundreds of thousands of people die and the question emerges what caused the high mortality peaks on the death chart. The answer seems clear; the emerging strain of the virus is new to the host population, and new mutations of the virus and natural selection will lead to a survival of only genetically resistant individuals in a host population. The dangers inherent to a novel virus are due to new features generally inthe molecular structure of proteins, which enable the virus to infect the cells of the host organism more intensively, dramatically challenging host immunity, and thus be transmitted more readily in the host population. In this article, we will concentrate on the facts currently available about severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which has caused COVID-19 (coronavirus disease 2019) pandemic and try to predict its development and consequences based on the virus-host relationship. 
In fact, only two scenarios will occur simultaneously in the very near future: people who are genetically resistant to the virus will get sick, recover, and develop immunity, while people who are sensitive to the virus will need drugs and vaccines, which will have to be researched and developed if they are to recover. If the pandemic does not stop, in a few decades it is anticipated that SARS-CoV-2 will become as safe as the four non-severe acute respiratory syndrome human coronaviruses (HCoV-NL63, HCoV-HKU1, HCoV-OC43, and HCoV-229E) currently circulating but causing low mortality in the human population.')\n",
"Cord19Doc(doc_id='44l5q07k', title='Rôle des animaux vertébrés dans l’épidémiologie des zoonoses', doi='10.1016/s1773-035x(15)30110-6', date='2015-05-31', abstract='Résumé Les zoonoses, distinguées ici des maladies humaines d’origine animale, représentent un ensemble d’entités pathologiques dont les agents responsables circulent régulièrement entre l’espèce humaine et de nombreuses espèces de vertébrés. L’analyse de divers exemples, quelles que soient les voies de transmission et les causes favorisantes de la contamination, met en avant une régulière rareté du passage direct du réservoir animal vers l’espèce humaine, à opposer à la diversité et surtout à la gravité possible des évolutions sanitaires ultérieures possibles, parfois liées à des comportements humains peu adaptés. D’un point de vue pratique, il semblerait plus pertinent de se pencher d’abord sur un meilleur contrôle de la diffusion des agents pathogènes au sein des populations humaines que d’agir a priori sur le réservoir animal, qu’il soit domestique ou sauvage, source potentielle de ces agents pathogènes mais selon des modalités difficiles à anticiper. La relation entre biodiversité et santé est abordée à l’occasion de la discussion. Summary Zoonoses, to be distinguished here from human diseases with an animal origin, represent a large quantity of pathological entities the corresponding pathogens of which are regularly shared between human beings and many different vertebrates species. Working on selected examples and situations, whatever the contamination routes and the facilitating reasons, puts into light a real rarity of a direct transmission from the animal reservoir to human beings. On the opposite, the diversity and the possible severity of some of the sanitary consequences, quite often in relation to human behaviours, must be stressed. 
On a practical point of view, it seems more adapted to try first to improve the control of pathogens diffusion within human populations after a contamination than to try to start to work first on the reservoir, be it domestic or wild, the potential source of these pathogens, but following routes quite difficult to anticipate. The relationship between biodiversity and health is discussed.')\n",
"Cord19Doc(doc_id='8xm0kacj', title='Host range of SARS-CoV-2 and implications for public health', doi='10.1016/s2666-5247(20)30069-0', date='2020-06-18', abstract='')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vis7u-VeMX70"
},
"source": [
"## Queries\n",
"\n",
"`queries` (topics) map a `query_id` to one or more text fields. Akint to `docs`, you can iterate over queries for a collection using `queries_iter()`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-Tl3npjZLEWB",
"outputId": "57bb154e-cb8a-4e6e-b57d-7ae3e5261c22"
},
"source": [
"for query in dataset.queries_iter():\n",
" print(query)"
],
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": [
"TrecQuery(query_id='1', title='coronavirus origin', description='what is the origin of COVID-19', narrative=\"seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans\")\n",
"TrecQuery(query_id='2', title='coronavirus response to weather changes', description='how does the coronavirus respond to changes in the weather', narrative='seeking range of information about the SARS-CoV-2 virus viability in different weather/climate conditions as well as information related to transmission of the virus in different climate conditions')\n",
"TrecQuery(query_id='3', title='coronavirus immunity', description='will SARS-CoV2 infected people develop immunity? Is cross protection possible?', narrative='seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained due to infection with other coronavirus types')\n",
"TrecQuery(query_id='4', title='how do people die from the coronavirus', description='what causes death from Covid-19?', narrative='Studies looking at mechanisms of death from Covid-19.')\n",
"TrecQuery(query_id='5', title='animal models of COVID-19', description='what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?', narrative='Papers that describe the results of testing drugs that bind to spike proteins of the virus or any other drugs in any animal models. Papers about SARS-CoV-2 infection in cell culture assays are also relevant.')\n",
"TrecQuery(query_id='6', title='coronavirus test rapid testing', description='what types of rapid testing for Covid-19 have been developed?', narrative='Looking for studies identifying ways to diagnose Covid-19 more rapidly.')\n",
"TrecQuery(query_id='7', title='serological tests for coronavirus', description='are there serological tests that detect antibodies to coronavirus?', narrative='Looking for assays that measure immune response to COVID-19 that will help determine past infection and subsequent possible immunity.')\n",
"TrecQuery(query_id='8', title='coronavirus under reporting', description='how has lack of testing availability led to underreporting of true incidence of Covid-19?', narrative='Looking for studies answering questions of impact of lack of complete testing for Covid-19 on incidence and prevalence of Covid-19.')\n",
"TrecQuery(query_id='9', title='coronavirus in Canada', description='how has COVID-19 affected Canada', narrative='seeking data related to infections (confirm, suspected, and projected) and health outcomes (symptoms, hospitalization, intensive care, mortality)')\n",
"TrecQuery(query_id='10', title='coronavirus social distancing impact', description='has social distancing had an impact on slowing the spread of COVID-19?', narrative=\"seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches\")\n",
"TrecQuery(query_id='11', title='coronavirus hospital rationing', description='what are the guidelines for triaging patients infected with coronavirus?', narrative='Seeking information on any guidelines for prioritizing COVID-19 patients infected with coronavirus based on demographics, clinical signs, serology and other tests.')\n",
"TrecQuery(query_id='12', title='coronavirus quarantine', description='what are best practices in hospitals and at home in maintaining quarantine?', narrative='Seeking information on best practices for activities and duration of quarantine for those exposed and/ infected to COVID-19 virus.')\n",
"TrecQuery(query_id='13', title='how does coronavirus spread', description='what are the transmission routes of coronavirus?', narrative='Looking for information on all possible ways to contract COVID-19 from people, animals and objects')\n",
"TrecQuery(query_id='14', title='coronavirus super spreaders', description='what evidence is there related to COVID-19 super spreaders', narrative='seeking range of information related to the number and proportion of super spreaders, their patterns of behavior that lead to spread, and potential prevention strategies targeted specifically toward super spreaders')\n",
"TrecQuery(query_id='15', title='coronavirus outside body', description='how long can the coronavirus live outside the body', narrative=\"seeking range of information on the SARS-CoV-2's virus's survival in different environments (surfaces, liquids, etc.) outside the human body while still being viable for transmission to another human\")\n",
"TrecQuery(query_id='16', title='how long does coronavirus survive on surfaces', description='how long does coronavirus remain stable on surfaces?', narrative='Studies of time SARS-CoV-2 remains stable after being deposited from an infected person on everyday surfaces in a household or hospital setting, such as through coughing or touching objects.')\n",
"TrecQuery(query_id='17', title='coronavirus clinical trials', description='are there any clinical trials available for the coronavirus', narrative='seeking specific COVID-19 clinical trials ranging from trials in recruitment to completed trials with results')\n",
"TrecQuery(query_id='18', title='masks prevent coronavirus', description='what are the best masks for preventing infection by Covid-19?', narrative='What types of masks should or should not be used to prevent infection by Covid-19?')\n",
"TrecQuery(query_id='19', title='what alcohol sanitizer kills coronavirus', description='what type of hand sanitizer is needed to destroy Covid-19?', narrative='Studies assessing chemicals and their concentrations needed to destroy the Covid-19 virus.')\n",
"TrecQuery(query_id='20', title='coronavirus and ACE inhibitors', description='are patients taking Angiotensin-converting enzyme inhibitors (ACE) at increased risk for COVID-19?', narrative='Looking for information on interactions between coronavirus and angiotensin converting enzyme 2 (ACE2) receptors, risk for patients taking these medications, and recommendations for these patients.')\n",
"TrecQuery(query_id='21', title='coronavirus mortality', description='what are the mortality rates overall and in specific populations', narrative='Seeking information on COVID-19 fatality rates in different countries and in different population groups based on gender, blood types, or other factors')\n",
"TrecQuery(query_id='22', title='coronavirus heart impacts', description='are cardiac complications likely in patients with COVID-19?', narrative='Seeking information on the types, frequency and mechanisms of cardiac complications caused by coronavirus.')\n",
"TrecQuery(query_id='23', title='coronavirus hypertension', description='what kinds of complications related to COVID-19 are associated with hypertension?', narrative='seeking specific outcomes that hypertensive (any type) patients are more/less likely to face if infected with the virus')\n",
"TrecQuery(query_id='24', title='coronavirus diabetes', description='what kinds of complications related to COVID-19 are associated with diabetes', narrative='seeking specific outcomes that diabetic (any type) patients are more/less likely to face if infected with the virus')\n",
"TrecQuery(query_id='25', title='coronavirus biomarkers', description='which biomarkers predict the severe clinical course of 2019-nCOV infection?', narrative='Looking for information on biomarkers that predict disease outcomes in people infected with coronavirus, specifically those that predict severe and fatal outcomes.')\n",
"TrecQuery(query_id='26', title='coronavirus early symptoms', description='what are the initial symptoms of Covid-19?', narrative='Studies of patients and the first clinical manifestations they develop upon active infection?')\n",
"TrecQuery(query_id='27', title='coronavirus asymptomatic', description='what is known about those infected with Covid-19 but are asymptomatic?', narrative='Studies of people who are known to be infected with Covid-19 but show no symptoms?')\n",
"TrecQuery(query_id='28', title='coronavirus hydroxychloroquine', description='what evidence is there for the value of hydroxychloroquine in treating Covid-19?', narrative='Basic science or clinical studies assessing the benefit and harms of treating Covid-19 with hydroxychloroquine.')\n",
"TrecQuery(query_id='29', title='coronavirus drug repurposing', description='which SARS-CoV-2 proteins-human proteins interactions indicate potential for drug targets. Are there approved drugs that can be repurposed based on this information?', narrative='Seeking information about protein-protein interactions for any of the SARS-CoV-2 structural proteins that represent a promising therapeutic target, and the drug molecules that may inhibit the virus and the host cell receptors at entry step.')\n",
"TrecQuery(query_id='30', title='coronavirus remdesivir', description='is remdesivir an effective treatment for COVID-19', narrative='seeking specific information on clinical outcomes in COVID-19 patients treated with remdesivir')\n",
"TrecQuery(query_id='31', title='difference between coronavirus and flu', description='How does the coronavirus differ from seasonal flu?', narrative='Includes studies ranging from those focusing on genomic differences to global public health impacts, but must draw direct comparisons between COVID-19 and seasonal influenza.')\n",
"TrecQuery(query_id='32', title='coronavirus subtypes', description='Does SARS-CoV-2 have any subtypes, and if so what are they?', narrative='Papers that discuss subtypes of the virus, from named subtypes to speculative subtypes based on genomic or geographic clustering.')\n",
"TrecQuery(query_id='33', title='coronavirus vaccine candidates', description='What vaccine candidates are being tested for Covid-19?', narrative='Seeking studies that discuss possible, but specific, COVID-19 vaccines. Includes articles from those describing the mechanisms of action of specific proposed vaccines to actual clinical trials, but excluding articles that do not name a specific vaccine candidate.')\n",
"TrecQuery(query_id='34', title='coronavirus recovery', description='What are the longer-term complications of those who recover from COVID-19?', narrative='Seeking information on the health outcomes for those that recover from the virus. Excludes studies only focusing on adverse effects related to a particular COVID-19 drug.')\n",
"TrecQuery(query_id='35', title='coronavirus public datasets', description='What new public datasets are available related to COVID-19?', narrative='Seeking articles that specifically release new data related to SARS-CoV-2 or COVID-19, including genomic data, patient data, public health data, etc. Articles that reference previously existing datasets are not relevant.')\n",
"TrecQuery(query_id='36', title='SARS-CoV-2 spike structure', description='What is the protein structure of the SARS-CoV-2 spike?', narrative='Looking for studies of the structure of the spike protein on the virus using any methods, such as cryo-EM or crystallography')\n",
"TrecQuery(query_id='37', title='SARS-CoV-2 phylogenetic analysis', description='What is the result of phylogenetic analysis of SARS-CoV-2 genome sequence?', narrative='Looking for a range of studies which provide the results of phylogenetic network analysis on the SARS-CoV-2 genome')\n",
"TrecQuery(query_id='38', title='COVID inflammatory response', description='What is the mechanism of inflammatory response and pathogenesis of COVID-19 cases?', narrative='Looking for a range of studies which describes the inflammatory response cells and pathogenesis during the Coronavirus Disease 2019 (COVID-19) outbreak, including the mechanism of anti-inflammatory drugs, corticosteroids, and vitamin supplements')\n",
"TrecQuery(query_id='39', title='COVID-19 cytokine storm', description='What is the mechanism of cytokine storm syndrome on the COVID-19?', narrative='Looking for studies that describes mechanism of development of cytokine storm syndrome among COVID-19 cases and the range of drugs used for the therapy of cytokine storm')\n",
"TrecQuery(query_id='40', title='coronavirus mutations', description='What are the observed mutations in the SARS-CoV-2 genome and how often do the mutations occur?', narrative='Looking for studies that describes the emergence of genomic diversity of the coronavirus due to recurrent mutations which explore the potential genomic site of the mutation, mechanisms and its potential or observed clinical implications in the pathogenicity of the virus')\n",
"TrecQuery(query_id='41', title='COVID-19 in African-Americans', description='What are the impacts of COVID-19 among African-Americans that differ from the rest of the U.S. population?', narrative='Looking for studies that analyze burden of illness and death among the African-American/black racial/ethnic group. This includes potential reasons for transmission, morbidity, and mortality. This may include discussion of other minority groups, but all studies should contain specific information on the health disparities faced by African-Americans in this pandemic.')\n",
"TrecQuery(query_id='42', title='Vitamin D and COVID-19', description='Does Vitamin D impact COVID-19 prevention and treatment?', narrative='This includes studies describing possible role of Vitamin D in prevention of COVID-19, suppression of cytokine storm, clinical outcomes, and associations between Vitamin D status and COVID-19 mortality.')\n",
"TrecQuery(query_id='43', title='violence during pandemic', description='How has the COVID-19 pandemic impacted violence in society, including violent crimes?', narrative='Looking for analyses and data on how the pandemic is impacting rates of violence, including domestic/family violence related to quarantine.')\n",
"TrecQuery(query_id='44', title='impact of masks on coronavirus transmission', description='How much impact do masks have on preventing the spread of the COVID-19?', narrative='Looking for studies of how masks slow SARS-CoV-2 transmission, including impact on R0. Studies can include both lab and population studies.')\n",
"TrecQuery(query_id='45', title='coronavirus mental health impact', description='How has the COVID-19 pandemic impacted mental health?', narrative='Includes increasing/decreasing rates of depression, anxiety, panic disorder, and other psychiatric and mental health conditions.')\n",
"TrecQuery(query_id='46', title='dexamethasone coronavirus', description='what evidence is there for dexamethasone as a treatment for COVID-19?', narrative='Looking for studies on the impact of dexamethasone treatment in COVID-19 patients, including health benefits as well as adverse effects. This also includes specific populations that are benefitted/harmed by dexamethasone.')\n",
"TrecQuery(query_id='47', title='COVID-19 outcomes in children', description='what are the health outcomes for children who contract COVID-19?', narrative='Looking for studies on health outcomes in children related to COVID-19. This includes studies attempting to explain the underlying biological mechanisms for why children differ from adults in response to infection.')\n",
"TrecQuery(query_id='48', title='school reopening coronavirus', description='what are the benefits and risks of re-opening schools in the midst of the COVID-19 pandemic?', narrative='With the possibility of schools re-opening while the COVID-19 pandemic is still ongoing, this topic is looking for evidence or projections on what the potential implications of this are in terms of COVID-19 cases, hospitalizations, or deaths, as well as other benefits or harms to re-opening schools. This includes both the impact on students, teachers, families, and the wider community.')\n",
"TrecQuery(query_id='49', title='post-infection COVID-19 immunity', description='do individuals who recover from COVID-19 show sufficient immune response, including antibody levels and T-cell mediated immunity, to prevent re-infection?', narrative='There is concern about re-infection for COVID-19, so this topic is looking for studies suggesting post-infection immunity, including post-infection antibody levels (over time) and evidence for individuals who have been infected more than once.')\n",
"TrecQuery(query_id='50', title='mRNA vaccine coronavirus', description='what is known about an mRNA vaccine for the SARS-CoV-2 virus?', narrative='Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-0TnBGErO7VS"
},
"source": [
"Iterables of namedtuples are handy structures because they are lightweight and do not load all the content into memory. But in case you need that, you can easily convert them into other data structures. Here's an example building a Pandas DataFrame of the queries:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "nFsImZY1PNGa",
"outputId": "6bf596b8-8412-48e7-be75-eaf5e703eb01"
},
"source": [
"import pandas as pd\n",
"pd.DataFrame(dataset.queries_iter())"
],
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" query_id | \n",
" title | \n",
" description | \n",
" narrative | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" coronavirus origin | \n",
" what is the origin of COVID-19 | \n",
" seeking range of information about the SARS-Co... | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" coronavirus response to weather changes | \n",
" how does the coronavirus respond to changes in... | \n",
" seeking range of information about the SARS-Co... | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" coronavirus immunity | \n",
" will SARS-CoV2 infected people develop immunit... | \n",
" seeking studies of immunity developed due to i... | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" how do people die from the coronavirus | \n",
" what causes death from Covid-19? | \n",
" Studies looking at mechanisms of death from Co... | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" animal models of COVID-19 | \n",
" what drugs have been active against SARS-CoV o... | \n",
" Papers that describe the results of testing d... | \n",
"
\n",
" \n",
" | 5 | \n",
" 6 | \n",
" coronavirus test rapid testing | \n",
" what types of rapid testing for Covid-19 have ... | \n",
" Looking for studies identifying ways to diagno... | \n",
"
\n",
" \n",
" | 6 | \n",
" 7 | \n",
" serological tests for coronavirus | \n",
" are there serological tests that detect antibo... | \n",
" Looking for assays that measure immune respons... | \n",
"
\n",
" \n",
" | 7 | \n",
" 8 | \n",
" coronavirus under reporting | \n",
" how has lack of testing availability led to un... | \n",
" Looking for studies answering questions of imp... | \n",
"
\n",
" \n",
" | 8 | \n",
" 9 | \n",
" coronavirus in Canada | \n",
" how has COVID-19 affected Canada | \n",
" seeking data related to infections (confirm, s... | \n",
"
\n",
" \n",
" | 9 | \n",
" 10 | \n",
" coronavirus social distancing impact | \n",
" has social distancing had an impact on slowing... | \n",
" seeking specific information on studies that h... | \n",
"
\n",
" \n",
" | 10 | \n",
" 11 | \n",
" coronavirus hospital rationing | \n",
" what are the guidelines for triaging patients ... | \n",
" Seeking information on any guidelines for prio... | \n",
"
\n",
" \n",
" | 11 | \n",
" 12 | \n",
" coronavirus quarantine | \n",
" what are best practices in hospitals and at ho... | \n",
" Seeking information on best practices for acti... | \n",
"
\n",
" \n",
" | 12 | \n",
" 13 | \n",
" how does coronavirus spread | \n",
" what are the transmission routes of coronavirus? | \n",
" Looking for information on all possible ways t... | \n",
"
\n",
" \n",
" | 13 | \n",
" 14 | \n",
" coronavirus super spreaders | \n",
" what evidence is there related to COVID-19 sup... | \n",
" seeking range of information related to the nu... | \n",
"
\n",
" \n",
" | 14 | \n",
" 15 | \n",
" coronavirus outside body | \n",
" how long can the coronavirus live outside the ... | \n",
" seeking range of information on the SARS-CoV-2... | \n",
"
\n",
" \n",
" | 15 | \n",
" 16 | \n",
" how long does coronavirus survive on surfaces | \n",
" how long does coronavirus remain stable on su... | \n",
" Studies of time SARS-CoV-2 remains stable afte... | \n",
"
\n",
" \n",
" | 16 | \n",
" 17 | \n",
" coronavirus clinical trials | \n",
" are there any clinical trials available for th... | \n",
" seeking specific COVID-19 clinical trials rang... | \n",
"
\n",
" \n",
" | 17 | \n",
" 18 | \n",
" masks prevent coronavirus | \n",
" what are the best masks for preventing infecti... | \n",
" What types of masks should or should not be us... | \n",
"
\n",
" \n",
" | 18 | \n",
" 19 | \n",
" what alcohol sanitizer kills coronavirus | \n",
" what type of hand sanitizer is needed to destr... | \n",
" Studies assessing chemicals and their concentr... | \n",
"
\n",
" \n",
" | 19 | \n",
" 20 | \n",
" coronavirus and ACE inhibitors | \n",
" are patients taking Angiotensin-converting enz... | \n",
" Looking for information on interactions betwee... | \n",
"
\n",
" \n",
" | 20 | \n",
" 21 | \n",
" coronavirus mortality | \n",
" what are the mortality rates overall and in sp... | \n",
" Seeking information on COVID-19 fatality rate... | \n",
"
\n",
" \n",
" | 21 | \n",
" 22 | \n",
" coronavirus heart impacts | \n",
" are cardiac complications likely in patients w... | \n",
" Seeking information on the types, frequency an... | \n",
"
\n",
" \n",
" | 22 | \n",
" 23 | \n",
" coronavirus hypertension | \n",
" what kinds of complications related to COVID-1... | \n",
" seeking specific outcomes that hypertensive (... | \n",
"
\n",
" \n",
" | 23 | \n",
" 24 | \n",
" coronavirus diabetes | \n",
" what kinds of complications related to COVID-1... | \n",
" seeking specific outcomes that diabetic (any t... | \n",
"
\n",
" \n",
" | 24 | \n",
" 25 | \n",
" coronavirus biomarkers | \n",
" which biomarkers predict the severe clinical c... | \n",
" Looking for information on biomarkers that pre... | \n",
"
\n",
" \n",
" | 25 | \n",
" 26 | \n",
" coronavirus early symptoms | \n",
" what are the initial symptoms of Covid-19? | \n",
" Studies of patients and the first clinical man... | \n",
"
\n",
" \n",
" | 26 | \n",
" 27 | \n",
" coronavirus asymptomatic | \n",
" what is known about those infected with Covid-... | \n",
" Studies of people who are known to be infected... | \n",
"
\n",
" \n",
" | 27 | \n",
" 28 | \n",
" coronavirus hydroxychloroquine | \n",
" what evidence is there for the value of hydrox... | \n",
" Basic science or clinical studies assessing th... | \n",
"
\n",
" \n",
" | 28 | \n",
" 29 | \n",
" coronavirus drug repurposing | \n",
" which SARS-CoV-2 proteins-human proteins inter... | \n",
" Seeking information about protein-protein inte... | \n",
"
\n",
" \n",
" | 29 | \n",
" 30 | \n",
" coronavirus remdesivir | \n",
" is remdesivir an effective treatment for COVID-19 | \n",
" seeking specific information on clinical outco... | \n",
"
\n",
" \n",
" | 30 | \n",
" 31 | \n",
" difference between coronavirus and flu | \n",
" How does the coronavirus differ from seasonal ... | \n",
" Includes studies ranging from those focusing o... | \n",
"
\n",
" \n",
" | 31 | \n",
" 32 | \n",
" coronavirus subtypes | \n",
" Does SARS-CoV-2 have any subtypes, and if so w... | \n",
" Papers that discuss subtypes of the virus, fro... | \n",
"
\n",
" \n",
" | 32 | \n",
" 33 | \n",
" coronavirus vaccine candidates | \n",
" What vaccine candidates are being tested for C... | \n",
" Seeking studies that discuss possible, but spe... | \n",
"
\n",
" \n",
" | 33 | \n",
" 34 | \n",
" coronavirus recovery | \n",
" What are the longer-term complications of thos... | \n",
" Seeking information on the health outcomes for... | \n",
"
\n",
" \n",
" | 34 | \n",
" 35 | \n",
" coronavirus public datasets | \n",
" What new public datasets are available related... | \n",
" Seeking articles that specifically release new... | \n",
"
\n",
" \n",
" | 35 | \n",
" 36 | \n",
" SARS-CoV-2 spike structure | \n",
" What is the protein structure of the SARS-CoV-... | \n",
" Looking for studies of the structure of the sp... | \n",
"
\n",
" \n",
" | 36 | \n",
" 37 | \n",
" SARS-CoV-2 phylogenetic analysis | \n",
" What is the result of phylogenetic analysis of... | \n",
" Looking for a range of studies which provide t... | \n",
"
\n",
" \n",
" | 37 | \n",
" 38 | \n",
" COVID inflammatory response | \n",
" What is the mechanism of inflammatory response... | \n",
" Looking for a range of studies which describes... | \n",
"
\n",
" \n",
" | 38 | \n",
" 39 | \n",
" COVID-19 cytokine storm | \n",
" What is the mechanism of cytokine storm syndro... | \n",
" Looking for studies that describes mechanism o... | \n",
"
\n",
" \n",
" | 39 | \n",
" 40 | \n",
" coronavirus mutations | \n",
" What are the observed mutations in the SARS-Co... | \n",
" Looking for studies that describes the emergen... | \n",
"
\n",
" \n",
" | 40 | \n",
" 41 | \n",
" COVID-19 in African-Americans | \n",
" What are the impacts of COVID-19 among African... | \n",
" Looking for studies that analyze burden of ill... | \n",
"
\n",
" \n",
" | 41 | \n",
" 42 | \n",
" Vitamin D and COVID-19 | \n",
" Does Vitamin D impact COVID-19 prevention and ... | \n",
" This includes studies describing possible role... | \n",
"
\n",
" \n",
" | 42 | \n",
" 43 | \n",
" violence during pandemic | \n",
" How has the COVID-19 pandemic impacted violenc... | \n",
" Looking for analyses and data on how the pande... | \n",
"
\n",
" \n",
" | 43 | \n",
" 44 | \n",
" impact of masks on coronavirus transmission | \n",
" How much impact do masks have on preventing th... | \n",
" Looking for studies of how masks slow SARS-CoV... | \n",
"
\n",
" \n",
" | 44 | \n",
" 45 | \n",
" coronavirus mental health impact | \n",
" How has the COVID-19 pandemic impacted mental ... | \n",
" Includes increasing/decreasing rates of depres... | \n",
"
\n",
" \n",
" | 45 | \n",
" 46 | \n",
" dexamethasone coronavirus | \n",
" what evidence is there for dexamethasone as a ... | \n",
" Looking for studies on the impact of dexametha... | \n",
"
\n",
" \n",
" | 46 | \n",
" 47 | \n",
" COVID-19 outcomes in children | \n",
" what are the health outcomes for children who ... | \n",
" Looking for studies on health outcomes in chil... | \n",
"
\n",
" \n",
" | 47 | \n",
" 48 | \n",
" school reopening coronavirus | \n",
" what are the benefits and risks of re-opening ... | \n",
" With the possibility of schools re-opening whi... | \n",
"
\n",
" \n",
" | 48 | \n",
" 49 | \n",
" post-infection COVID-19 immunity | \n",
" do individuals who recover from COVID-19 show ... | \n",
" There is concern about re-infection for COVID-... | \n",
"
\n",
" \n",
" | 49 | \n",
" 50 | \n",
" mRNA vaccine coronavirus | \n",
" what is known about an mRNA vaccine for the SA... | \n",
" Looking for studies specifically focusing on m... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" query_id ... narrative\n",
"0 1 ... seeking range of information about the SARS-Co...\n",
"1 2 ... seeking range of information about the SARS-Co...\n",
"2 3 ... seeking studies of immunity developed due to i...\n",
"3 4 ... Studies looking at mechanisms of death from Co...\n",
"4 5 ... Papers that describe the results of testing d...\n",
"5 6 ... Looking for studies identifying ways to diagno...\n",
"6 7 ... Looking for assays that measure immune respons...\n",
"7 8 ... Looking for studies answering questions of imp...\n",
"8 9 ... seeking data related to infections (confirm, s...\n",
"9 10 ... seeking specific information on studies that h...\n",
"10 11 ... Seeking information on any guidelines for prio...\n",
"11 12 ... Seeking information on best practices for acti...\n",
"12 13 ... Looking for information on all possible ways t...\n",
"13 14 ... seeking range of information related to the nu...\n",
"14 15 ... seeking range of information on the SARS-CoV-2...\n",
"15 16 ... Studies of time SARS-CoV-2 remains stable afte...\n",
"16 17 ... seeking specific COVID-19 clinical trials rang...\n",
"17 18 ... What types of masks should or should not be us...\n",
"18 19 ... Studies assessing chemicals and their concentr...\n",
"19 20 ... Looking for information on interactions betwee...\n",
"20 21 ... Seeking information on COVID-19 fatality rate...\n",
"21 22 ... Seeking information on the types, frequency an...\n",
"22 23 ... seeking specific outcomes that hypertensive (...\n",
"23 24 ... seeking specific outcomes that diabetic (any t...\n",
"24 25 ... Looking for information on biomarkers that pre...\n",
"25 26 ... Studies of patients and the first clinical man...\n",
"26 27 ... Studies of people who are known to be infected...\n",
"27 28 ... Basic science or clinical studies assessing th...\n",
"28 29 ... Seeking information about protein-protein inte...\n",
"29 30 ... seeking specific information on clinical outco...\n",
"30 31 ... Includes studies ranging from those focusing o...\n",
"31 32 ... Papers that discuss subtypes of the virus, fro...\n",
"32 33 ... Seeking studies that discuss possible, but spe...\n",
"33 34 ... Seeking information on the health outcomes for...\n",
"34 35 ... Seeking articles that specifically release new...\n",
"35 36 ... Looking for studies of the structure of the sp...\n",
"36 37 ... Looking for a range of studies which provide t...\n",
"37 38 ... Looking for a range of studies which describes...\n",
"38 39 ... Looking for studies that describes mechanism o...\n",
"39 40 ... Looking for studies that describes the emergen...\n",
"40 41 ... Looking for studies that analyze burden of ill...\n",
"41 42 ... This includes studies describing possible role...\n",
"42 43 ... Looking for analyses and data on how the pande...\n",
"43 44 ... Looking for studies of how masks slow SARS-CoV...\n",
"44 45 ... Includes increasing/decreasing rates of depres...\n",
"45 46 ... Looking for studies on the impact of dexametha...\n",
"46 47 ... Looking for studies on health outcomes in chil...\n",
"47 48 ... With the possibility of schools re-opening whi...\n",
"48 49 ... There is concern about re-infection for COVID-...\n",
"49 50 ... Looking for studies specifically focusing on m...\n",
"\n",
"[50 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 14
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "x8guincjNQA1"
},
"source": [
"Again, we can [check the documentation](https://ir-datasets.com/cord19.html#cord19/trec-covid) for information about what fields are available. Or we can use `queries_cls()`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lxSvF03NM2Dt",
"outputId": "d2ca9dd1-5914-4697-d8b4-8b904c85902a"
},
"source": [
"dataset.queries_cls()"
],
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ir_datasets.formats.trec.TrecQuery"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BRBNpFHZNK70",
"outputId": "5142178d-5844-4224-e5b1-c7547d7dec86"
},
"source": [
"dataset.queries_cls()._fields"
],
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('query_id', 'title', 'description', 'narrative')"
]
},
"metadata": {
"tags": []
},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PycaUfckNM80",
"outputId": "1a5f74a7-1131-4937-e3e5-1ae06e8015cd"
},
"source": [
"dataset.queries_cls().__annotations__"
],
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"OrderedDict([('query_id', str),\n",
" ('title', str),\n",
" ('description', str),\n",
" ('narrative', str)])"
]
},
"metadata": {
"tags": []
},
"execution_count": 17
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p5IA74C3Nlr0"
},
"source": [
"## Query Relevance Assessments\n",
"\n",
"`qrels` (query relevance assessments/judgments) map a `query_id` and `doc_id` to a relevance score.\n",
"\n",
"You probably guessed it; we can fetch qrels for a dataset with `qrels_iter()`. There's a lot of them, so we'll just show them in a DataFrame to start with:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "VYldukNANN8I",
"outputId": "998a629b-89f2-4d23-9de9-4289c92e3287"
},
"source": [
"pd.DataFrame(dataset.qrels_iter())"
],
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>query_id</th>\n",
"      <th>doc_id</th>\n",
"      <th>relevance</th>\n",
"      <th>iteration</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1</td>\n",
"      <td>005b2j4b</td>\n",
"      <td>2</td>\n",
"      <td>4.5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>1</td>\n",
"      <td>00fmeepz</td>\n",
"      <td>1</td>\n",
"      <td>4</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>1</td>\n",
"      <td>010vptx3</td>\n",
"      <td>2</td>\n",
"      <td>0.5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>1</td>\n",
"      <td>0194oljo</td>\n",
"      <td>1</td>\n",
"      <td>2.5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>1</td>\n",
"      <td>021q9884</td>\n",
"      <td>1</td>\n",
"      <td>4</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>...</th>\n",
"      <td>...</td>\n",
"      <td>...</td>\n",
"      <td>...</td>\n",
"      <td>...</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>69313</th>\n",
"      <td>50</td>\n",
"      <td>zvop8bxh</td>\n",
"      <td>2</td>\n",
"      <td>5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>69314</th>\n",
"      <td>50</td>\n",
"      <td>zwf26o63</td>\n",
"      <td>1</td>\n",
"      <td>5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>69315</th>\n",
"      <td>50</td>\n",
"      <td>zwsvlnwe</td>\n",
"      <td>0</td>\n",
"      <td>5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>69316</th>\n",
"      <td>50</td>\n",
"      <td>zxr01yln</td>\n",
"      <td>1</td>\n",
"      <td>5</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>69317</th>\n",
"      <td>50</td>\n",
"      <td>zz8wvos9</td>\n",
"      <td>1</td>\n",
"      <td>5</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"<p>69318 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" query_id doc_id relevance iteration\n",
"0 1 005b2j4b 2 4.5\n",
"1 1 00fmeepz 1 4\n",
"2 1 010vptx3 2 0.5\n",
"3 1 0194oljo 1 2.5\n",
"4 1 021q9884 1 4\n",
"... ... ... ... ...\n",
"69313 50 zvop8bxh 2 5\n",
"69314 50 zwf26o63 1 5\n",
"69315 50 zwsvlnwe 0 5\n",
"69316 50 zxr01yln 1 5\n",
"69317 50 zz8wvos9 1 5\n",
"\n",
"[69318 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "piCD2tZkOGaQ"
},
"source": [
"What does relevance=0, 1, and 2 mean? You can find out with `qrels_defs`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Fy_xf2vQN-Jx",
"outputId": "4b6264db-7483-4630-a558-a36c8d7420c8"
},
"source": [
"dataset.qrels_defs()"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{0: 'Not Relevant: everything else.',\n",
" 1: 'Partially Relevant: the article answers part of the question but would need to be combined with other information to get a complete answer.',\n",
" 2: 'Relevant: the article is fully responsive to the information need as expressed by the topic, i.e. answers the Question in the topic. The article need not contain all information on the topic, but must, on its own, provide an answer to the question.'}"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "B8noJu7xOQ23"
},
"source": [
"Of course we can also get information about the `TrecQrel` type using `qrels_cls()`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0q8oZBIQOEnX",
"outputId": "64ff29a9-e273-4ead-b94d-f5b5aa8eda48"
},
"source": [
"dataset.qrels_cls()"
],
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ir_datasets.formats.trec.TrecQrel"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4tJ_aXCBOYBG",
"outputId": "479d1dc0-5d02-429c-a243-7b24feda1e4e"
},
"source": [
"dataset.qrels_cls()._fields"
],
"execution_count": 21,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('query_id', 'doc_id', 'relevance', 'iteration')"
]
},
"metadata": {
"tags": []
},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QAImnTAtOZoz",
"outputId": "dbf62766-db08-424e-c3fa-a22634500907"
},
"source": [
"dataset.qrels_cls().__annotations__"
],
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"OrderedDict([('query_id', str),\n",
" ('doc_id', str),\n",
" ('relevance', int),\n",
" ('iteration', str)])"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GQb4jO-HOf6q"
},
"source": [
"## Wrapping Up\n",
"\n",
"So that's the core functionality. You can find more information in the [documentation](https://ir-datasets.com/)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "EldJW1rhObGy"
},
"source": [
""
],
"execution_count": 22,
"outputs": []
}
]
}
================================================
FILE: examples/ir_datasets_cli.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "ir_datasets-cli.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "SP6ophbQq5I0"
},
"source": [
"# ir_datasets - Tutorial - CLI\n",
"\n",
"**NOTE: This tutorial is for the command-line interface. See the other tutorial for Python.**"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cl8KYrJTq-g0"
},
"source": [
"## Getting Started\n",
"\n",
"We'll start out by installing the package. The package is available on pypi,\n",
"so you can install it with your favorite package manager."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vbGhAIREqw1c",
"outputId": "1d7fcdb3-93a2-4668-fd7d-787d1471f648"
},
"source": [
"!pip install ir_datasets"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: ir_datasets in /usr/local/lib/python3.7/dist-packages (0.3.1)\n",
"Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.41.1)\n",
"Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.2.3)\n",
"Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.2.5)\n",
"Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.6.3)\n",
"Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (2.23.0)\n",
"Requirement already satisfied: lz4>=3.1.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (3.1.3)\n",
"Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (3.1.4)\n",
"Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (5.4.1)\n",
"Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.1.3)\n",
"Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.6.2)\n",
"Requirement already satisfied: trec-car-tools>=2.5.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (2.5.3)\n",
"Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (1.19.5)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (1.24.3)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (2020.12.5)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (2.10)\n",
"Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (1.0.0)\n",
"Requirement already satisfied: typing>=3.6.2 in /usr/local/lib/python3.7/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (3.7.4.3)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7v_X6XqlrTan"
},
"source": [
"## export\n",
"\n",
"The `ir_datasets export` command outputs data to stdout as TSV,\n",
"JSON, and other formats.\n",
"\n",
"The command format is:\n",
"\n",
"```\n",
"ir_datasets export <dataset> <entity-type>\n",
"```\n",
"\n",
"with optional other arguments following entity-type.\n",
"\n",
"`<dataset>` is the dataset's identifier, found [in the catalog](https://ir-datasets.com/). `<entity-type>` is one of: `docs`, `queries`, `qrels`, `scoreddocs`."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "myI4M6OCsJQL"
},
"source": [
"Let's start by getting the top 10 documents from the `cord19/trec-covid` collection. The first time you run the command, it will automatically download the dataset.\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wt-QU7q1q-Mn",
"outputId": "469d6b3f-4a0f-44db-a42b-ee8fef1232fe"
},
"source": [
"!ir_datasets export cord19/trec-covid docs | head -n 10"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"[INFO] No fields supplied. Using all fields: ('doc_id', 'title', 'doi', 'date', 'abstract')\n",
"ug7v899j\tClinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia\t10.1186/1471-2334-1-6\t2001-07-04\tOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. \nCONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.\n",
"02tnwd4m\tNitric oxide: a pro-inflammatory mediator in lung disease?\t10.1186/rr14\t2000-08-15\tInflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung.\n",
"ejv2xln0\tSurfactant protein-D and pulmonary host defense\t10.1186/rr19\t2000-08-25\tSurfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.\n",
"2b73a28n\tRole of endothelin-1 in lung disease\t10.1186/rr44\t2001-02-22\tEndothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.\n",
"9785vg6d\tGene expression in epithelial cells in response to pneumovirus infection\t10.1186/rr61\t2001-05-11\tRespiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.\n",
"zjufx4fo\tSequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis\t10.1093/emboj/20.24.7220\t2001-12-17\tNidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to different sequences (‘bodies’) derived from the 3′ end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader–body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.\n",
"5yhe786e\tDebate: Transfusing to normal haemoglobin levels will not improve outcome\t10.1186/cc987\t2001-03-08\tRecent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.\n",
"8zchiykl\tThe 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001\t10.1186/cc1013\t2001-05-02\tThe 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\n",
"8qnrcgnk\tHeme oxygenase-1 and carbon monoxide in pulmonary medicine\t10.1186/1465-9921-4-7\t2003-08-07\tHeme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.\n",
"jg13scgo\tTechnical Description of RODS: A Real-time Public Health Surveillance System\t10.1197/jamia.m1345\t2003-09-01\tThis report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states—Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTQaik0isguS"
},
"source": [
"You can export in other formats too. Here's an example exporting in JSON-Lines."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XaYh4lwLrTDZ",
"outputId": "b50827d6-02e6-409c-bdcf-72dfbfdf1529"
},
"source": [
"!ir_datasets export cord19/trec-covid docs --format jsonl | head -n 10"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"{\"doc_id\": \"ug7v899j\", \"title\": \"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia\", \"doi\": \"10.1186/1471-2334-1-6\", \"date\": \"2001-07-04\", \"abstract\": \"OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. \\nCONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.\"}\n",
"{\"doc_id\": \"02tnwd4m\", \"title\": \"Nitric oxide: a pro-inflammatory mediator in lung disease?\", \"doi\": \"10.1186/rr14\", \"date\": \"2000-08-15\", \"abstract\": \"Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO\\u2022) and increased indices of NO\\u2022 -dependent oxidative stress. Although NO\\u2022 is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO\\u2022 to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO\\u2022 -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO\\u2022 generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO\\u2022 to inflammatory diseases of the lung.\"}\n",
"{\"doc_id\": \"ejv2xln0\", \"title\": \"Surfactant protein-D and pulmonary host defense\", \"doi\": \"10.1186/rr19\", \"date\": \"2000-08-25\", \"abstract\": \"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.\"}\n",
"{\"doc_id\": \"2b73a28n\", \"title\": \"Role of endothelin-1 in lung disease\", \"doi\": \"10.1186/rr44\", \"date\": \"2001-02-22\", \"abstract\": \"Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.\"}\n",
"{\"doc_id\": \"9785vg6d\", \"title\": \"Gene expression in epithelial cells in response to pneumovirus infection\", \"doi\": \"10.1186/rr61\", \"date\": \"2001-05-11\", \"abstract\": \"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.\"}\n",
"{\"doc_id\": \"zjufx4fo\", \"title\": \"Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis\", \"doi\": \"10.1093/emboj/20.24.7220\", \"date\": \"2001-12-17\", \"abstract\": \"Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5\\u2032 end of the genome fused to different sequences (\\u2018bodies\\u2019) derived from the 3\\u2032 end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader\\u2013body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.\"}\n",
"{\"doc_id\": \"5yhe786e\", \"title\": \"Debate: Transfusing to normal haemoglobin levels will not improve outcome\", \"doi\": \"10.1186/cc987\", \"date\": \"2001-03-08\", \"abstract\": \"Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.\"}\n",
"{\"doc_id\": \"8zchiykl\", \"title\": \"The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001\", \"doi\": \"10.1186/cc1013\", \"date\": \"2001-05-02\", \"abstract\": \"The 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\"}\n",
"{\"doc_id\": \"8qnrcgnk\", \"title\": \"Heme oxygenase-1 and carbon monoxide in pulmonary medicine\", \"doi\": \"10.1186/1465-9921-4-7\", \"date\": \"2003-08-07\", \"abstract\": \"Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IX\\u03b1, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.\"}\n",
"{\"doc_id\": \"jg13scgo\", \"title\": \"Technical Description of RODS: A Real-time Public Health Surveillance System\", \"doi\": \"10.1197/jamia.m1345\", \"date\": \"2003-09-01\", \"abstract\": \"This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states\\u2014Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.\"}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hhVG2gp6sqdZ"
},
"source": [
"If you do not want all the fields, you can specify which ones with `--fields`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kFI8UHbzq6Cu",
"outputId": "a06ac0f5-2248-4c09-e5e6-f8b49f5cf29f"
},
"source": [
"!ir_datasets export cord19/trec-covid docs --format jsonl --fields doc_id date | head -n 10"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"{\"doc_id\": \"ug7v899j\", \"date\": \"2001-07-04\"}\n",
"{\"doc_id\": \"02tnwd4m\", \"date\": \"2000-08-15\"}\n",
"{\"doc_id\": \"ejv2xln0\", \"date\": \"2000-08-25\"}\n",
"{\"doc_id\": \"2b73a28n\", \"date\": \"2001-02-22\"}\n",
"{\"doc_id\": \"9785vg6d\", \"date\": \"2001-05-11\"}\n",
"{\"doc_id\": \"zjufx4fo\", \"date\": \"2001-12-17\"}\n",
"{\"doc_id\": \"5yhe786e\", \"date\": \"2001-03-08\"}\n",
"{\"doc_id\": \"8zchiykl\", \"date\": \"2001-05-02\"}\n",
"{\"doc_id\": \"8qnrcgnk\", \"date\": \"2003-08-07\"}\n",
"{\"doc_id\": \"jg13scgo\", \"date\": \"2003-09-01\"}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WUjwx7i1s5HD"
},
"source": [
"The export command works the same way for `queries`, `qrels`, and `scoreddocs` (where available). By default, `qrels` and `scoreddocs` output in the TREC format. But you can choose to export as tsv or jsonl as well."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JoeB2aresxAV",
"outputId": "872f8a51-ceb2-4c29-84ba-eb503f58ce1d"
},
"source": [
"!ir_datasets export cord19/trec-covid queries --fields query_id title | head -n 10"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"1\tcoronavirus origin\n",
"2\tcoronavirus response to weather changes\n",
"3\tcoronavirus immunity\n",
"4\thow do people die from the coronavirus\n",
"5\tanimal models of COVID-19\n",
"6\tcoronavirus test rapid testing\n",
"7\tserological tests for coronavirus\n",
"8\tcoronavirus under reporting\n",
"9\tcoronavirus in Canada\n",
"10\tcoronavirus social distancing impact\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Me_hppfJtRxG",
"outputId": "b2fdb388-7eea-4e47-f5ea-859e07fe1b74"
},
"source": [
"!ir_datasets export cord19/trec-covid qrels | head -n 10"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"1 4.5 005b2j4b 2\n",
"1 4 00fmeepz 1\n",
"1 0.5 010vptx3 2\n",
"1 2.5 0194oljo 1\n",
"1 4 021q9884 1\n",
"1 1 02f0opkr 1\n",
"1 3.5 047xpt2c 0\n",
"1 1 04ftw7k9 0\n",
"1 1 05qglt1f 0\n",
"1 3 05vx82oo 0\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2zocHeB1tgKu"
},
"source": [
"If you're savvy at the command line, piping can let you capture some dataset statistics pretty easily. Here's an example giving the label proportions using `awk`:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vqCnPJOVtaWl",
"outputId": "6b041b9e-9b85-47bc-91c1-1595c9d5968b"
},
"source": [
"!ir_datasets export cord19/trec-covid qrels | awk '{a[$4]+=1; s+=1}END{for (x in a){print x, a[x], a[x]/s}}'"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"-1 2 2.88525e-05\n",
"0 42652 0.615309\n",
"1 11055 0.159482\n",
"2 15609 0.22518\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IgE2qowjuZV8"
},
"source": [
"## lookup\n",
"\n",
"You can look up documents by their ID with the `ir_datasets lookup` command. The command format is:\n",
"\n",
"```\n",
"ir_datasets lookup ...\n",
"```\n",
"\n",
"These lookups are generally O(1) and memory-efficient."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dBrzCdwbtug4",
"outputId": "cd3522f7-3acf-450b-ae68-ac72ce9f0877"
},
"source": [
"!ir_datasets lookup cord19/trec-covid 005b2j4b 00fmeepz 010vptx3"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"[INFO] No fields supplied. Using all fields: ('doc_id', 'title', 'doi', 'date', 'abstract')\n",
"005b2j4b\tMonophyletic Relationship between Severe Acute Respiratory Syndrome Coronavirus and Group 2 Coronaviruses\t10.1086/382892\t2004-05-01\tAlthough primary genomic analysis has revealed that severe acute respiratory syndrome coronavirus (SARS CoV) is a new type of coronavirus, the different protein trees published in previous reports have provided no conclusive evidence indicating the phylogenetic position of SARS CoV. To clarify the phylogenetic relationship between SARS CoV and other coronaviruses, we compiled a large data set composed of 7 concatenated protein sequences and performed comprehensive analyses, using the maximum-likelihood, Bayesian-inference, and maximum-parsimony methods. All resulting phylogenetic trees displayed an identical topology and supported the hypothesis that the relationship between SARS CoV and group 2 CoVs is monophyletic. Relationships among all major groups were well resolved and were supported by all statistical analyses.\n",
"00fmeepz\tComprehensive overview of COVID-19 based on current evidence\t\t2020\tIn December 2019, twenty-seven pneumonia patients with unknown causes originated in South China seafood market in Wuhan. The virus infection spread rapidly and swept through China in less than a month. Subsequently, the virus was proven a novel coronavirus and named SARS-CoV-2. The outbreak of novel coronavirus has been determined as a Public Health Emergency of International Concern (PHEIC) by WHO on January 31, 2020. Similar to other coronaviruses like the Middle East Respiratory Syndrome (MERS) CoV and Severe Acute Respiratory Syndrome (SARS) CoV, the novel coronavirus was reported to spread via respiratory droplets and close contact from human to human, which means the virus is highly infectious and dangerous. Unfortunately, till now the virus has spread to over 200 countries/territories/areas around the world and the Coronavirus Disease 2019 (COVID-19) outbreak is continuing to grow. Currently, information sharing and transparency are essential for risk assessment and epidemic control in all endemic areas. In this article, we compared SARS-CoV-2 with SARS-CoV and influenza virus, discussed current researching progress of COVID-19, including clinical characteristics, pathological changes, treatment measures, and so on.\n",
"010vptx3\tThe SARS, MERS and novel coronavirus (COVID-19) epidemics, the newest and biggest global health threats: what lessons have we learned?\t10.1093/ije/dyaa033\t2020-02-22\tOBJECTIVES: To provide an overview of the three major deadly coronaviruses and identify areas for improvement of future preparedness plans, as well as provide a critical assessment of the risk factors and actionable items for stopping their spread, utilizing lessons learned from the first two deadly coronavirus outbreaks, as well as initial reports from the current novel coronavirus (COVID-19) epidemic in Wuhan, China. METHODS: Utilizing the Centers for Disease Control and Prevention (CDC, USA) website, and a comprehensive review of PubMed literature, we obtained information regarding clinical signs and symptoms, treatment and diagnosis, transmission methods, protection methods and risk factors for Middle East Respiratory Syndrome (MERS), Severe Acute Respiratory Syndrome (SARS) and COVID-19. Comparisons between the viruses were made. RESULTS: Inadequate risk assessment regarding the urgency of the situation, and limited reporting on the virus within China has, in part, led to the rapid spread of COVID-19 throughout mainland China and into proximal and distant countries. Compared with SARS and MERS, COVID-19 has spread more rapidly, due in part to increased globalization and the focus of the epidemic. Wuhan, China is a large hub connecting the North, South, East and West of China via railways and a major international airport. The availability of connecting flights, the timing of the outbreak during the Chinese (Lunar) New Year, and the massive rail transit hub located in Wuhan has enabled the virus to perforate throughout China, and eventually, globally. CONCLUSIONS: We conclude that we did not learn from the two prior epidemics of coronavirus and were ill-prepared to deal with the challenges the COVID-19 epidemic has posed. 
Future research should attempt to address the uses and implications of internet of things (IoT) technologies for mapping the spread of infection.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "v6leewIGvYKf"
},
"source": [
"You can also specify the fields to return."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GkVpdPsXvFIq",
"outputId": "106acd40-9d69-495e-e9f8-7191b1c81d78"
},
"source": [
"!ir_datasets lookup cord19/trec-covid 005b2j4b 00fmeepz 010vptx3 --fields doc_id title"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"005b2j4b\tMonophyletic Relationship between Severe Acute Respiratory Syndrome Coronavirus and Group 2 Coronaviruses\n",
"00fmeepz\tComprehensive overview of COVID-19 based on current evidence\n",
"010vptx3\tThe SARS, MERS and novel coronavirus (COVID-19) epidemics, the newest and biggest global health threats: what lessons have we learned?\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DOm67Sbsvjon"
},
"source": [
"And of course, you can do all sorts of fancy piping here as well. Let's find all highly-relevant documents for Query 50:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yVMHEFRXvLfh",
"outputId": "e4ad1626-5057-40a3-c517-9cc12ad0052a"
},
"source": [
"!ir_datasets lookup cord19/trec-covid $(ir_datasets export cord19/trec-covid qrels | awk '$1==50&&$4==2{printf \"%s \", $3}') --fields doc_id title"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"1v0f2dtx\tSARS-CoV-2 mRNA Vaccine Development Enabled by Prototype Pathogen Preparedness\n",
"3a6l4ktt\tmRNA Vaccines: Possible Tools to Combat SARS-CoV-2\n",
"6emy92i5\tmRNA Vaccines: Possible Tools to Combat SARS-CoV-2\n",
"7q6xi2xx\tAn Evidence Based Perspective on mRNA-SARS-CoV-2 Vaccine Development\n",
"akbq0ogs\tPhase 1/2 Study to Describe the Safety and Immunogenicity of a COVID-19 RNA Vaccine Candidate (BNT162b1) in Adults 18 to 55 Years of Age: Interim Report\n",
"dcg6ui9d\tAn Evidence Based Perspective on mRNA-SARS-CoV-2 Vaccine Development\n",
"g1j8wk11\tImmune-mediated approaches against COVID-19\n",
"gidlrnu8\tDeconvoluting Lipid Nanoparticle Structure for Messenger RNA Delivery\n",
"ino9srb6\tAn overview on COVID-19: reality and expectation\n",
"kf7yz3oz\tVaccines and Therapies in Development for SARS-CoV-2 Infections.\n",
"oiu80002\tSelf-amplifying RNA SARS-CoV-2 lipid nanoparticle vaccine candidate induces high neutralizing antibody titers in mice\n",
"ozf05l65\tPreparing for Pandemics: RNA Vaccines at the Forefront\n",
"q77da2y3\tDesigning a novel mRNA vaccine against SARS-CoV-2: An immunoinformatics approach\n",
"u35rryzi\tVaccines and Therapies in Development for SARS-CoV-2 Infections\n",
"v0m90h3n\tLinearDesign: Efficient Algorithms for Optimized mRNA Sequence Design\n",
"vm3oirur\tPreclinical data from SARS-CoV-2 mRNA vaccine\n",
"wptc95qb\tA recombinant Lactobacillus plantarum strain expressing the spike protein of SARS-CoV-2\n",
"wtmjt3hf\tDevelopment of a COVID-19 vaccine based on the receptor binding domain displayed on virus-like particles\n",
"wx1v0h0q\tDesigning a multi-epitope peptide-based vaccine against SARS-CoV-2\n",
"wxagjqbt\tCOVID-19 Vaccine Candidates: Prediction and Validation of 174 SARS-CoV-2 Epitopes\n",
"wzdgizoo\tQuantitative measurement of activity of JAK-STAT signaling pathways in blood samples and immune cells to predict innate and adaptive cellular immune response to viral infection and accelerate vaccine development\n",
"wzv8n34v\tSingle-dose replicating RNA vaccine induces neutralizing antibodies against SARS-CoV-2 in nonhuman primates\n",
"x5zvwtj7\tVaccines against Coronaviruses: The State of the Art\n",
"xbze5s3c\tAn Evidence Based Perspective on mRNA-SARS-CoV-2 Vaccine Development\n",
"xeq0dq6u\tSARS-CoV-2 will constantly sweep its tracks: a vaccine containing CpG motifs in ‘lasso’ for the multi-faced virus\n",
"xhm97wy2\tRNA to the rescue: RNA is one of the most promising targets for drug development given its wide variety of uses\n",
"xieqswct\tDevelopment of CRISPR as a prophylactic strategy to combat novel coronavirus and influenza\n",
"xjg2e8be\tIn silico approach for designing of a multi-epitope based vaccine against novel Coronavirus (SARS-COV-2)\n",
"xqgqq55q\tEmerging vaccine delivery systems for COVID-19: Functionalised silica nanoparticles offer a potentially safe and effective alternative delivery system for DNA/RNA vaccines and may be useful in the hunt for a COVID-19 vaccine\n",
"xt8tld2i\tThe vaccine journey for COVID-19: a comprehensive systematic review of current clinical trials in humans\n",
"xy7w8hbz\tCharacterization of the receptor-binding domain (RBD) of 2019 novel coronavirus: implication for development of RBD protein as a viral attachment inhibitor and vaccine\n",
"y87tq9wu\tCurrent Status of Multiple Drug Molecules, and Vaccines: An Update in SARS-CoV-2 Therapeutics\n",
"y883anmp\tSARS-CoV-2 vaccines: 'Warp Speed' needs mind melds not warped minds.\n",
"ygwdldae\tImmunization with the receptor–binding domain of SARS-CoV-2 elicits antibodies cross-neutralizing SARS-CoV-2 and SARS-CoV without antibody-dependent enhancement\n",
"ykzsoafe\tOptimization of antigen dose for a receptor-binding domain-based subunit vaccine against MERS coronavirus\n",
"ymvrserl\tImmunoinformatic identification of B cell and T cell epitopes in the SARS-CoV-2 proteome\n",
"yn79jn83\tAnalysis of a SARS-CoV-2-Infected Individual Reveals Development of Potent Neutralizing Antibodies with Limited Somatic Mutation\n",
"yneir8ab\tCOVID-19 vaccine development pipeline gears up\n",
"ypkiptvh\tUpdate on therapeutic approaches and emerging therapies for SARS-CoV-2 virus\n",
"ys8cs84y\tExpected immune recognition of COVID-19 virus by memory from earlier infections with common coronaviruses in a large part of the world population\n",
"ywia2ok7\tThe crystal structure of nsp10-nsp16 heterodimer from SARS-CoV-2 in complex with S-adenosylmethionine\n",
"yx3j6373\tCOVID-19: immunopathology and its implications for therapy\n",
"yxiacesg\tStructural and functional conservation of the programmed -1 ribosomal frameshift signal of SARS coronavirus 2 (SARS-CoV-2).\n",
"z24dqh0y\tAnalysis of a SARS-CoV-2-Infected Individual Reveals Development of Potent Neutralizing Antibodies with Limited Somatic Mutation\n",
"z5q82rmp\tGlobal efforts on vaccines for COVID-19: Since, sooner or later, we all will catch the coronavirus\n",
"z5uhrta5\tIdentification of a Noncanonical Signal for Transcription of a Novel Subgenomic mRNA of Mouse Hepatitis Virus: Implication for the Mechanism of Coronavirus RNA Transcription\n",
"zalk5ul7\tAre genetic vaccines the right weapon against Covid-19?\n",
"zi1l5883\tVaccines against Coronaviruses: The State of the Art.\n",
"zteyfpv9\tCurrent pharmacological treatments for SARS-COV-2: A narrative review\n",
"zv4nbz9p\tEmerging Technologies for Use in the Study, Diagnosis, and Treatment of Patients with COVID-19\n",
"zvop8bxh\tAntiviral RNAi therapy: emerging approaches for hitting a moving target\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h9dhCjb0y4Rj"
},
"source": [
"## doc_fifos\n",
"\n",
"For indexing using some tools (e.g., Anserini), it is helpful to have multiple concurrent document streams. You can do this with the `ir_datasets doc_fifos` command. Note that this command only works on posix systems (e.g., unix, macos).\n",
"\n",
"This command runs until all the documents are exhausted, so you need to run it in the background or elsewhere. So it's not conducive to show in a Colab setting."
]
},
{
"cell_type": "code",
"metadata": {
"id": "YGKeucnFzFOW"
},
"source": [
"!ir_datasets doc_fifos cord19/trec-covid"
],
"execution_count": null,
"outputs": []
}
]
}
================================================
FILE: ir_datasets/__init__.py
================================================
from enum import Enum
class EntityType(Enum):
    """The kinds of entities a dataset may provide; values double as the attribute prefix
    used to locate handlers (e.g., ``docs`` -> ``docs_handler``) in parent_id()."""
    docs = "docs"
    queries = "queries"
    qrels = "qrels"
    scoreddocs = "scoreddocs"
    docpairs = "docpairs"
    qlogs = "qlogs"
from . import lazy_libs
from . import log
from . import util
from . import formats
# Global registry mapping dataset IDs to Dataset instances. It must be created
# before the imports below, since those modules register datasets into it.
registry = util.Registry()
from . import datasets
from . import indices
from . import wrappers
from . import commands
# Convenience alias so callers can write `ir_datasets.Dataset`.
Dataset = datasets.base.Dataset
def load(name):
    """Return the registered Dataset for the given dataset ID (raises KeyError if unknown)."""
    return registry[name]
def parent_id(dataset_id: str, entity_type: EntityType) -> str:
    """
    Maps a dataset_id to a more general ID that shares the same entity handler (e.g., docs_handler).
    For example, for docs, "msmarco-document/trec-dl-2019/judged" -> "msmarco-document" or
    "wikir/en1k/test" -> "wikir/en1k". This is useful when creating shared document resources
    among multiple subsets, such as an index.
    Note: At this time, this function operates by convention; it walks up the dataset_id
    hierarchy and keeps the most general ancestor that has the same handler instance. It may
    be updated in the future to also use explicit links added when datasets are registered.
    """
    entity_type = EntityType(entity_type)  # validate & allow strings
    handler_attr = f'{entity_type.value}_handler'
    handler = getattr(load(dataset_id), handler_attr)()
    parts = dataset_id.split("/")
    result = dataset_id
    # Walk ancestors from the immediate parent up to the root segment.
    for depth in range(len(parts) - 1, 0, -1):
        candidate_id = "/".join(parts[:depth])
        try:
            candidate = load(candidate_id)
            if candidate.has(entity_type.value) and getattr(candidate, handler_attr)() == handler:
                result = candidate_id
        except KeyError:
            pass  # no dataset registered at this level
    return result
def docs_parent_id(dataset_id: str) -> str:
    """Shorthand for parent_id(dataset_id, EntityType.docs)."""
    return parent_id(dataset_id, EntityType.docs)
corpus_id = docs_parent_id # legacy alias; prefer docs_parent_id
def queries_parent_id(dataset_id: str) -> str:
    """Shorthand for parent_id(dataset_id, EntityType.queries)."""
    return parent_id(dataset_id, EntityType.queries)
def qrels_parent_id(dataset_id: str) -> str:
    """Shorthand for parent_id(dataset_id, EntityType.qrels)."""
    return parent_id(dataset_id, EntityType.qrels)
def scoreddocs_parent_id(dataset_id: str) -> str:
    """Shorthand for parent_id(dataset_id, EntityType.scoreddocs)."""
    return parent_id(dataset_id, EntityType.scoreddocs)
def docpairs_parent_id(dataset_id: str) -> str:
    """Shorthand for parent_id(dataset_id, EntityType.docpairs)."""
    return parent_id(dataset_id, EntityType.docpairs)
def qlogs_parent_id(dataset_id: str) -> str:
    """Shorthand for parent_id(dataset_id, EntityType.qlogs)."""
    return parent_id(dataset_id, EntityType.qlogs)
def create_dataset(docs_tsv=None, queries_tsv=None, qrels_trec=None):
    """Builds an ad-hoc Dataset from local files: a TSV docs file, a TSV
    queries file, and/or a TREC-format qrels file (each optional)."""
    local = util.LocalDownload
    parts = []
    if docs_tsv is not None:
        parts.append(formats.TsvDocs(local(docs_tsv)))
    if queries_tsv is not None:
        parts.append(formats.TsvQueries(local(queries_tsv)))
    if qrels_trec is not None:
        parts.append(formats.TrecQrels(local(qrels_trec), {}))
    return datasets.base.Dataset(*parts)
def main(args):
    """Dispatch a CLI invocation to the matching sub-command; prints usage and exits(1) otherwise."""
    import sys
    command = args[0] if args else None
    if command not in commands.COMMANDS:
        cmds = ','.join(commands.COMMANDS.keys())
        sys.stderr.write(f'Usage: ir_datasets {{{cmds}}} ...\n')
        sys.exit(1)
    commands.COMMANDS[command](args[1:])
def main_cli():
    """Console-script entry point: forwards the process's command-line arguments to main()."""
    import sys
    main(sys.argv[1:])
__version__ = "0.5.11"
================================================
FILE: ir_datasets/__main__.py
================================================
import ir_datasets
# Allows invocation via `python -m ir_datasets ...`
if __name__ == '__main__':
    ir_datasets.main_cli()
================================================
FILE: ir_datasets/commands/__init__.py
================================================
from . import doc_fifos
from . import export
from . import lookup
from . import list as list_cmd
from . import build_clueweb_warc_indexes
from . import build_download_cache
from . import build_c4_checkpoints
from . import clean
from . import generate_metadata
# Registry of CLI sub-commands: maps the name given on the command line
# (`ir_datasets <name> ...`) to that module's main(args) entry point.
COMMANDS = {
    'doc_fifos': doc_fifos.main,
    'export': export.main,
    'lookup': lookup.main,
    'list': list_cmd.main,
    'build_clueweb_warc_indexes': build_clueweb_warc_indexes.main,
    'build_c4_checkpoints': build_c4_checkpoints.main,
    'build_download_cache': build_download_cache.main,
    'clean': clean.main,
    'generate_metadata': generate_metadata.main,
}
================================================
FILE: ir_datasets/commands/build_c4_checkpoints.py
================================================
import os
import sys
import multiprocessing
from pathlib import Path
import gzip
import hashlib
import json
import pickle
import argparse
import ir_datasets
_logger = ir_datasets.log.easy()
def process(args):
    """Builds the gzip checkpoint file for one C4 shard.

    args: a (source_file, output_file) tuple (tuple form eases use with map()).
    Every 1500 lines it records a (pos, state, offset) tuple capturing the gzip
    decompressor state, so readers can later seek into the stream without
    decompressing from the start. The list is pickled into an LZ4 frame at
    output_file. Returns source_file (used for progress reporting).
    """
    lz4 = ir_datasets.lazy_libs.lz4_frame()
    source_file, output_file = args
    checkpoint_data = []
    with ir_datasets.lazy_libs.zlib_state().GzipStateFile(str(source_file), keep_last_state=True) as f, _logger.pbar_raw(desc='building checkpoint') as pbar:
        idx = 0
        while not f.eof():
            if idx % 1500 == 0:  # NOTE: must match the "checkpoint_freq" written to the sources file in main()
                # capture the most recent decompressor flush point and how far
                # the decompressed output has advanced past it
                state, pos = f.last_state, f.last_state_pos
                offset = f.output_pos - f.last_state_output_pos
                checkpoint_data.append((pos, state, offset))
            f.readline()
            idx += 1
            pbar.update(1)
    # mode='a' appends a new LZ4 frame rather than truncating an existing file
    with lz4.frame.LZ4FrameFile(output_file, mode='a', block_linked=True, compression_level=lz4.frame.COMPRESSIONLEVEL_MAX, auto_flush=True) as fout:
        pickle.dump(checkpoint_data, fout)
    return source_file
def main(args):
    """CLI entry point: builds gzip checkpoint files for C4 document shards.

    For every *.json.gz under source_dir, writes a ``<name>.chk.pkl.lz4``
    checkpoint under output_dir (skipping ones that already exist). With
    --sources_file, also writes a gzipped JSON manifest recording each shard's
    document count, MD5, and size. --skip_last omits the last (sorted) shard,
    e.g. one that is still being written.
    """
    parser = argparse.ArgumentParser(prog='ir_datasets build_c4_checkpoints', description='Builds gzip checkpoint files for C4 documents.')
    parser.add_argument('source_dir')
    parser.add_argument('output_dir')
    parser.add_argument('--skip_last', action='store_true')
    parser.add_argument('--sources_file')
    args = parser.parse_args(args)
    source_dir = Path(args.source_dir)
    output_dir = Path(args.output_dir)
    all_source_files = sorted(source_dir.rglob('*.json.gz'))
    if args.sources_file:
        sources = []
        for file in _logger.pbar(all_source_files, desc='building sources file', unit='file'):
            try:
                # One JSON document per line of the decompressed shard.
                with gzip.open(file, 'rb') as f:
                    count = sum(1 for _ in f)
                # MD5 of the *compressed* file. Stream in 1 MiB chunks so huge
                # shards aren't loaded fully into memory, and use a context
                # manager so the handle is closed (previously leaked).
                h = hashlib.new('md5')
                with open(file, 'rb') as fraw:
                    for chunk in iter(lambda: fraw.read(1 << 20), b''):
                        h.update(chunk)
                md5 = h.hexdigest().lower()
                size = os.path.getsize(file)
                sources.append({
                    "name": f"en.noclean.{file.name}",
                    "url": f"https://huggingface.co/datasets/allenai/c4/resolve/main/en.noclean/{file.name}",
                    "expected_md5": md5,
                    "size_hint": size,
                    "checkpoint_freq": 1500,  # must match the interval used in process()
                    "doc_count": count,
                })
            except Exception as ex:
                # best-effort: report the failing shard and continue with the rest
                print(file, ex)
        with gzip.open(args.sources_file + '.gz', 'wt') as f:
            json.dump(sources, f)
    all_source_files = [f.relative_to(source_dir) for f in all_source_files]
    if args.skip_last:
        all_source_files = all_source_files[:-1]
    process_args = [(source_dir/f, output_dir/f'{f}.chk.pkl.lz4') for f in all_source_files]
    process_args = [a for a in process_args if not a[1].exists()]  # skip already-built checkpoints
    with _logger.pbar_raw(total=len(process_args), unit='file') as pbar:
        for src in map(process, process_args):
            pbar.update(1)
            pbar.set_postfix(file=str(src)[-20:])
if __name__ == '__main__':
main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/build_clueweb_warc_indexes.py
================================================
import sys
import multiprocessing
from pathlib import Path
import argparse
import ir_datasets
_logger = ir_datasets.log.easy()
def process(args):
    """Builds the WARC index for a single (source, output, cw09-flag) tuple; returns the source path."""
    src, dst, is_cw09 = args
    idx = ir_datasets.indices.ClueWebWarcIndex(str(src), str(dst), warc_cw09=is_cw09)
    dst.parent.mkdir(parents=True, exist_ok=True)
    if not idx.built():
        idx.build()
    return src
def main(args):
    """CLI entry point: builds ClueWeb WARC index files.

    For each *.warc.gz under source_dir, writes a ``<name>.chk.lz4`` index under
    output_dir (skipping ones that already exist), optionally in parallel with
    --processes. --cw09 selects the ClueWeb09 WARC variant.
    """
    parser = argparse.ArgumentParser(prog='ir_datasets build_clueweb_warc_indexes', description='Builds indexes for ClueWeb WARC files.')
    parser.add_argument('source_dir')
    parser.add_argument('output_dir')
    parser.add_argument('--processes', default=1, type=int)
    parser.add_argument('--cw09', action='store_true')
    args = parser.parse_args(args)
    source_dir = Path(args.source_dir)
    output_dir = Path(args.output_dir)
    all_source_files = [f.relative_to(source_dir) for f in source_dir.rglob('*.warc.gz')]
    all_source_files = sorted(all_source_files)
    process_args = [(source_dir/f, output_dir/f'{f}.chk.lz4', args.cw09) for f in all_source_files]
    process_args = [a for a in process_args if not a[1].exists()]  # skip already-built indexes
    with _logger.pbar_raw(total=len(process_args), unit='file') as pbar:
        if args.processes == 1:
            # serial path avoids multiprocessing overhead for the common case
            for src in map(process, process_args):
                pbar.update(1)
                pbar.set_postfix(file=str(src))
        else:
            with multiprocessing.Pool(args.processes) as pool:
                for src in pool.imap_unordered(process, process_args):
                    pbar.update(1)
                    pbar.set_postfix(file=src.relative_to(source_dir))
if __name__ == '__main__':
main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/build_download_cache.py
================================================
import sys
import time
import io
import os
import argparse
import json
from contextlib import contextmanager
import ir_datasets
_logger = ir_datasets.log.easy()
@contextmanager
def tmp_environ(**overrides):
    """Context manager that temporarily sets environment variables.

    On exit, each variable is restored to its previous value, or removed
    entirely if it was not set before entering the context.
    """
    saved = {name: os.environ.get(name) for name in overrides}
    for name, value in overrides.items():
        os.environ[name] = value
    try:
        yield
    finally:
        for name, previous in saved.items():
            if previous is None:
                del os.environ[name]
            else:
                os.environ[name] = previous
def _build_cache(data, dir, prefix=''):
    """Recursively walks a downloads.json-style mapping and downloads each leaf entry.

    Leaf entries containing both 'url' and 'expected_md5' are fetched into
    ``{dir}/{expected_md5}`` (skipped if already present). Entries with
    'instructions' require manual acquisition and are skipped. Any other mapping
    is treated as an interior node and recursed into, extending ``prefix``
    (used only for log messages).
    """
    if 'url' in data and 'expected_md5' in data:
        cache_path = f'{dir}/{data["expected_md5"]}'
        if os.path.exists(cache_path):
            _logger.info(f'skipping {prefix}; already exists')
            return
        try:
            # finialized_file presumably writes to a temp file and renames on
            # success, so an interrupted download leaves nothing at cache_path
            # -- TODO confirm against ir_datasets.util
            with ir_datasets.util.finialized_file(cache_path, 'wb') as fout, _logger.duration(prefix):
                download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'])], expected_md5=data['expected_md5'], stream=True)
                with download.stream() as stream:
                    # copy the stream in buffer-sized chunks
                    inp = stream.read(io.DEFAULT_BUFFER_SIZE)
                    while len(inp) > 0:
                        fout.write(inp)
                        inp = stream.read(io.DEFAULT_BUFFER_SIZE)
        except KeyboardInterrupt:
            # a single ctrl+c skips only this download; a second within 0.5s exits
            _logger.info('download skipped by user (ctrl+c again in the next 0.5 seconds to exit)')
            try:
                time.sleep(0.5)
            except KeyboardInterrupt:
                sys.exit(1)
        except Exception as ex:
            # best-effort: log the failure and continue with remaining downloads
            _logger.warn(f'error: {ex}')
    elif 'instructions' in data:
        pass  # manual-download item: nothing to cache automatically
    else:
        # interior node: recurse into every child entry
        for key in data.keys():
            _build_cache(data[key], dir, prefix=f'{prefix}/{key}' if prefix else key)
def main(args):
    """CLI entry point: downloads every known downloadable resource into a local cache directory."""
    parser = argparse.ArgumentParser(prog='ir_datasets build_download_cache', description='Builds a cache of downloadable content')
    parser.add_argument('--dir', default=f'{ir_datasets.util.home_path()}/downloads')
    parser.add_argument('--retries', default='10')
    opts = parser.parse_args(args)
    with open('ir_datasets/etc/downloads.json') as fin:
        downloads = json.load(fin)
    # raise the retry count for the duration of the cache build
    with tmp_environ(IR_DATASETS_DL_TRIES=opts.retries):
        _build_cache(downloads, opts.dir)
if __name__ == '__main__':
main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/clean.py
================================================
import sys
import os
import argparse
import multiprocessing
from collections import deque
import ir_datasets
from ir_datasets.util import DownloadConfig
RED = '\u001b[31m'
RES = '\u001b[0m'
_logger = ir_datasets.log.easy()
def walk_path(start_path='.', skips=()):
    """Recursively totals the sizes of regular files under ``start_path``.

    Args:
        start_path: directory to walk.
        skips: paths to exclude -- a directory prefix excludes everything under
            it; an exact file path excludes just that file. (Default changed
            from a mutable ``[]`` to ``()``; callers may still pass lists.)

    Returns:
        (total_size, files): cumulative size in bytes and the list of included
        file paths. Symlinks are never followed or counted.
    """
    # adapted from a common recursive directory-size recipe
    total_size = 0
    files = []
    for dirpath, dirnames, filenames in os.walk(start_path):
        # Skip any directory under a skip prefix. The `s and` term preserves the
        # original semantics, where empty skip entries are ignored.
        if any(s and dirpath.startswith(s) for s in skips):
            continue
        for fname in filenames:
            fpath = os.path.join(dirpath, fname)
            if fpath in skips:
                continue
            if not os.path.islink(fpath):
                total_size += os.path.getsize(fpath)
                files.append(fpath)
    return total_size, files
def clean(dataset, yes=False, list=False, human=True):
    """Removes re-downloadable/re-creatable files for one top-level dataset.

    Args:
        dataset: top-level dataset ID (its directory under the ir_datasets home path).
        yes: skip the interactive confirmation prompt.
        list: only print size / file count for the dataset; delete nothing.
            (Parameter name shadows the builtin, but is kept for interface compatibility.)
        human: print human-readable sizes rather than raw byte counts.
    """
    base_path = os.path.join(ir_datasets.util.home_path()/dataset)
    dlc = DownloadConfig.context(dataset, base_path)
    skips = []
    for dl_item in dlc.contents().values():
        # non-downloadble item: cannot be restored automatically, so never delete it
        if 'instructions' in dl_item and 'cache_path' in dl_item:
            skips.append(os.path.join(base_path, dl_item['cache_path']))
    size, files = walk_path(base_path, skips)
    files_fmt = f'{len(files)} files'
    if human:
        size_fmt = ir_datasets.util.format_file_size(size)
        if size > 1_000_000_000: # sizes over 1GB: list in red
            size_fmt = f'{RED}{size_fmt}{RES}'
    else:
        size_fmt = str(size)
    if list:
        if size > 0:
            print(f'{size_fmt}\t{files_fmt}\t{dataset}')
        return
    if not yes:
        inp = None
        while inp not in ('y', 'yes'):
            inp = input(f'clean up {size_fmt} from {dataset} ({files_fmt})?\n[y(es) / n(o) / l(ist files)] ').lower()
            if inp in ('l', 'list', 'list files'):
                for file in files:
                    f_size = os.path.getsize(file)
                    if human:
                        fsize_fmt = ir_datasets.util.format_file_size(f_size)
                        if f_size > 1_000_000_000: # sizes over 1GB: list in red
                            fsize_fmt = f'{RED}{fsize_fmt}{RES}'
                    else:
                        # bug fix: previously printed str(size) (the dataset total)
                        # instead of this file's own size
                        fsize_fmt = str(f_size)
                    print(f'{fsize_fmt}\t{file}')
            if inp in ('n', 'no'):
                return
    # remove identified files
    for file in files:
        os.remove(file)
    # remove empty directories (bottom-up so nested empty dirs collapse)
    for dirpath, dirnames, filenames in os.walk(base_path, topdown=False):
        if not dirnames and not filenames:
            os.rmdir(dirpath)
def main(args):
    """CLI entry point for ``ir_datasets clean``."""
    # fix: typo in user-facing help text ("rec-reated" -> "re-created")
    parser = argparse.ArgumentParser(prog='ir_datasets clean', description='Cleans up space by removing files that can automatically be re-created or re-downloaded.')
    parser.add_argument('datasets', nargs='*', help='dataset IDs to clean up')
    parser.add_argument('--yes', '-y', action='store_true', help='automatically say yes to confirmation messages')
    parser.add_argument('--list', '-l', action='store_true', help='lists datasets available for cleanup and their sizes; does not do any cleanup')
    parser.add_argument('-H', action='store_false', help='output raw sizes, rather than human-readable versions')
    args = parser.parse_args(args)
    try:
        if args.datasets:
            # only top-level dataset IDs map to directories on disk
            top_level_datasets = {d for d in ir_datasets.registry._registered if '/' not in d}
            for dataset in args.datasets:
                if dataset not in top_level_datasets:
                    print(f'Skipping unknown dataset {dataset}')
                else:
                    clean(dataset, args.yes, list=args.list, human=args.H)
        elif args.list:
            for dataset in ir_datasets.registry._registered:
                if '/' not in dataset:
                    clean(dataset, list=True, human=args.H)
        else:
            sys.stderr.write('ERROR: Please provide either --list, dataset IDs to clean, or --help for more details\n')
    except KeyboardInterrupt:
        pass  # let the user abort mid-prompt without a traceback
if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/doc_fifos.py
================================================
import sys
import os
import select
import tempfile
import contextlib
import json
import argparse
import multiprocessing
from collections import deque
import ir_datasets
_logger = ir_datasets.log.easy()
def main(args):
    """CLI entry point: streams a dataset's documents as JSON to several named pipes.

    Useful for feeding parallel indexers (e.g., Anserini's JsonCollection).
    """
    parser = argparse.ArgumentParser(prog='ir_datasets doc_fifos', description='Starts a process that exports documents in parallel to several named pipes as json. This is useful as inputs to indexers like Anserini.')
    parser.add_argument('dataset')
    parser.add_argument('--count', type=int, default=max(multiprocessing.cpu_count() - 1, 1))
    parser.add_argument('--fields', nargs='+')
    parser.add_argument('--dir')
    args = parser.parse_args(args)
    # bug fix: the dataset was previously also loaded once *before* this try
    # block, so an unknown dataset raised an unhandled KeyError before the
    # friendly message could be printed
    try:
        dataset = ir_datasets.load(args.dataset)
    except KeyError:
        sys.stderr.write(f"Dataset {args.dataset} not found.\n")
        sys.exit(1)
    if not dataset.has_docs():
        sys.stderr.write(f"Dataset {args.dataset} does not have docs.\n")
        sys.exit(1)
    docs_cls = dataset.docs_cls()
    field_idxs = []
    if args.fields:
        for field in args.fields:
            if field not in docs_cls._fields:
                # fix: typo in error message ("ind" -> "in")
                sys.stderr.write(f"Field {field} not found in {args.dataset}. Available fields: {docs_cls._fields}\n")
                sys.exit(1)
            field_idxs.append(docs_cls._fields.index(field))
    else:
        if len(docs_cls._fields) == 2:
            # there's only one field, silently use it
            field_idxs.append(1)
        else:
            # more than 1 field, let the user know everything is used.
            sys.stderr.write(f"Exporting all fields as document content: {docs_cls._fields[1:]}. Use --fields to specify fields.\n")
            field_idxs = list(range(1, len(docs_cls._fields)))
    with contextlib.ExitStack() as stack:
        if args.dir is not None:
            d = args.dir
        else:
            d = stack.enter_context(tempfile.TemporaryDirectory())
        fifos = []
        for i in range(args.count):
            fifo = os.path.join(d, f'{i}.json')
            os.mkfifo(fifo)
            fifos.append(fifo)
        docs_iter = dataset.docs_iter()
        docs_iter = _logger.pbar(docs_iter, total=dataset.docs_count(), unit='doc')
        print(f'Ready at {d}')
        print(f'To index with Anserini, run:\nIndexCollection -collection JsonCollection -input {d} -threads {args.count} -index ')
        fifos = [stack.enter_context(open(f, 'wt')) for f in fifos]
        ready = None
        for doc in docs_iter:
            if not ready: # first iteration, or all previously-ready pipes consumed
                _, ready, _ = select.select([], fifos, [])
                ready = deque(ready)
            fifo = ready.popleft()
            doc = {'id': doc.doc_id, 'contents': '\n'.join(str(doc[i]) for i in field_idxs)}
            json.dump(doc, fifo)
            fifo.write('\n')
if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/export.py
================================================
import sys
import json
import argparse
import ir_datasets
_logger = ir_datasets.log.easy()
def main_docs(dataset, args):
    """Exports the dataset's documents using the exporter selected by --format."""
    assert hasattr(dataset, 'docs_handler'), f"{args.dataset} does not provide docs"
    writer = DEFAULT_EXPORTERS[args.format](dataset.docs_cls(), args.out, args.fields)
    for record in dataset.docs_iter():
        writer.next(record)
    writer.flush()
def main_queries(dataset, args):
    """Exports the dataset's queries using the exporter selected by --format."""
    assert hasattr(dataset, 'queries_handler'), f"{args.dataset} does not provide queries"
    writer = DEFAULT_EXPORTERS[args.format](dataset.queries_cls(), args.out, args.fields)
    for record in dataset.queries_iter():
        writer.next(record)
    writer.flush()
def main_qrels(dataset, args):
    """Exports the dataset's qrels using the exporter selected by --format."""
    assert hasattr(dataset, 'qrels_handler'), f"{args.dataset} does not provide qrels"
    writer = QRELS_EXPORTERS[args.format](dataset.qrels_cls(), args.out, args.fields)
    for record in dataset.qrels_iter():
        writer.next(record)
    writer.flush()
def main_scoreddocs(dataset, args):
    """Exports the dataset's scoreddocs (run) using the exporter selected by --format."""
    assert hasattr(dataset, 'scoreddocs_handler'), f"{args.dataset} does not provide scoreddocs"
    writer = SCOREDDOCS_EXPORTERS[args.format](dataset.scoreddocs_cls(), args.out, args.fields)
    if hasattr(writer, 'runtag'):
        # only the TREC run exporter carries a runtag
        writer.runtag = args.runtag
    for record in dataset.scoreddocs_iter():
        writer.next(record)
    writer.flush()
def main_docpairs(dataset, args):
    """Exports the dataset's docpairs using the exporter selected by --format."""
    assert hasattr(dataset, 'docpairs_handler'), f"{args.dataset} does not provide docpairs"
    writer = DEFAULT_EXPORTERS[args.format](dataset.docpairs_cls(), args.out, args.fields)
    for pair in dataset.docpairs_iter():
        writer.next(pair)
    writer.flush()
class TsvExporter:
    """Writes records as tab-separated values, one record per line.

    Fields whose annotated type is not str/int/float are dropped, with one
    exception: a single Tuple[X, ...] field can be flattened unambiguously
    into a variable number of columns.
    """
    def __init__(self, data_cls, out, fields=None):
        self.data_cls = data_cls
        self.out = out
        if fields is None:
            fields = data_cls._fields
            if len(fields) > 2:
                # This message is only really needed if there's more than 2 fields
                _logger.info(f'No fields supplied. Using all fields: {fields}')
        unsupported = [f for f in fields if data_cls.__annotations__[f] not in (str, int, float)]
        # special case: if there's only one Tuple[X, ...], we can export unambiguously with variable number of columns
        if len(unsupported) == 1 and is_tuple_elip(data_cls.__annotations__[unsupported[0]]):
            _logger.info(f'Exporting variable number of columns for {unsupported[0]}')
            unsupported = []
        if unsupported:
            fields = [f for f in fields if f not in unsupported]
            field_conflicts = ', '.join(repr((f, data_cls.__annotations__[f])) for f in unsupported)
            _logger.info(f'Skipping the following fields due to unsupported data types: {field_conflicts}')
        self.idxs = []
        for f in fields:
            assert f in data_cls._fields
            self.idxs.append(data_cls._fields.index(f))
    def _clean(self, value):
        # TSV cannot represent tabs/newlines inside a cell; replace with spaces
        return str(value).replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    def next(self, record):
        cells = []
        for idx in self.idxs:
            value = record[idx]
            if isinstance(value, (list, tuple)):
                for item in value:
                    if hasattr(item, '_fields'):
                        # nested namedtuple: flatten its values
                        cells.extend(self._clean(v) for v in item)
                    else:
                        cells.append(self._clean(item))
            elif hasattr(value, '_fields'):
                cells.extend(self._clean(v) for v in value)
            else:
                cells.append(self._clean(value))
        self.out.write('\t'.join(cells) + '\n')
    def flush(self):
        pass
class JsonlExporter:
def __init__(self, data_cls, out, fields=None):
self.data_cls = data_cls
self.out = out
fields = fields or data_cls._fields
if fields is None:
fields = data_cls._fields
if len(fields) > 2:
# This message is only really needed if there's more than 2 fields
_logger.info(f'No fields supplied. Using all fields: {fields}')
field_conflicts = [f for f in fields if data_cls.__annotations__[f] not in (str, int, float) and not is_tuple_elip(data_cls.__annotations__[f])]
if len(field_conflicts) > 0:
fields = [f for f in fields if f not in field_conflicts]
field_conflicts = ', '.join([repr((f, data_cls.__annotations__[f])) for f in field_conflicts])
_logger.info(f'Skipping the following fields due to unsupported data types: {field_conflicts}')
self.fields = fields
self.idxs = []
for field in self.fields:
assert field in data_cls._fields
self.idxs.append(data_cls._fields.index(field))
def next(self, record):
json.dump({f: self.encode(record[i]) for f, i in zip(self.fields, self.idxs)}, self.out)
self.out.write('\n')
def encode(self, value):
if isinstance(value, (list, tuple)):
return [self.encode(v) for v in value]
if hasattr(value, '_fields'):
return {k: self.encode(v) for k, v in value._asdict()}
return value
def flush(self):
pass
def is_tuple_elip(annotation):
    """Tests whether ``annotation`` is Tuple[X, ...] where X is str/int/float,
    or a NamedTuple whose fields are all str/int/float — i.e., something that
    flattens cleanly into a variable number of scalar columns."""
    if not (hasattr(annotation, '_name') and annotation._name == 'Tuple'):
        return False
    if len(annotation.__args__) != 2 or annotation.__args__[1] is not Ellipsis:
        return False
    inner = annotation.__args__[0]
    if inner in (str, int, float):
        return True
    return hasattr(inner, '_fields') and all(t in (str, int, float) for t in inner.__annotations__.values())
class TrecQrelsExporter:
    """Writes qrels in the standard TREC format: qid iteration did relevance[...].

    When the record class has no iteration field, "0" is written in its place.
    """
    def __init__(self, data_cls, out, fields=None):
        self.data_cls = data_cls
        self.out = out
        assert 'query_id' in data_cls._fields, f"unsupported dataset cls {data_cls} (missing query_id)"
        assert 'doc_id' in data_cls._fields, f"unsupported dataset cls {data_cls} (missing doc_id)"
        self.has_iteration = 'iteration' in data_cls._fields
        if fields is None:
            # default to everything except the fixed-position columns, in declaration order
            leftover = set(data_cls._fields) - {'query_id', 'doc_id', 'iteration'}
            fields = sorted(leftover, key=data_cls._fields.index)
            if fields != ['relevance']:
                _logger.info(f'exporting fields {fields}')
        self.rel_field_idxs = []
        for field in fields:
            assert field in data_cls._fields, f"missing field {repr(field)}; choose --fields from {data_cls._fields}"
            self.rel_field_idxs.append(data_cls._fields.index(field))
        if len(self.rel_field_idxs) > 1:
            _logger.info(f'exporting multiple relevance fields; may not work with some evaluation scripts. Specify fields with --fields')
    def next(self, record):
        iteration = record.iteration if self.has_iteration else "0"
        rel_fields = ' '.join(str(record[i]) for i in self.rel_field_idxs)
        self.out.write(f'{record.query_id} {iteration} {record.doc_id} {rel_fields}\n')
    def flush(self):
        pass
class TrecRunExporter:
    """Writes scoreddocs in TREC run format: qid Q0 did rank score runtag.

    Records are buffered per query and emitted (sorted by descending score,
    ties broken by doc_id) when the query_id changes, so input is expected
    to be grouped by query_id.
    """
    def __init__(self, data_cls, out, fields=None):
        self.data_cls = data_cls
        self.out = out
        assert fields is None, "fields not supported for TREC Run exporter"
        self.query_id = None
        self.query_scores = []
        self.runtag = 'run'  # may be overridden by the caller (--runtag)
    def next(self, record):
        if record.query_id != self.query_id:
            self.flush()
            # bug fix: this was a local assignment (query_id = ...), so
            # self.query_id never updated, the buffer flushed on every record,
            # and within-query ranks were never computed
            self.query_id = record.query_id
        self.query_scores.append(record)
    def flush(self):
        for i, scoreddoc in enumerate(sorted(self.query_scores, key=lambda x: (-x.score, x.doc_id))):
            self.out.write(f'{scoreddoc.query_id} Q0 {scoreddoc.doc_id} {i} {scoreddoc.score} {self.runtag}\n')
        self.query_scores = []
# Exporter registries, keyed by the --format CLI choice.
DEFAULT_EXPORTERS = {
    'tsv': TsvExporter,
    'jsonl': JsonlExporter,
}
# qrels and scoreddocs additionally support the standard TREC text formats
QRELS_EXPORTERS = {**DEFAULT_EXPORTERS, 'trec': TrecQrelsExporter}
SCOREDDOCS_EXPORTERS = {**DEFAULT_EXPORTERS, 'trec': TrecRunExporter}
def main(args):
    """CLI entry point for ``ir_datasets export``."""
    parser = argparse.ArgumentParser(prog='ir_datasets export', description='Exports documents, queries, qrels, and scoreddocs in various formats.')
    parser.add_argument('dataset')
    parser.set_defaults(out=sys.stdout)
    subparsers = parser.add_subparsers(dest='data')
    subparsers.required = True
    subparser = subparsers.add_parser('docs')
    subparser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv')
    subparser.add_argument('--fields', nargs='+')
    subparser.set_defaults(fn=main_docs)
    subparser = subparsers.add_parser('queries')
    subparser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv')
    subparser.add_argument('--fields', nargs='+')
    subparser.set_defaults(fn=main_queries)
    subparser = subparsers.add_parser('qrels')
    subparser.add_argument('--format', choices=QRELS_EXPORTERS.keys(), default='trec')
    subparser.add_argument('--fields', nargs='+')
    subparser.set_defaults(fn=main_qrels)
    subparser = subparsers.add_parser('scoreddocs')
    subparser.add_argument('--format', choices=SCOREDDOCS_EXPORTERS.keys(), default='trec')
    subparser.add_argument('--fields', nargs='+')
    subparser.add_argument('--runtag', default='run')
    subparser.set_defaults(fn=main_scoreddocs)
    subparser = subparsers.add_parser('docpairs')
    subparser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv')
    subparser.add_argument('--fields', nargs='+')
    subparser.set_defaults(fn=main_docpairs)
    args = parser.parse_args(args)
    # bug fix: the dataset was previously also loaded once *before* this try
    # block, so an unknown dataset raised an unhandled KeyError before the
    # friendly message could be printed
    try:
        dataset = ir_datasets.load(args.dataset)
    except KeyError:
        sys.stderr.write(f"Dataset {args.dataset} not found.\n")
        sys.exit(1)
    try:
        args.fn(dataset, args)
    except BrokenPipeError:
        sys.stderr.close()  # e.g., piped into `head`
    except KeyboardInterrupt:
        sys.stderr.close()
    except AssertionError as e:
        if str(e):
            # assertions with messages are user-facing errors (e.g., missing data type)
            sys.stderr.write(str(e) + '\n')
        else:
            raise
if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/generate_metadata.py
================================================
import time
import sys
import os
import json
import argparse
from pathlib import Path
from fnmatch import fnmatch
import ir_datasets
from ir_datasets.util import DownloadConfig
_logger = ir_datasets.log.easy()
def dataset2metadata(args):
    """Computes metadata for a single dataset.

    Args:
        args: (dsid, data) pair, where data is the existing metadata dict for
            this dataset; entity types already present are not re-computed.

    Returns:
        (dsid, metadata) on success; (dsid, None) if the dataset is unknown or
        any entity's metadata computation fails.
    """
    dsid, data = args
    try:
        dataset = ir_datasets.load(dsid)
    except KeyError:
        return dsid, None
    e = None  # entity type currently being processed (tracked for error reporting)
    try:
        for e in ir_datasets.EntityType:
            if dataset.has(e):
                if e.value not in data:
                    parent_id = getattr(ir_datasets, f'{e.value}_parent_id')(dsid)
                    if parent_id != dsid:
                        # metadata lives with the parent dataset; just reference it
                        data[e.value] = {'_ref': parent_id}
                    else:
                        with _logger.duration(f'{dsid} {e.value}'):
                            data[e.value] = getattr(dataset, f'{e.value}_calc_metadata')()
                        _logger.info(f'{dsid} {e.value}: {data[e.value]}')
    except Exception as ex:
        # bug fix: `e` was unbound if the failure happened before the loop body
        # ran, turning this error report itself into a NameError
        entity_desc = e.value if e is not None else '<unknown>'
        _logger.info(f'{dsid} {entity_desc} [error]: {ex}')
        return dsid, None
    return dsid, data
def write_metadata_file(data, file):
    """Writes metadata as JSON with one dataset per line (keeps diffs readable)."""
    keys = sorted(data.keys())
    with file.open('wt') as fout:
        fout.write('{\n')
        for pos, key in enumerate(keys):
            if pos:
                fout.write(',\n')
            fout.write(f' "{key}": {json.dumps(data[key])}')
        fout.write('\n}\n')
def main(args):
    """CLI entry point for ``ir_datasets generate_metadata``.

    With --datasets, computes metadata in parallel and writes the file once at
    the end; without it, walks the whole registry sequentially, saving after
    each dataset so progress survives interruption.
    """
    parser = argparse.ArgumentParser(prog='ir_datasets generate_metadata', description='Generates metadata for the specified datasets')
    parser.add_argument('--file', help='output file', type=Path, default=Path('ir_datasets/etc/metadata.json'))
    parser.add_argument('--datasets', nargs='+', help='dataset IDs for which to compute metadata. If omitted, generates for all datasets present in the registry (skipping patterns)')
    args = parser.parse_args(args)
    # start from the existing metadata file (if any) so work is incremental
    if args.file.is_file():
        with args.file.open('rb') as f:
            data = json.load(f)
    else:
        data = {}
    if args.datasets:
        # explicit dataset list: fan out across 10 worker processes
        def _ds_iter():
            for dsid in args.datasets:
                yield dsid, data.get(dsid, {})
        import multiprocessing
        with multiprocessing.Pool(10) as pool:
            for dsid, dataset_metadata in _logger.pbar(pool.imap_unordered(dataset2metadata, _ds_iter()), desc='datasets', total=len(args.datasets)):
                if dataset_metadata is not None:
                    data[dsid] = dataset_metadata
        write_metadata_file(data, args.file)
    else:
        # all registered datasets: sequential, saving after each one
        for dsid in ir_datasets.registry._registered:
            dataset = ir_datasets.load(dsid)  # NOTE(review): result unused; dataset2metadata loads it again — confirm this is intentional
            brk = False
            try:
                _, dataset_metadata = dataset2metadata((dsid, data.get(dsid, {})))
                if dataset_metadata is not None:
                    data[dsid] = dataset_metadata
            except KeyboardInterrupt:
                # first ctrl+c skips this dataset; a second within 0.5s stops the whole loop
                _logger.info(f'KeyboardInterrupt; skipping. ctrl+c within 0.5sec to stop compute_metadata.')
                try:
                    time.sleep(0.5)
                except KeyboardInterrupt:
                    brk = True
                    break
            write_metadata_file(data, args.file)
            if brk:
                break  # NOTE(review): appears unreachable given the direct `break` above — confirm intended control flow
if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/list.py
================================================
import sys
import argparse
import ir_datasets
from ir_datasets.commands.export import DEFAULT_EXPORTERS
_logger = ir_datasets.log.easy()
def main(args):
    """CLI entry point: writes every registered dataset ID to out, one per line."""
    parser = argparse.ArgumentParser(prog='ir_datasets list', description='Lists available datasets.')
    parser.set_defaults(out=sys.stdout)
    parsed = parser.parse_args(args)
    for dataset_id in sorted(ir_datasets.registry):
        parsed.out.write(f'{dataset_id}\n')
if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: ir_datasets/commands/lookup.py
================================================
import sys
import argparse
import ir_datasets
from ir_datasets.commands.export import DEFAULT_EXPORTERS
_logger = ir_datasets.log.easy()
def qid_lookup(dataset, args):
    """Looks up queries by ID via the dataset's queries_store and exports each hit."""
    assert hasattr(dataset, 'queries_handler')
    writer = DEFAULT_EXPORTERS[args.format](dataset.queries_cls(), args.out, args.fields)
    store = dataset.queries_store()
    for qid in args.ids:
        try:
            writer.next(store.get(qid))
        except KeyError:
            _logger.warn(f'query_id {qid} not found')
def did_lookup(dataset, args):
    """Looks up documents by ID via the dataset's docs_store and exports each hit."""
    assert hasattr(dataset, 'docs_handler')
    writer = DEFAULT_EXPORTERS[args.format](dataset.docs_cls(), args.out, args.fields)
    store = dataset.docs_store()
    for did in args.ids:
        try:
            writer.next(store.get(did))
        except KeyError:
            _logger.warn(f'doc_id {did} not found')
def main(args):
    """CLI entry point for ``ir_datasets lookup``: doc lookups by default, query lookups with --qid."""
    parser = argparse.ArgumentParser(prog='ir_datasets lookup', description='Provides fast lookups of documents and queries '
        'using docs_store. Unlike using the exporter and grep (or similar), this tool builds '
        'an index for O(log(n)) lookups.')
    parser.add_argument('dataset')
    parser.set_defaults(out=sys.stdout)
    parser.add_argument('--format', choices=DEFAULT_EXPORTERS.keys(), default='tsv')
    parser.add_argument('--fields', nargs='+')
    parser.add_argument('--qid', '--query_id', '-q', action='store_true')
    parser.add_argument('ids', nargs='+')
    args = parser.parse_args(args)
    try:
        dataset = ir_datasets.load(args.dataset)
    except KeyError:
        sys.stderr.write(f"Dataset {args.dataset} not found.\n")
        sys.exit(1)
    lookup_fn = qid_lookup if args.qid else did_lookup
    lookup_fn(dataset, args)
if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: ir_datasets/datasets/__init__.py
================================================
from . import base
from . import antique
from . import aol_ia
from . import aquaint
from . import argsme
from . import beir
from . import c4
from . import car
from . import clinicaltrials
from . import clirmatrix
from . import clueweb09
from . import clueweb12
from . import codec
from . import cord19
from . import cranfield
from . import csl
from . import disks45
from . import dpr_w100
from . import codesearchnet
from . import gov
from . import gov2
from . import highwire
from . import istella22
from . import kilt
from . import lotte
from . import medline
from . import miracl
from . import mmarco
from . import mr_tydi
from . import msmarco_document
from . import msmarco_document_v2
from . import msmarco_passage
from . import msmarco_passage_v2
from . import msmarco_qna
from . import nano_beir
from . import neumarco
from . import nfcorpus
from . import natural_questions
from . import nyt
from . import pmc
from . import touche_image
from . import touche # must be after argsme,clueweb12,touche_image
from . import trec_arabic
from . import trec_mandarin
from . import trec_spanish
from . import trec_robust04
from . import trec_tot
from . import tripclick
from . import tweets2013_ia
from . import vaswani
from . import wapo
from . import wikiclir
from . import wikir
from . import trec_fair
from . import trec_cast # must be after wapo,car,msmarco_passage
from . import hc4
from . import neuclir # must be after hc4
from . import sara
from . import trec_tot_2025
================================================
FILE: ir_datasets/datasets/antique.py
================================================
import io
import ir_datasets
from ir_datasets.formats import TsvDocs, TrecQrels, TsvQueries
from ir_datasets.util import DownloadConfig, Lazy
from .base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
__all__ = ['collection', 'subsets']
_logger = ir_datasets.log.easy()
NAME = 'antique'
# NOTE(review): the DUA URL was lost in extraction (second string literal is empty) — confirm against upstream
DUA = ("Please confirm you agree to the authors' data usage agreement found at "
"")
# Qrel defs taken verbatim from the ANTIQUE annotation guidelines (source link lost in
# extraction); the run-together words (e.g. "parwith") are in the original text.
QREL_DEFS = {
    4: "It looks reasonable and convincing. Its quality is on parwith or better than the "
       "\"Possibly Correct Answer\". Note that it does not have to provide the same answer "
       "as the \"PossiblyCorrect Answer\".",
    3: "It can be an answer to the question, however, it is notsufficiently convincing. "
       "There should be an answer with much better quality for the question.",
    2: "It does not answer the question or if it does, it provides anunreasonable answer, "
       "however, it is not out of context. Therefore, you cannot accept it as an answer to "
       "the question.",
    1: "It is completely out of context or does not make any sense.",
}
# 200 query IDs held out of the training set to form the train/split200-* splits
VALIDATION_QIDS = {'1158088', '4032777', '1583099', '263783', '4237144', '1097878', '114758', '1211877', '1188438', '2689609', '1191621', '2571912', '1471877', '2961191', '2630860', '4092472', '3178012', '358253', '3913653', '844617', '2764765', '212427', '220575', '11706', '4069320', '3280274', '3159749', '4217473', '4042061', '1037897', '103298', '332662', '752633', '2704', '3635284', '2235825', '3651236', '2155390', '3752394', '2008456', '98438', '511835', '1647624', '3884772', '1536937', '544869', '66151', '2678635', '963523', '1881436', '993601', '3608433', '2048278', '3124162', '1907320', '1970273', '2891885', '2858043', '189364', '397709', '3470651', '3885753', '1933929', '94629', '2500918', '1708787', '2492366', '17665', '278043', '643630', '1727343', '196651', '3731489', '2910592', '1144768', '2573745', '546552', '1341602', '317469', '2735795', '1251077', '3507499', '3374970', '1034050', '1246269', '2901754', '2137263', '1295284', '2180502', '406082', '1443637', '2620488', '3118286', '3814583', '3738877', '684633', '2094435', '242701', '2613648', '2942624', '1495234', '1440810', '2421078', '961127', '595342', '363519', '4048305', '485408', '2573803', '3104841', '3626847', '727663', '3961', '4287367', '2112535', '913424', '1514356', '1512776', '937635', '1321784', '1582044', '1467322', '461995', '884643', '4338583', '2550445', '4165672', '1016750', '1184520', '3152714', '3617468', '3172166', '4031702', '2534994', '2035638', '404359', '1398838', '4183127', '2418824', '2439070', '2632334', '4262151', '3841762', '4400543', '2147417', '514804', '1423289', '2041828', '2776069', '1458676', '3407617', '1450678', '1978816', '2466898', '1607303', '2175167', '772988', '1289770', '3382182', '3690922', '1051346', '344029', '2357505', '1907847', '2587810', '3272207', '2522067', '1107012', '554539', '489705', '3652886', '4287894', '4387641', '1727879', '348777', '566364', '2678484', '4450252', '986260', '4336509', '3824106', '2169746', '2700836', '3495304', '3083719', 
'126182', '1607924', '1485589', '3211282', '2546730', '2897078', '3556937', '2113006', '929821', '2306533', '2543919', '1639607', '3958214', '2677193', '763189'}
def _init():
    """Builds and registers the antique dataset and its subsets; returns (collection, subsets)."""
    documentation = YamlDocumentation('docs/antique.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    # all subsets share a single document collection
    collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    subsets = {}
    for subset in ('train', 'test'):
        qrels = TrecQrels(dlc[f'{subset}/qrels'], QREL_DEFS)
        queries = TsvQueries(dlc[f'{subset}/queries'], namespace=NAME, lang='en')
        subsets[subset] = Dataset(collection, queries, qrels)
    # Split the training data into training and validation data
    validation_qids = Lazy(lambda: VALIDATION_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='exclude'),
        subsets['train'])
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='include'),
        subsets['train'])
    # Separate test set removing the "offensive (and noisy)" questions
    disallow_list = dlc['disallow_list']
    def disllow_qids():  # NOTE: "disllow" typo kept as-is; the name is rebound to a Lazy below
        # read the disallow list lazily: one query_id per line
        with disallow_list.stream() as stream:
            stream = io.TextIOWrapper(stream)
            return {l.rstrip() for l in stream}
    disllow_qids = Lazy(disllow_qids)
    subsets['test/non-offensive'] = Dataset(
        FilteredQueries(subsets['test'].queries_handler(), disllow_qids, mode='exclude'),
        FilteredQrels(subsets['test'].qrels_handler(), disllow_qids, mode='exclude'),
        subsets['test'])
    # register the top-level dataset and each subset under '{NAME}/{subset}'
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/aol_ia.py
================================================
from datetime import datetime
import json
import pickle
import re
import contextlib
from collections import Counter
from hashlib import md5
import ir_datasets
from typing import NamedTuple, Tuple
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file
from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, BaseQlogs
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS, DocstoreOptions, PickleLz4FullStore
_logger = ir_datasets.log.easy()
NAME = 'aol-ia'
# single relevance level: a clicked result is treated as relevant
QREL_DEFS = {
    1: 'clicked',
}
# query and doc IDs are truncated hex md5 digests (of the normalized query text / clicked URL; see AolManager.build)
QID_LEN = 14
DID_LEN = 12
class LogItem(NamedTuple):
    """A single result entry within a query log record."""
    doc_id: str  # truncated md5 of the clicked URL (DID_LEN chars)
    rank: int  # NOTE(review): build() passes the rank column through as a string — confirm it is converted to int somewhere
    clicked: bool
class AolQlog(NamedTuple):
    """One AOL query log record, as produced by AolManager.build()."""
    user_id: str  # AnonID column from the raw log
    query_id: str  # truncated md5 of the normalized query (QID_LEN chars)
    query: str  # whitespace-normalized query text
    query_orig: str  # query text exactly as it appeared in the raw log
    time: datetime  # parsed from the log's QueryTime column (ISO format)
    items: Tuple[LogItem, ...]  # clicked results for this log line (empty when no click)
class AolIaDoc(NamedTuple):
    """A document for the AOL query log, scraped via the external aolia-tools downloader."""
    doc_id: str  # truncated md5 of the URL (DID_LEN chars)
    title: str
    text: str
    url: str  # original URL as it appeared in the log
    ia_url: str  # populated from the scrape's 'wb_url' field — presumably the Internet Archive (wayback) capture URL; confirm
    def default_text(self):
        """
        title and text
        """
        return f'{self.title} {self.text}'
class AolQlogs(BaseQlogs):
    """Query log records stored as an lz4-compressed stream of pickled AolQlog objects."""
    def __init__(self, dlc):
        self.dlc = dlc
    def qlogs_iter(self):
        # the backing file is a sequence of pickles; read until the stream is exhausted
        LZ4FrameFile = ir_datasets.lazy_libs.lz4_frame().frame.LZ4FrameFile
        with self.dlc.stream() as raw, LZ4FrameFile(raw) as decompressed:
            while True:
                try:
                    record = pickle.load(decompressed)
                except EOFError:
                    return
                yield record
    def qlogs_cls(self):
        return AolQlog
    def qlogs_count(self):
        return 36_389_567
class _ManagedDlc:
def __init__(self, manager, path):
self._manager = manager
self._path = path
@contextlib.contextmanager
def stream(self):
self._manager.build()
with open(self._path, 'rb') as f:
yield f
def path(self, force=True):
if force:
self._manager.build()
return self._path
class AolManager:
    """Coordinates building the derived AOL-IA assets (docstore, queries, qrels, query log).

    The raw AOL log files are parsed once into queries.tsv / qrels / log.pkl.lz4
    under base_path; documents must first be scraped with the external
    aolia-tools downloader (see the error message in _build_docs).
    """
    def __init__(self, log_dlcs, id2wb_dlc, base_path):
        self._log_dlcs = log_dlcs
        self.id2wb_dlc = id2wb_dlc # exposed for aolia-tools
        self._docs_store = None
        self._base_path = base_path
        self._logs_built = None  # tri-state: None means "not yet checked on disk"
        if not self._base_path.exists():
            self._base_path.mkdir(exist_ok=True, parents=True)
    def docs_store(self, options=DEFAULT_DOCSTORE_OPTIONS):
        # ensure documents are ingested before handing out the store
        self._build_docs()
        return self._internal_docs_store(options)
    def _internal_docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS):
        # lazily construct the docstore handle (does not ingest documents itself)
        if self._docs_store is None:
            self._docs_store = PickleLz4FullStore(self._base_path/'docs.pklz4', None, AolIaDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME), options=options)
        return self._docs_store
    def _build_docs(self):
        # one-time ingestion of the scraped documents into the docstore
        if self._internal_docs_store().built():
            return
        # the scrape is performed by an external tool; refuse to proceed until its done-marker exists
        if not (self._base_path/'downloaded_docs'/'_done').exists():
            raise RuntimeError('''To use the documents of AOLIA, you will need to run the download script in https://github.com/terrierteam/aolia-tools. To run the script, use the following commands:
git clone https://github.com/terrierteam/aolia-tools
cd aolia-tools
pip install -r requirements.txt
python downloader.py
''')
        LZ4FrameFile = ir_datasets.lazy_libs.lz4_frame().frame.LZ4FrameFile
        with _logger.pbar_raw(desc='', total=1525535) as pbar, self._internal_docs_store().lookup.transaction() as transaction:
            for file in sorted((self._base_path/'downloaded_docs').glob('*.jsonl.lz4')):
                pbar.set_postfix({'file': file.name})
                docs = []
                with LZ4FrameFile(file, 'rb') as fin:
                    for line in fin:
                        doc = json.loads(line)
                        docs.append(AolIaDoc(doc['doc_id'], doc['title'], doc['text'], doc['url'], doc['wb_url']))
                        pbar.update()
                for doc in sorted(docs, key=lambda x: x.doc_id): # sort the documents in each file before adding them to the docstore. This ensures a consistent ordering.
                    transaction.add(doc)
    def build(self):
        """Parses the raw AOL logs into queries.tsv, qrels, and a pickled log stream (idempotent)."""
        if self._logs_built is None:
            self._logs_built = (self._base_path/'_built_logs').exists()
        if self._logs_built:
            return # already built
        # sessionizer = Sessionizer()
        lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame
        encountered_qids = set()  # used to de-duplicate queries across log lines
        with finialized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \
             finialized_file(self._base_path/'qrels', 'wt') as f_qrels, \
             finialized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \
             lz4_frame.LZ4FrameFile(f_log, 'wb') as f_log, \
             _logger.pbar_raw(desc=f'preparing {NAME} log lines', total=36389567) as pbar:
            for dlc in self._log_dlcs:
                with dlc.stream() as fin:
                    assert next(fin) == b'AnonID\tQuery\tQueryTime\tItemRank\tClickURL\n' # skip header
                    for line in fin:
                        pbar.update()
                        cols = line.decode().rstrip('\n').split('\t')
                        if tuple(cols[3:]) == ('', ''):
                            # no click on this log line: rank/url columns are empty
                            user_id, query, query_time, _, _ = cols
                            rank, url = None, None
                        else:
                            user_id, query, query_time, rank, url = cols
                        norm_query = ' '.join(ir_datasets.util.ws_tok(query))
                        query_id = md5(norm_query.encode()).hexdigest()[:QID_LEN]
                        if query_id not in encountered_qids:
                            f_queries.write(f'{query_id}\t{norm_query}\n')
                            encountered_qids.add(query_id)
                        log_items = []
                        if url is not None:
                            doc_id = md5(url.encode()).hexdigest()[:DID_LEN]
                            # NOTE(review): the user_id is written in the qrels' second (iteration) column — confirm downstream readers expect this
                            f_qrels.write(f'{query_id}\t{user_id}\t{doc_id}\t1\n')
                            log_items.append(LogItem(doc_id, rank, True))
                        log_record = AolQlog(user_id, query_id, norm_query, query, datetime.fromisoformat(query_time), tuple(log_items))
                        pickle.dump(log_record, f_log)
        (self._base_path/'_built_logs').touch()
        self._logs_built = True
    def file_ref(self, path):
        # returns a DLC-like handle that triggers build() on first access
        return _ManagedDlc(self, self._base_path/path)
def _init():
    """Construct and register the aol-ia dataset.

    Returns (base, subsets, manager, base_path); the manager and base_path are
    also exposed at module level for external tooling.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # The ten gzipped query-log shards from the original AOL distribution, plus
    # the id->wayback mapping used to build the document collection.
    manager = AolManager([
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-01.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-02.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-03.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-04.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-05.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-06.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-07.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-08.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-09.txt.gz')),
        GzipExtract(TarExtract(dlc['logs'], 'AOL-user-ct-collection/user-ct-test-collection-10.txt.gz')),
    ], GzipExtract(dlc['id2wb']), base_path)
    # queries.tsv / qrels / log.pkl.lz4 are derived files built by the manager on demand.
    base = Dataset(
        DocstoreBackedDocs(manager.docs_store, docs_cls=AolIaDoc, namespace=NAME, lang=None),
        TsvQueries(manager.file_ref('queries.tsv'), lang=None),
        TrecQrels(manager.file_ref('qrels'), QREL_DEFS),
        AolQlogs(manager.file_ref('log.pkl.lz4')),
        documentation('_'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets, manager, base_path
# Be sure to keep MANAGER and PATH here; they are used by aolia-tools
base, subsets, MANAGER, PATH = _init()
================================================
FILE: ir_datasets/datasets/aquaint.py
================================================
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
NAME = 'aquaint'
# Relevance levels used by the TREC Robust 2005 judgments.
QREL_DEFS = {
    2: 'highly relevant',
    1: 'relevant',
    0: 'not relevant',
}
# Maps the (optional) tag prefixes found in TREC topic files to query fields
# (patterns consumed by TrecQueries when parsing topics).
QTYPE_MAP = {
    ' *(Number:)?': 'query_id',
    ' *(Topic:)?': 'title',
    ' *(Description:)?': 'description',
    ' *(Narrative:)?': 'narrative'
}
def _init():
    """Construct and register the aquaint dataset and its trec-robust-2005 subset.

    Returns (base, subsets).
    """
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # The AQUAINT collection: AP Worldstream, NY Times, and Xinhua sources.
    collection = TrecDocs(dlc['docs'], encoding='utf8', path_globs=['aquaint_comp/apw/*/*.gz', 'aquaint_comp/nyt/*/*.gz', 'aquaint_comp/xie/*/*.gz'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets['trec-robust-2005'] = Dataset(
        TrecQueries(dlc['trec-robust-2005/queries'], qtype_map=QTYPE_MAP, namespace='trec-robust', lang='en'),
        TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS),
        collection,
        documentation('trec-robust-2005'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/argsme.py
================================================
from itertools import chain
from typing import Dict
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ArgsMeDocs, ArgsMeProcessedDocs, ArgsMeCombinedDocs
from ir_datasets.util import DownloadConfig, home_path, Cache, ZipExtract, TarExtract
NAME = "argsme"
# name -> (count_hint, language, file name inside the downloaded archive)
SUBSETS = {
    '1.0': (387692, "en", "args-me.json"),
    '1.0-cleaned': (382545, "en", "args-me-1.0-cleaned.json"),
    '2020-04-01/debateorg': (338620, "en", "debateorg.json"),
    '2020-04-01/debatepedia': (21197, "en", "debatepedia.json"),
    '2020-04-01/debatewise': (14353, "en", "debatewise.json"),
    '2020-04-01/idebate': (13522, "en", "idebate.json"),
    '2020-04-01/parliamentary': (48, "en", "parliamentary.json"),
}
# Same tuple layout as SUBSETS, but delivered as a tar archive (see _init).
PROCESSED_SUBSETS = {
    '2020-04-01/processed': (365408, "en", "args_processed.csv"),
}
# name -> (constituent subset names, count_hint, language)
COMBINED_SUBSETS = {
    '2020-04-01': (
        [
            '2020-04-01/debateorg',
            '2020-04-01/debatepedia',
            '2020-04-01/debatewise',
            '2020-04-01/idebate',
            '2020-04-01/parliamentary'
        ],
        387740,
        "en"
    ),
}
def _init():
    """Construct and register the args.me datasets.

    Returns (base, datasets) where datasets maps subset name -> Dataset.
    """
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)
    base = Dataset(documentation('_'))
    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {
        name: ArgsMeDocs(
            Cache(
                ZipExtract(
                    download_config[name],
                    zip_path
                ),
                base_path / f"{name}.json"
            ),
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint
        )
        for name, (count_hint, language, zip_path)
        in SUBSETS.items()
    }
    # Processed arguments that can be loaded from Zenodo.
    # (Delivered as a tar archive, hence TarExtract; the variable is still
    # named zip_path to mirror the SUBSETS tuple layout.)
    processed_arguments: Dict[str, ArgsMeProcessedDocs] = {
        name: ArgsMeProcessedDocs(
            Cache(
                TarExtract(
                    download_config[name],
                    zip_path
                ),
                base_path / f"{name}.json"
            ),
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint
        )
        for name, (count_hint, language, zip_path)
        in PROCESSED_SUBSETS.items()
    }
    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedDocs] = {
        name: ArgsMeCombinedDocs(
            base_path / f"{name}.json",
            [arguments[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint
        )
        for name, (subset_names, count_hint, language)
        in COMBINED_SUBSETS.items()
    }
    # Wrap in datasets with documentation.
    datasets = {
        name: Dataset(
            arguments,
            documentation(name)
        )
        for name, arguments in chain(
            arguments.items(),
            processed_arguments.items(),
            combined_arguments.items(),
        )
    }
    # NOTE: the following datasets are defined in touche.py:
    #  - argsme/1.0/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2020-task-1
    #  - argsme/2020-04-01/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2021-task-1
    #  - argsme/2020-04-01/processed/touche-2022-task-1
    # Register datasets.
    registry.register(NAME, base)
    for name, arguments in datasets.items():
        registry.register(f'{NAME}/{name}', arguments)
    return base, datasets
# NOTE(review): _init() returns a (base, datasets) tuple despite the singular
# name 'dataset' -- confirm whether callers rely on this before renaming.
dataset = _init()
================================================
FILE: ir_datasets/datasets/base.py
================================================
import pkgutil
import contextlib
import itertools
from pathlib import Path
import ir_datasets
from ir_datasets.formats import BaseQueries, BaseQrels, BaseScoredDocs, BaseDocPairs
_logger = ir_datasets.log.easy()  # module-level logger
class Dataset:
def __init__(self, *constituents):
self._constituents = [c for c in constituents if c is not None]
self._beta_apis = {}
def __getstate__(self):
return self._constituents
def __setstate__(self, state):
self._constituents = state
def __getattr__(self, attr):
if attr == 'docs' and self.has_docs():
if 'docs' not in self._beta_apis:
self._beta_apis['docs'] = _BetaPythonApiDocs(self)
return self._beta_apis['docs']
if attr == 'queries' and self.has_queries():
if 'queries' not in self._beta_apis:
self._beta_apis['queries'] = _BetaPythonApiQueries(self)
return self._beta_apis['queries']
if attr == 'qrels' and self.has_qrels():
if 'qrels' not in self._beta_apis:
self._beta_apis['qrels'] = _BetaPythonApiQrels(self)
return self._beta_apis['qrels']
if attr == 'scoreddocs' and self.has_scoreddocs():
if 'scoreddocs' not in self._beta_apis:
self._beta_apis['scoreddocs'] = _BetaPythonApiScoreddocs(self)
return self._beta_apis['scoreddocs']
if attr == 'docpairs' and self.has_docpairs():
if 'docpairs' not in self._beta_apis:
self._beta_apis['docpairs'] = _BetaPythonApiDocpairs(self)
return self._beta_apis['docpairs']
if attr == 'qlogs' and self.has_qlogs():
if 'qlogs' not in self._beta_apis:
self._beta_apis['qlogs'] = _BetaPythonApiQlogs(self)
return self._beta_apis['qlogs']
for cons in self._constituents:
if hasattr(cons, attr):
return getattr(cons, attr)
raise AttributeError(attr)
def __repr__(self):
supplies = []
if self.has_docs():
supplies.append('docs')
if self.has_queries():
supplies.append('queries')
if self.has_qrels():
supplies.append('qrels')
if self.has_scoreddocs():
supplies.append('scoreddocs')
if self.has_docpairs():
supplies.append('docpairs')
if self.has_qlogs():
supplies.append('qlogs')
if hasattr(self, 'dataset_id'):
return f'Dataset(id={repr(self.dataset_id())}, provides={repr(supplies)})'
else:
return f'Dataset(provides={repr(supplies)})'
def __dir__(self):
result = set(dir(super()))
for cons in self._constituents:
result |= set(dir(cons))
return list(result)
def has(self, etype: ir_datasets.EntityType) -> bool:
etype = ir_datasets.EntityType(etype) # validate & allow strings
return hasattr(self, f'{etype.value}_handler')
def has_docs(self):
return self.has(ir_datasets.EntityType.docs)
def has_queries(self):
return self.has(ir_datasets.EntityType.queries)
def has_qrels(self):
return self.has(ir_datasets.EntityType.qrels)
def has_scoreddocs(self):
return self.has(ir_datasets.EntityType.scoreddocs)
def has_docpairs(self):
return self.has(ir_datasets.EntityType.docpairs)
def has_qlogs(self):
return self.has(ir_datasets.EntityType.qlogs)
class _BetaPythonApiDocs:
    """Beta python API wrapper around a docs handler."""
    def __init__(self, handler):
        self._handler = handler
        self._docstore = None  # created lazily by _store()
        self.type = handler.docs_cls()
        self.lang = handler.docs_lang()
    def __iter__(self):
        return self._handler.docs_iter()
    def __len__(self):
        return self._handler.docs_count()
    def __getitem__(self, key):
        return self._handler.docs_iter()[key]
    def __repr__(self):
        return f'BetaPythonApiDocs({repr(self._handler)})'
    def _store(self):
        # Lazily create and cache the docstore used for lookups.
        if self._docstore is None:
            self._docstore = self._handler.docs_store()
        return self._docstore
    def lookup(self, doc_ids):
        """Look up a single doc (str id) or many docs (iterable of ids)."""
        store = self._store()
        if isinstance(doc_ids, str):
            return store.get(doc_ids)
        return store.get_many(doc_ids)
    def lookup_iter(self, doc_ids):
        """Iterator flavor of lookup()."""
        store = self._store()
        if isinstance(doc_ids, str):
            yield store.get(doc_ids)
        else:
            yield from store.get_many_iter(doc_ids)
    @property
    def metadata(self):
        return self._handler.docs_metadata()
class _BetaPythonApiQueries:
    """Beta python API wrapper around a queries handler."""
    def __init__(self, handler):
        self._handler = handler
        self._query_lookup = None  # built lazily by _mapping()
        self.type = handler.queries_cls()
        self.lang = handler.queries_lang()
    def __iter__(self):
        return self._handler.queries_iter()
    def __repr__(self):
        return f'BetaPythonApiQueries({repr(self._handler)})'
    def _mapping(self):
        # Build (once) and cache a query_id -> query mapping for lookups.
        if self._query_lookup is None:
            self._query_lookup = {q.query_id: q for q in self._handler.queries_iter()}
        return self._query_lookup
    def __len__(self):
        count = self._handler.queries_count() if hasattr(self._handler, 'queries_count') else None
        if count is not None:
            return count
        # Fall back to materializing the lookup map and counting it.
        return len(self._mapping())
    def lookup(self, query_ids):
        """Single id -> query (KeyError if absent); iterable -> {id: query}, silently skipping missing ids."""
        mapping = self._mapping()
        if isinstance(query_ids, str):
            return mapping[query_ids]
        return {qid: mapping[qid] for qid in query_ids if qid in mapping}
    def lookup_iter(self, query_ids):
        """Iterator flavor of lookup()."""
        mapping = self._mapping()
        if isinstance(query_ids, str):
            yield mapping[query_ids]
        else:
            yield from (mapping[qid] for qid in query_ids if qid in mapping)
    @property
    def metadata(self):
        return self._handler.queries_metadata()
class _BetaPythonApiQrels:
    """Beta python API wrapper around a qrels handler."""
    def __init__(self, handler):
        self._handler = handler
        self.type = handler.qrels_cls()
        self.defs = handler.qrels_defs()
        self._qrels_dict = None  # built lazily by asdict()
    def __iter__(self):
        return self._handler.qrels_iter()
    def __repr__(self):
        return f'BetaPythonApiQrels({repr(self._handler)})'
    def asdict(self):
        """Return (and cache) the handler's qrels as a nested dict."""
        if self._qrels_dict is None:
            self._qrels_dict = self._handler.qrels_dict()
        return self._qrels_dict
    def __len__(self):
        count = self._handler.qrels_count() if hasattr(self._handler, 'qrels_count') else None
        if count is not None:
            return count
        # Fall back to counting the entries of the (cached) dict form.
        return sum(len(per_query) for per_query in self.asdict().values())
    @property
    def metadata(self):
        return self._handler.qrels_metadata()
class _BetaPythonApiScoreddocs:
    """Beta python API wrapper around a scoreddocs handler."""
    def __init__(self, handler):
        self._handler = handler
        self.type = handler.scoreddocs_cls()
    def __iter__(self):
        return self._handler.scoreddocs_iter()
    def __repr__(self):
        return f'BetaPythonApiScoreddocs({repr(self._handler)})'
    def __len__(self):
        count = self._handler.scoreddocs_count() if hasattr(self._handler, 'scoreddocs_count') else None
        if count is not None:
            return count
        # No count available from the handler: fall back to a full scan.
        return sum(1 for _ in self._handler.scoreddocs_iter())
    @property
    def metadata(self):
        return self._handler.scoreddocs_metadata()
class _BetaPythonApiDocpairs:
    """Beta python API wrapper around a docpairs handler."""
    def __init__(self, handler):
        self._handler = handler
        self.type = handler.docpairs_cls()
    def __iter__(self):
        return self._handler.docpairs_iter()
    def __repr__(self):
        return f'BetaPythonApiDocpairs({repr(self._handler)})'
    def __len__(self):
        count = self._handler.docpairs_count() if hasattr(self._handler, 'docpairs_count') else None
        if count is not None:
            return count
        # No count available from the handler: fall back to a full scan.
        return sum(1 for _ in self._handler.docpairs_iter())
    @property
    def metadata(self):
        return self._handler.docpairs_metadata()
class _BetaPythonApiQlogs:
    """Beta python API wrapper around a qlogs handler."""
    def __init__(self, handler):
        self._handler = handler
        self.type = handler.qlogs_cls()
    def __iter__(self):
        return self._handler.qlogs_iter()
    def __repr__(self):
        return f'BetaPythonApiQlogs({repr(self._handler)})'
    def __len__(self):
        count = self._handler.qlogs_count() if hasattr(self._handler, 'qlogs_count') else None
        if count is not None:
            return count
        # No count available from the handler: fall back to a full scan.
        return sum(1 for _ in self._handler.qlogs_iter())
    @property
    def metadata(self):
        return self._handler.qlogs_metadata()
class FilteredQueries(BaseQueries):
    """Wraps a queries handler, keeping (mode='include') or dropping
    (mode='exclude') queries whose ID is in a lazily-computed set."""
    def __init__(self, queries_handler, lazy_qids, mode='include'):
        self._queries_handler = queries_handler
        self._lazy_qids = lazy_qids  # callable returning the set of query IDs
        self._mode = mode
    def queries_iter(self):
        # Generator: nothing (including lazy_qids) is evaluated until iterated.
        if self._mode not in ('include', 'exclude'):
            raise KeyError(self._mode)
        qids = self._lazy_qids()
        want_member = self._mode == 'include'
        for query in self._queries_handler.queries_iter():
            if (query.query_id in qids) == want_member:
                yield query
    def queries_cls(self):
        return self._queries_handler.queries_cls()
    def queries_handler(self):
        return self
    def queries_lang(self):
        return self._queries_handler.queries_lang()
class FilteredQrels(BaseQrels):
    """Wraps a qrels handler, keeping (mode='include') or dropping
    (mode='exclude') judgments whose query ID is in a lazily-computed set."""
    def __init__(self, qrels_handler, lazy_qids, mode='include'):
        self._qrels_handler = qrels_handler
        self._lazy_qids = lazy_qids  # callable returning the set of query IDs
        self._mode = mode
    def qrels_iter(self):
        # Generator: nothing (including lazy_qids) is evaluated until iterated.
        if self._mode not in ('include', 'exclude'):
            raise KeyError(self._mode)
        qids = self._lazy_qids()
        want_member = self._mode == 'include'
        for qrel in self._qrels_handler.qrels_iter():
            if (qrel.query_id in qids) == want_member:
                yield qrel
    def qrels_defs(self):
        return self._qrels_handler.qrels_defs()
    def qrels_handler(self):
        return self
class FilteredScoredDocs(BaseScoredDocs):
    """Wraps a scoreddocs handler, keeping (mode='include') or dropping
    (mode='exclude') entries whose query ID is in a lazily-computed set."""
    def __init__(self, scoreddocs_handler, lazy_qids, mode='include'):
        self._scoreddocs_handler = scoreddocs_handler
        self._lazy_qids = lazy_qids  # callable returning the set of query IDs
        self._mode = mode
    def scoreddocs_iter(self):
        # Generator: nothing (including lazy_qids) is evaluated until iterated.
        if self._mode not in ('include', 'exclude'):
            raise KeyError(self._mode)
        qids = self._lazy_qids()
        want_member = self._mode == 'include'
        for scoreddoc in self._scoreddocs_handler.scoreddocs_iter():
            if (scoreddoc.query_id in qids) == want_member:
                yield scoreddoc
    def scoreddocs_handler(self):
        return self
class FilteredDocPairs(BaseDocPairs):
    """Wraps a docpairs handler, keeping (mode='include') or dropping
    (mode='exclude') pairs whose query ID is in a lazily-computed set."""
    def __init__(self, docpairs_handler, lazy_qids, mode='include'):
        self._docpairs_handler = docpairs_handler
        self._lazy_qids = lazy_qids  # callable returning the set of query IDs
        self._mode = mode
    def docpairs_iter(self):
        # Generator: nothing (including lazy_qids) is evaluated until iterated.
        if self._mode not in ('include', 'exclude'):
            raise KeyError(self._mode)
        qids = self._lazy_qids()
        want_member = self._mode == 'include'
        for docpair in self._docpairs_handler.docpairs_iter():
            if (docpair.query_id in qids) == want_member:
                yield docpair
    def docpairs_handler(self):
        return self
class YamlDocumentation:
    """Lazily loads dataset documentation from a YAML file bundled with the package."""
    def __init__(self, file):
        self._file = file  # path relative to the ir_datasets package
        self._contents = None  # parsed YAML mapping; None until first access
    def __call__(self, key):
        """Return a provider for the documentation under *key*."""
        return YamlDocumentationProvider(self, key)
    def get_key(self, key):
        """Return the documentation mapping for *key* (None if absent), parsing the YAML on first use."""
        # BUG FIX: use a None sentinel rather than truthiness. With `if not
        # self._contents:`, a YAML file that parsed to an empty document was
        # re-read and re-parsed on every call.
        if self._contents is None:
            yaml = ir_datasets.lazy_libs.yaml()
            data = pkgutil.get_data('ir_datasets', self._file)
            contents = yaml.load(data, Loader=yaml.BaseLoader) # only strings
            # An empty YAML document parses to None; normalize to {} so .get works.
            self._contents = contents if contents is not None else {}
        return self._contents.get(key)
class YamlDocumentationProvider:
    """Provides the documentation for a single key of a YamlDocumentation file."""
    def __init__(self, documentation, key):
        self._documentation = documentation
        self._key = key
    def documentation(self):
        """Return the documentation mapping for this key, or {} if the key is absent."""
        docs = self._documentation.get_key(self._key)
        # BUG FIX: previously the condition called get_key() a second time;
        # test the value we already fetched instead.
        if docs:
            return dict(docs.items())
        return {}
class Deprecated:
    """Marker constituent indicating a dataset is deprecated, carrying an explanatory message."""
    def __init__(self, message):
        self._message = message
    def deprecated(self):
        """Return the deprecation message."""
        return self._message
class ExpectedFile:
    """A file the user is expected to supply manually (e.g., licensed data).

    path() raises IOError (with optional formatted instructions) when the file
    is missing; stream() opens it, optionally verifying an MD5 hash.
    """
    def __init__(self, path, expected_md5=None, instructions=None):
        self._path = Path(path)
        self._expected_md5 = expected_md5
        self._instructions = instructions  # may contain a {path} placeholder
    def path(self, force=True):
        """Return the path; when force=True, raise IOError if the file does not exist."""
        if force and not self._path.exists():
            # Create the parent directory so the user can drop the file in place.
            self._path.parent.mkdir(parents=True, exist_ok=True)
            if self._instructions:
                inst = '\n\n' + self._instructions.format(path=self._path)
            else:
                inst = ''
            raise IOError(f"{self._path} does not exist.{inst}")
        return self._path
    @contextlib.contextmanager
    def stream(self):
        """Open the file for binary reading, wrapping it in an MD5-checking stream if configured."""
        with self.path().open('rb') as result:
            if self._expected_md5:
                result = ir_datasets.util.HashStream(result, expected=self._expected_md5, algo='md5')
            yield result
class Concat(Dataset):
    """Dataset whose ``*_iter`` attributes chain the corresponding iterators of
    all constituents that provide them."""
    def __getattr__(self, attr):
        if attr.endswith('_iter'):
            # Collect the bound *_iter callables (not the iterators themselves).
            fns = [getattr(ds, attr) for ds in self._constituents if hasattr(ds, attr)]
            if fns:
                # BUG FIX: previously the iterators were created eagerly and
                # captured in the returned lambda, so (a) merely accessing the
                # attribute consumed work and (b) calling the returned function
                # a second time chained already-exhausted iterators. Defer
                # iterator creation to call time so every call yields a fresh chain.
                return lambda: itertools.chain.from_iterable(fn() for fn in fns)
        return super().__getattr__(attr)
================================================
FILE: ir_datasets/datasets/beir.py
================================================
import json
import codecs
from typing import NamedTuple, Dict, List
import ir_datasets
from ir_datasets.util import ZipExtract, Cache, Lazy, Migrator
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.formats import BaseQueries, BaseDocs, BaseQrels, GenericDoc, GenericQuery, TrecQrel
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()  # module-level logger
NAME = 'beir'  # dataset namespace / registry prefix
class BeirDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    metadata: Dict[str, str]  # extra per-benchmark fields from the source JSONL
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated.
        return '{} {}'.format(self.title, self.text)
class BeirTitleDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated.
        return '{} {}'.format(self.title, self.text)
class BeirTitleUrlDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    url: str
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated (URL not included).
        return '{} {}'.format(self.title, self.text)
class BeirSciDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    authors: List[str]
    year: int
    cited_by: List[str]
    references: List[str]
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated (citation metadata not included).
        return '{} {}'.format(self.title, self.text)
class BeirCordDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    url: str
    pubmed_id: str
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated.
        return '{} {}'.format(self.title, self.text)
class BeirToucheDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    stance: str
    url: str
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated.
        return '{} {}'.format(self.title, self.text)
class BeirCqaDoc(NamedTuple):
    doc_id: str
    text: str
    title: str
    tags: List[str]
    def default_text(self):
        """
        title text
        """
        # Title and body, space-separated (tags not included).
        return '{} {}'.format(self.title, self.text)
class BeirUrlQuery(NamedTuple):
    query_id: str
    text: str
    url: str  # source URL associated with the query
    def default_text(self):
        """
        text
        """
        return self.text
class BeirSciQuery(NamedTuple):
    query_id: str
    text: str
    # Citation metadata of the source paper (from the benchmark's metadata field).
    authors: List[str]
    year: int
    cited_by: List[str]
    references: List[str]
    def default_text(self):
        """
        text
        """
        return self.text
class BeirToucheQuery(NamedTuple):
    query_id: str
    text: str
    description: str
    narrative: str
    def default_text(self):
        """
        text
        """
        return self.text
class BeirCovidQuery(NamedTuple):
    query_id: str
    text: str
    query: str  # original (keyword) query form from the benchmark metadata
    narrative: str
    def default_text(self):
        """
        text
        """
        return self.text
class BeirCqaQuery(NamedTuple):
    query_id: str
    text: str
    tags: List[str]
    def default_text(self):
        """
        text
        """
        return self.text
def _map_field(field, data):
    """Map a namedtuple field name onto its value in a BEIR JSONL record."""
    if field in ('doc_id', 'query_id'):
        return data['_id']
    if field in ('text', 'title'):
        return data[field]
    # All remaining fields live in the record's nested metadata object.
    return data['metadata'][field]
class BeirDocs(BaseDocs):
    """Documents of a BEIR benchmark, read from the corpus.jsonl inside the benchmark archive."""
    def __init__(self, name, dlc, doc_type):
        super().__init__()
        self._name = name  # benchmark name, e.g. 'scifact' or 'cqadupstack/android'
        self._dlc = dlc
        self._doc_type = doc_type  # the NamedTuple class records are parsed into
    def docs_iter(self):
        # Iterate via the docstore (built on first use) so slicing/seeking works.
        return iter(self.docs_store())
    def _docs_iter(self):
        # Raw source iterator used to build the docstore: one JSON object per line.
        with self._dlc.stream() as stream:
            for line in stream:
                data = json.loads(line)
                yield self._doc_type(*(_map_field(f, data) for f in self._doc_type._fields))
    def docs_cls(self):
        return self._doc_type
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME/self._name}/docs.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{self._name}'),
            options=options
        )
    def docs_count(self):
        # Only known once the docstore has been built; returns None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return f'{NAME}/{self._name}'
    def docs_lang(self):
        return 'en'
class BeirQueries(BaseQueries):
    """Queries of a BEIR benchmark, read from the queries.jsonl inside the benchmark archive."""
    def __init__(self, name, dlc, query_type):
        super().__init__()
        self._name = name
        self._dlc = dlc
        self._query_type = query_type  # the NamedTuple class records are parsed into
    def queries_iter(self):
        # One JSON object per line; fields mapped via _map_field.
        with self._dlc.stream() as stream:
            for line in stream:
                data = json.loads(line)
                yield self._query_type(*(_map_field(f, data) for f in self._query_type._fields))
    def queries_cls(self):
        return self._query_type
    def queries_namespace(self):
        return f'{NAME}/{self._name}'
    def queries_lang(self):
        return 'en'
class BeirQrels(BaseQrels):
    """Qrels parsed from BEIR's TSV format (header: query-id, corpus-id, score)."""
    def __init__(self, qrels_dlc, qrels_defs):
        self._qrels_dlc = qrels_dlc
        self._qrels_defs = qrels_defs
    def qrels_path(self):
        return self._qrels_dlc.path()
    def qrels_iter(self):
        with self._qrels_dlc.stream() as f:
            f = codecs.getreader('utf8')(f)
            it = iter(f)
            assert next(it).strip() == 'query-id\tcorpus-id\tscore' # header row
            for line in it:
                if line == '\n':
                    continue # ignore blank lines
                # NOTE: split() splits on any whitespace run, not only tabs.
                cols = line.rstrip().split()
                if len(cols) != 3:
                    raise RuntimeError(f'expected 3 columns, got {len(cols)}')
                qid, did, score = cols
                # BEIR qrels have no iteration field; '0' is used as a placeholder.
                yield TrecQrel(qid, did, int(score), '0')
    def qrels_cls(self):
        return TrecQrel
    def qrels_defs(self):
        return self._qrels_defs
def _init():
    """Construct and register the BEIR benchmark datasets.

    Returns (base, subsets). Benchmarks with a single qrels split get their
    qrels directly; multi-split benchmarks get one sub-dataset per split with
    queries filtered down to the judged IDs.
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    subsets = {}
    # benchmark name -> (qrels splits, doc record type, query record type)
    benchmarks = {
        'msmarco': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'trec-covid': (['test'], BeirCordDoc, BeirCovidQuery),
        'nfcorpus': (['train', 'dev', 'test'], BeirTitleUrlDoc, BeirUrlQuery),
        'nq': (['test'], BeirTitleDoc, GenericQuery),
        'hotpotqa': (['train', 'dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'fiqa': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'arguana': (['test'], BeirTitleDoc, GenericQuery),
        'webis-touche2020': (['test'], BeirToucheDoc, BeirToucheQuery),
        'webis-touche2020/v2': (['test'], BeirToucheDoc, BeirToucheQuery),
        'quora': (['dev', 'test'], GenericDoc, GenericQuery),
        'dbpedia-entity': (['dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'scidocs': (['test'], BeirSciDoc, BeirSciQuery),
        'fever': (['train', 'dev', 'test'], BeirTitleDoc, GenericQuery),
        'climate-fever': (['test'], BeirTitleDoc, GenericQuery),
        'scifact': (['train', 'test'], BeirTitleDoc, GenericQuery),
    }
    for ds, (qrels, doc_type, query_type) in benchmarks.items():
        dlc_ds = dlc[ds]
        # e.g. 'webis-touche2020/v2' lives inside the 'webis-touche2020' zip.
        ds_zip = ds.split('/')[0]
        docs_migrator = Migrator(base_path/ds/'irds_version.txt', 'v2',
            affected_files=[f'{base_path/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/{ds} (structuring fields)')
        docs = docs_migrator(BeirDocs(ds, ZipExtract(dlc_ds, f'{ds_zip}/corpus.jsonl'), doc_type))
        queries = BeirQueries(ds, Cache(ZipExtract(dlc_ds, f'{ds_zip}/queries.jsonl'), base_path/ds/'queries.json'), query_type)
        if len(qrels) == 1:
            # Single split: the qrels live directly on the benchmark dataset.
            subsets[ds] = Dataset(
                docs,
                queries,
                BeirQrels(Cache(ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrels[0]}.tsv'), base_path/ds/f'{qrels[0]}.qrels'), qrels_defs={}),
                documentation(ds)
            )
        else:
            # Multiple splits: benchmark-level dataset has no qrels; one
            # sub-dataset per split, with queries filtered to judged IDs.
            subsets[ds] = Dataset(
                docs,
                queries,
                documentation(ds)
            )
            for qrel in qrels:
                subset_qrels = BeirQrels(Cache(ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrel}.tsv'), base_path/ds/f'{qrel}.qrels'), qrels_defs={})
                subset_qids = qid_filter(subset_qrels)
                subsets[f'{ds}/{qrel}'] = Dataset(
                    docs,
                    FilteredQueries(queries, subset_qids, mode='include'),
                    subset_qrels,
                    documentation(f'{ds}/{qrel}')
                )
    # cqadupstack: one zip containing all twelve StackExchange sub-collections.
    cqa = ['android', 'english', 'gaming', 'gis', 'mathematica', 'physics', 'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress']
    cqa_dlc = dlc['cqadupstack']
    for ds in cqa:
        docs_migrator = Migrator(base_path/'cqadupstack'/ds/'irds_version.txt', 'v2',
            affected_files=[f'{base_path/"cqadupstack"/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/cqadupstack/{ds} (structuring fields)')
        subsets[f'cqadupstack/{ds}'] = Dataset(
            docs_migrator(BeirDocs(f'cqadupstack/{ds}', ZipExtract(cqa_dlc, f'cqadupstack/{ds}/corpus.jsonl'), BeirCqaDoc)),
            BeirQueries(f'cqadupstack/{ds}', Cache(ZipExtract(cqa_dlc, f'cqadupstack/{ds}/queries.jsonl'), base_path/'cqadupstack'/ds/'queries.json'), BeirCqaQuery),
            BeirQrels(Cache(ZipExtract(cqa_dlc, f'cqadupstack/{ds}/qrels/test.tsv'), base_path/'cqadupstack'/ds/f'test.qrels'), qrels_defs={}),
            documentation(f'cqadupstack/{ds}')
        )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def qid_filter(subset_qrels):
    """Return a Lazy set of the query IDs judged in ``subset_qrels``.

    NOTE: this must remain a separate function (not inlined in _init's loop)
    so each Lazy closes over its own subset_qrels, avoiding late-binding
    problems with lambdas created in a loop.
    """
    def _qid_set():
        return {qrel.query_id for qrel in subset_qrels.qrels_iter()}
    return Lazy(_qid_set)
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/c4.py
================================================
import re
import os
import json
import pickle
from pathlib import Path
from typing import NamedTuple, Tuple
import ir_datasets
from ir_datasets.util import DownloadConfig, Download, RequestsDownload, TarExtractAll, GzipExtract
from ir_datasets.formats import BaseDocs, TrecXmlQueries, DocSourceSeekableIter, DocSource, SourceDocIter
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import Docstore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()  # module-level logger
NAME = 'c4'  # dataset namespace / registry prefix
# Maps TREC misinformation topic XML tags to MisinfoQuery fields.
misinfo_map = {'number': 'query_id', 'query': 'text', 'description': 'description', 'narrative': 'narrative', 'disclaimer': 'disclaimer', 'stance': 'stance', 'evidence': 'evidence'}
class C4Doc(NamedTuple):
    doc_id: str  # "<shard name>.<position in shard>", see C4SourceIter
    text: str
    url: str
    timestamp: str
    def default_text(self):
        """
        text
        """
        return self.text
class MisinfoQuery(NamedTuple):
    # Fields of a TREC Health Misinformation track topic (see misinfo_map above).
    query_id: str
    text: str
    description: str
    narrative: str
    disclaimer: str
    stance: str
    evidence: str
    def default_text(self):
        """
        text
        """
        return self.text
class C4Source(DocSource):
    """A single .json.gz shard of the C4 corpus, with seek checkpoints."""
    def __init__(self, name, dlc, checkpoint_dlc, doc_count, checkpoint_freq, size_hint, cache_path):
        self.name = name # e.g., en.noclean.c4-train.01234-of-07168
        self.dlc = dlc
        self.checkpoint_dlc = checkpoint_dlc
        self.doc_count = doc_count
        self.checkpoint_freq = checkpoint_freq
        self._checkpoints = None  # loaded lazily by checkpoints()
        self.size_hint = size_hint
        self.cache_path = cache_path
    def __len__(self):
        return self.doc_count
    def __iter__(self):
        return C4SourceIter(self)
    def checkpoints(self):
        """Load (once) and return this shard's pickled seek checkpoints."""
        if self._checkpoints is None:
            file_name = self.dlc.path().split('/')[-1] + '.chk.pkl.lz4'
            full_path = os.path.join(self.checkpoint_dlc.path(), file_name)
            lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame
            with lz4_frame.open(full_path) as fin:
                self._checkpoints = pickle.load(fin)
        return self._checkpoints
class C4SourceIter(DocSourceSeekableIter):
    """Seekable iterator over the documents of a single C4Source shard.

    Reads the gzip shard line-by-line; seek() can jump forward efficiently by
    restoring pre-built decompression-state checkpoints.
    """
    def __init__(self, source):
        self.source = source
        self.idx = 0  # index of the next document to be returned
        self.source_f = ir_datasets.lazy_libs.zlib_state().GzipStateFile(self.source.dlc.path())
    def close(self):
        # Idempotent: safe to call multiple times.
        if self.source_f is not None:
            self.source_f.close()
            self.source_f = None
    def __next__(self):
        line = self.source_f.readline()
        if not line:
            raise StopIteration()
        data = json.loads(line)
        # doc_ids are "<shard name>.<position in shard>".
        doc_id = f'{self.source.name}.{self.idx}'
        self.idx += 1
        return C4Doc(doc_id, data['text'], data['url'], data['timestamp'])
    def seek(self, idx):
        # Restore from a checkpoint when going backward, OR when the target is
        # both in a different checkpoint window AND more than 100 docs ahead.
        # (NOTE: `and` binds tighter than `or` here, so short forward jumps
        # skip checkpoint loading and just read sequentially below.)
        if (idx < self.idx) or \
           (idx // self.source.checkpoint_freq != self.idx // self.source.checkpoint_freq) and \
           (idx - self.idx > 100):
            # either we're going backward in the file or the index is in a different
            # checkpoint than we're at now, so we can jump ahead.
            # (or we're not jumping very far ahead (<100 documents), so don't bother
            # loading checkpoints, e.g., this is a case where step is used when iterating
            # over the documents.)
            target_checkpoint = idx // self.source.checkpoint_freq
            checkpoints = self.source.checkpoints()
            # Clamp to the last available checkpoint.
            effective_checkpoint = min(target_checkpoint, len(checkpoints) - 1)
            pos, state, offset = checkpoints[effective_checkpoint]
            self.source_f.zseek(pos, state)
            self.source_f.read(offset)
            self.idx = effective_checkpoint * self.source.checkpoint_freq
        while idx > self.idx:
            # read the file in sequence 'till we get to the desired index
            self.source_f.readline()
            self.idx += 1
class C4Docstore(Docstore):
    """Docstore over C4Docs that resolves doc_ids by seeking within the source shards."""
    def __init__(self, docs, options=DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(docs.docs_cls(), 'doc_id', options=options)
        self.docs = docs
    def get_many_iter(self, doc_ids):
        """Yield the documents for the requested IDs; malformed or unknown IDs are silently skipped."""
        # Group the requested document indexes by the shard file they live in.
        id_pattern = re.compile(r'^en.noclean.c4-train.(\d+)-of-07168.(\d+)$')
        by_file = {}
        for doc_id in doc_ids:
            match = id_pattern.match(doc_id)
            if match is None:
                continue
            file_idx, doc_idx = (int(group) for group in match.groups())
            by_file.setdefault(file_idx, []).append(doc_idx)
        sources = self.docs._docs_sources()
        for file_idx, doc_idxs in by_file.items():
            if file_idx >= len(sources):
                continue
            with iter(sources[file_idx]) as it:
                # Visit indexes in ascending order so every seek moves forward.
                for doc_idx in sorted(doc_idxs):
                    it.seek(doc_idx)
                    doc = next(it, None)
                    if doc is not None:
                        yield doc
class C4Docs(BaseDocs):
    """The C4 en.noclean document collection; shard files are downloaded lazily."""
    def __init__(self, sources_dlc, checkpoint_dlc, base_path, source_name_filter=None, filter_name=''):
        super().__init__()
        self._sources_dlc = sources_dlc  # JSON manifest describing every shard file
        self._checkpoint_dlc = checkpoint_dlc  # archive of seek checkpoints for the shards
        self._sources = None  # built lazily by _docs_sources()
        self._base_path = Path(base_path)
        self._source_name_filter = source_name_filter  # regex; shards with non-matching names are skipped
        self._filter_name = filter_name  # suffix of the "_built" marker file
    def docs_iter(self):
        return SourceDocIter(self, slice(0, self.docs_count(force=True)))
    def docs_cls(self):
        return C4Doc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        assert field == 'doc_id'
        return C4Docstore(self, options=options)
    def docs_count(self, force=False):
        # Counting requires the shard manifest; unless force=True, only report a
        # count when the sources are already loaded (returns None otherwise).
        if force or self._sources is not None:
            return sum(s.doc_count for s in self._docs_sources())
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
    def docs_source_iter(self):
        return iter(self._docs_sources())
    def _docs_sources(self):
        # Parse the manifest (once) and, unless the build marker exists, ensure
        # every shard file is present (downloading any that are missing) and
        # matches its expected size.
        if self._sources is None:
            sources = []
            with self._sources_dlc.stream() as stream:
                for source in json.load(stream):
                    if self._source_name_filter:
                        if not re.match(self._source_name_filter, source['name']):
                            continue
                    cache_path = os.path.join(self._base_path, 'en.noclean', source['url'].split('/')[-1])
                    dlc = Download([RequestsDownload(source['url'])], expected_md5=source['expected_md5'], cache_path=cache_path)
                    sources.append(C4Source(source['name'].replace('.json.gz', ''), dlc, self._checkpoint_dlc, source['doc_count'], source['checkpoint_freq'], source['size_hint'], cache_path))
            self._sources = sources
            build_flag = self._base_path / 'en.noclean' / f'_built{self._filter_name}'
            if not build_flag.exists():
                # Warn about the total download size (and check disk space) before fetching.
                remaining_size = sum(s.size_hint for s in sources if not os.path.exists(s.cache_path))
                if remaining_size > 0:
                    _logger.info(f'Will start downloading c4/en-noclean files ({ir_datasets.util.format_file_size(remaining_size)}). '
                        f'If you already have a copy, you may link them to {self._base_path / "en.noclean"} (should contain '
                        f'files like c4-train.00000-of-07168.json.gz)')
                    ir_datasets.util.check_disk_free(self._base_path / 'en.noclean', remaining_size)
                for source in sources:
                    path = source.dlc.path() # downloads if it doesn't already exist
                    # A quick check that should help make sure it's probably correct if the user downloaded
                    # it themselves. (Not much overhead if downloaded ourselves.)
                    true_size = os.path.getsize(path)
                    if true_size != source.size_hint:
                        raise RuntimeError(f'Expected {path} to be {source.size_hint} bytes but it was actually {true_size} bytes.')
                build_flag.touch()
        return self._sources
def _init():
    """Build and register the c4 datasets (en-noclean training subsets)."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    en_noclean_tr_collection = C4Docs(
        GzipExtract(dlc['en-noclean/sources']),
        TarExtractAll(dlc['en-noclean/checkpoints'], base_path / 'en.noclean.checkpoints'),
        base_path, source_name_filter=r'en\.noclean\.c4-train', filter_name='train') # exclude validation files (only include train)
    base = Dataset(documentation('_'))
    subsets['en-noclean-tr'] = Dataset(
        en_noclean_tr_collection,
        documentation('en-noclean-tr'))
    # TREC Health Misinformation 2021 queries over the same collection
    subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset(
        en_noclean_tr_collection,
        TrecXmlQueries(dlc['trec-misinfo-2021/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo', lang='en'),
        documentation('en-noclean-tr/trec-misinfo-2021'))
    ir_datasets.registry.register(NAME, base)
    for subset in subsets:
        ir_datasets.registry.register(f'{NAME}/{subset}', subsets[subset])
    return base, subsets
# Registration happens at import time; base/subsets are exposed at module level.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/car.py
================================================
from typing import NamedTuple, Tuple
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, ReTar
from ir_datasets.formats import TrecQrels, BaseDocs, BaseQueries, GenericDoc
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
NAME = 'car'
# Automatic judgments: a paragraph is relevant iff it appeared under the
# query's heading in the source article.
AUTO_QRELS = {
    1: 'Paragraph appears under heading'
}
# Manual (assessor-provided) graded judgments from TREC CAR 2017.
MANUAL_QRELS = {
    3: 'MUST be mentioned',
    2: 'SHOULD be mentioned',
    1: 'CAN be mentioned',
    0: 'Non-relevant, but roughly on TOPIC',
    -1: 'NO, non-relevant',
    -2: 'Trash',
}
class CarQuery(NamedTuple):
    """A TREC CAR outline query: a Wikipedia page title plus one heading path."""
    query_id: str  # '/'-joined page id and heading ids
    text: str  # title and headings joined with spaces (see CarQueries.queries_iter)
    title: str  # the page name
    headings: Tuple[str, ...]  # heading texts along the path
    def default_text(self):
        """
        text (which is title + headings)
        """
        return self.text
class CarDocs(BaseDocs):
    """Paragraph collection for TREC CAR, read from a cbor paragraph dump."""
    def __init__(self, streamer, count_hint=None):
        super().__init__()
        self._streamer = streamer
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        """Yield each paragraph as a GenericDoc (paragraph id + text)."""
        # trec_car is imported lazily so it is only required when iterating
        trec_car = ir_datasets.lazy_libs.trec_car()
        with self._streamer.stream() as cbor_stream:
            for paragraph in trec_car.read_data.iter_paragraphs(cbor_stream):
                yield GenericDoc(paragraph.para_id, paragraph.get_text())
    def docs_cls(self):
        return GenericDoc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        """Build (or open) the pickle/lz4 docstore for fast doc_id lookups."""
        home = ir_datasets.util.home_path()
        return PickleLz4FullStore(
            path=f'{home/NAME}/docs.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options,
        )
    def docs_count(self):
        # Only report a count once the docstore has been built; None otherwise.
        store = self.docs_store()
        return store.count() if store.built() else None
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
class CarQueries(BaseQueries):
    """Queries from TREC CAR outline files: one query per flat heading path."""
    def __init__(self, streamer):
        super().__init__()
        self._streamer = streamer
    def queries_iter(self):
        """Yield a CarQuery for every heading path of every outline page."""
        trec_car = ir_datasets.lazy_libs.trec_car()
        with self._streamer.stream() as outline_stream:
            for page in trec_car.read_data.iter_outlines(outline_stream):
                for heading_path in page.flat_headings_list():
                    query_id = '/'.join([page.page_id] + [h.headingId for h in heading_path])
                    page_title = page.page_name
                    heading_texts = tuple(h.heading for h in heading_path)
                    # full text is the title followed by each heading on the path
                    full_text = ' '.join((page_title,) + heading_texts)
                    yield CarQuery(query_id, full_text, page_title, heading_texts)
    def queries_namespace(self):
        return NAME
    def queries_cls(self):
        return CarQuery
    def queries_lang(self):
        return 'en'
def _init():
    """Build and register the TREC CAR datasets (v1.5 and v2.0)."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    docs_v15 = CarDocs(TarExtract(dlc['docs'], 'paragraphcorpus/paragraphcorpus.cbor', compression='xz'), count_hint=ir_datasets.util.count_hint(f'{NAME}/v1.5'))
    docs_v20 = CarDocs(TarExtract(dlc['docs/v2.0'], 'paragraphCorpus/dedup.articles-paragraphs.cbor', compression='xz'), count_hint=ir_datasets.util.count_hint(f'{NAME}/v2.0'))
    base = Dataset(documentation('_'))
    subsets['v1.5'] = Dataset(docs_v15, documentation('v1.5'))
    subsets['v1.5/trec-y1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['trec-y1/queries'], 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines', compression='xz')),)
    # trec-y1 judgments come in manual (assessor) and automatic (heading-derived) variants
    subsets['v1.5/trec-y1/manual'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels'), MANUAL_QRELS))
    subsets['v1.5/trec-y1/auto'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels'), AUTO_QRELS))
    subsets['v1.5/test200'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['test200'], 'test200/train.test200.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(dlc['test200'], 'test200/train.test200.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    # Re-packs only the needed members of the large train tarball into a smaller one.
    train_data = ReTar(dlc['train'], base_path/'train.smaller.tar.xz', ['train/train.fold?.cbor.outlines', 'train/train.fold?.cbor.hierarchical.qrels'], compression='xz')
    subsets['v1.5/train/fold0'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold0.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold0.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    subsets['v1.5/train/fold1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold1.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold1.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    subsets['v1.5/train/fold2'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold2.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold2.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    subsets['v1.5/train/fold3'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold3.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold3.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    subsets['v1.5/train/fold4'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold4.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold4.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    subsets['v2.0'] = Dataset(docs_v20, documentation('v2.0'))
    ir_datasets.registry.register(NAME, base)
    # Documentation is attached here, so subsets above don't each need to pass it.
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return base, subsets
# Registration happens at import time.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/clinicaltrials.py
================================================
import codecs
import itertools
import io
import gzip
from contextlib import ExitStack
import itertools
from typing import NamedTuple, Tuple
import tarfile
import zipfile
import xml.etree.ElementTree as ET
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, TrecQrels, TrecXmlQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from . import medline
_logger = ir_datasets.log.easy()
# Graded relevance levels used by the TREC Precision Medicine qrels (2017-2019).
QREL_DEFS = {
    0: 'not relevant',
    1: 'possibly relevant',
    2: 'definitely relevant'
}
# Relevance levels for TREC Clinical Trials 2021 (eligibility-oriented labels).
QREL_DEFS_2021 = {
    0: 'Not Relevant',
    1: 'Excluded',
    2: 'Eligible',
}
NAME = 'clinicaltrials'
# Maps the <topic> XML tag onto the GenericQuery 'text' field.
ct_qmap = {'topic': 'text'}
class ClinicalTrialsDoc(NamedTuple):
    """A clinical trial record with key text fields extracted from its XML."""
    doc_id: str  # value of the <nct_id> tag
    title: str  # official title, falling back to the brief title
    condition: str  # condition under study ('' if absent)
    summary: str  # brief summary ('' if absent)
    detailed_description: str  # '' if absent
    eligibility: str  # eligibility criteria text ('' if absent)
class ClinicalTrialsDocs(BaseDocs):
    """Collection of clinical-trial XML records.

    Records are read from one or more downloaded archives (``tgz`` or
    ``zip``); each member ``.xml`` file is parsed into a ClinicalTrialsDoc.
    """
    def __init__(self, name, dlcs, compress_format='tgz', count_hint=None):
        super().__init__()  # consistent with the other BaseDocs subclasses
        self._name = name
        self._dlcs = dlcs
        self._compress_format = compress_format
        self._count_hint = count_hint
    def docs_iter(self):
        return iter(self.docs_store())
    def _docs_iter(self):
        # Raw pass over the source archives; used to build the docstore.
        for dlc in self._dlcs:
            with dlc.stream() as stream, ExitStack() as stack:
                # Normalize tar/zip to a common (iterator, extractor, path-attr) view.
                if self._compress_format == 'tgz':
                    tarf = stack.enter_context(tarfile.open(fileobj=stream, mode='r|gz'))
                    tarf_iter = iter(tarf)
                    extract = tarf.extractfile
                    path_attr = 'path'
                elif self._compress_format == 'zip':
                    tarf = stack.enter_context(zipfile.ZipFile(stream))
                    tarf_iter = tarf.filelist
                    extract = tarf.open
                    path_attr = 'filename'
                else:
                    raise ValueError('unknown compress format')
                for record in tarf_iter:
                    if getattr(record, path_attr).endswith('.xml'):
                        xml = extract(record).read()
                        yield self._parse_doc(xml)
    def _parse_doc(self, xml):
        """Parse one clinical-trial XML record into a ClinicalTrialsDoc.

        BUGFIX: these checks previously relied on Element truthiness
        (e.g. ``if not title``), which is False for an element with no
        *children* -- even when the element carries text. Text-only fields
        such as ``<condition>Asthma</condition>`` were silently dropped and
        a present ``<official_title>`` was wrongly overridden by the brief
        title. Testing against ``None`` (the documented "not found" result
        of ``Element.find``) restores the intended behavior.
        """
        xml = ET.fromstring(xml)
        doc_id = ''.join(xml.find('.//nct_id').itertext())
        title = xml.find('.//official_title')
        if title is None:
            title = xml.find('.//brief_title')
        title = ''.join(title.itertext())
        condition = xml.find('.//condition')
        condition = ''.join(condition.itertext()) if condition is not None else ''
        summary = xml.find('.//brief_summary')
        summary = ''.join(summary.itertext()) if summary is not None else ''
        detailed_description = xml.find('.//detailed_description')
        detailed_description = ''.join(detailed_description.itertext()) if detailed_description is not None else ''
        eligibility = xml.find('.//eligibility/criteria')
        eligibility = ''.join(eligibility.itertext()) if eligibility is not None else ''
        return ClinicalTrialsDoc(doc_id, title, condition, summary, detailed_description, eligibility)
    def docs_path(self, force=True):
        # ``force`` is accepted for interface compatibility; the path is static.
        return ir_datasets.util.home_path()/NAME/self._name/'corpus'
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )
    def docs_cls(self):
        return ClinicalTrialsDoc
    def docs_namespace(self):
        return NAME
    def docs_count(self):
        # Only known once the docstore has been built; None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_lang(self):
        return 'en'
def _init():
    """Build and register the clinicaltrials datasets (2017/2019/2021 corpora)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    base = Dataset(documentation('_'))
    # 2017 ships as one tgz; 2019 as four tgz parts; 2021 as five zip parts.
    collection17 = ClinicalTrialsDocs('2017', [dlc['docs/2017']], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    collection19 = ClinicalTrialsDocs('2019', [dlc['docs/2019/0'], dlc['docs/2019/1'], dlc['docs/2019/2'], dlc['docs/2019/3']], count_hint=ir_datasets.util.count_hint(f'{NAME}/2019'))
    collection21 = ClinicalTrialsDocs('2021', [dlc['docs/2021/1'], dlc['docs/2021/2'], dlc['docs/2021/3'], dlc['docs/2021/4'], dlc['docs/2021/5']], compress_format='zip', count_hint=ir_datasets.util.count_hint(f'{NAME}/2021'))
    subsets['2017'] = Dataset(collection17, documentation('2017'))
    subsets['2019'] = Dataset(collection19, documentation('2019'))
    subsets['2021'] = Dataset(collection21, documentation('2021'))
    # TREC PM 2017/2018 reuse the query handlers defined in the medline module.
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        medline.subsets['2017/trec-pm-2017'].queries_handler(),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017')
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        medline.subsets['2017/trec-pm-2018'].queries_handler(),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018')
    )
    subsets['2019/trec-pm-2019'] = Dataset(
        collection19,
        TrecXmlQueries(dlc['trec-pm-2019/queries'], qtype=medline.TrecPmQuery, namespace='trec-pm-2019', lang='en'),
        TrecQrels(dlc['trec-pm-2019/qrels'], QREL_DEFS),
        documentation('trec-pm-2019')
    )
    subsets['2021/trec-ct-2021'] = Dataset(
        collection21,
        TrecXmlQueries(dlc['trec-ct-2021/queries'], qtype=GenericQuery, qtype_map=ct_qmap, namespace='trec-ct-2021', lang='en'),
        TrecQrels(dlc['trec-ct-2021/qrels'], QREL_DEFS_2021),
        documentation('trec-ct-2021'))
    # trec-ct-2022 provides queries only here (no qrels component).
    subsets['2021/trec-ct-2022'] = Dataset(
        collection21,
        TrecXmlQueries(dlc['trec-ct-2022/queries'], qtype=GenericQuery, qtype_map=ct_qmap, namespace='trec-ct-2022', lang='en'),
        documentation('trec-ct-2022'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
# Registration happens at import time.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/clirmatrix.py
================================================
import json
import contextlib
from pathlib import Path
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import GzipExtract, Lz4Extract, DownloadConfig, _DownloadConfig, MetadataProvider, MetadataComponent
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import TsvDocs, CLIRMatrixQueries, CLIRMatrixQrels
NAME = 'clirmatrix'
_logger = ir_datasets.log.easy()
# Relevance grades: levels 1-6 are derived by Jenks-optimizing BM25 retrieval
# scores obtained in the source language; 0 means the doc was not retrieved.
QRELS_DEFS = {
    6: "Most relevant, based on Jenks-optimized BM25 retrieval scores in the source language",
    5: "Jenks-optimized BM25 retrieval scores in the source language",
    4: "Jenks-optimized BM25 retrieval scores in the source language",
    3: "Jenks-optimized BM25 retrieval scores in the source language",
    2: "Jenks-optimized BM25 retrieval scores in the source language",
    1: "Jenks-optimized BM25 retrieval scores in the source language",
    0: "Document not retrieved in the source language",
}
def _init():
    """Register the CLIRMatrix datasets.

    Because of the very large number of language pairs, datasets are not
    enumerated eagerly; instead regex patterns are registered and dataset
    objects are built on demand by the initializer closures below.
    """
    # All 139 Wikipedia languages covered by CLIRMatrix.
    LANGS = ('af', 'als', 'am', 'an', 'ar', 'arz', 'ast', 'az', 'azb', 'ba', 'bar', 'be', 'bg', 'bn', 'bpy', 'br', 'bs', 'bug', 'ca', 'cdo', 'ce', 'ceb', 'ckb', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'he', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ilo', 'io', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'li', 'lmo', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'my', 'mzn', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc', 'or', 'os', 'pa', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'ro', 'ru', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', 'tg', 'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vec', 'vi', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zh')
    LANG_REGEX = '(' + '|'.join(LANGS) + ')'
    # The MULTI8 subset pairs up these eight high-resource languages.
    MULTI8_LANGS = ('ar', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'zh')
    MULTI8_LANG_REGEX = '(' + '|'.join(MULTI8_LANGS) + ')'
    base_path = ir_datasets.util.home_path()/NAME
    base_dlc = DownloadConfig.context(NAME, base_path)
    def _dlc_init():
        # The full download config is itself distributed as a gzipped JSON file.
        with GzipExtract(base_dlc['downloads']).stream() as f:
            clirmatrix_dlc = _DownloadConfig(contents=json.load(f))
        return clirmatrix_dlc
    _dlc = ir_datasets.util.Lazy(_dlc_init)
    metadata = MetadataProvider(MetadataProvider.json_loader(Lz4Extract(base_dlc['metadata'])))
    # One TsvDocs instance per language, shared across all datasets using it.
    _docs_cache = {}
    def _docs_initializer(lang_code):
        if lang_code not in _docs_cache:
            dlc = _dlc().context("clirmatrix_docs", base_path)
            docs = TsvDocs(GzipExtract(dlc[f'docs/{lang_code}']), namespace=f'{NAME}/{lang_code}', lang=lang_code)
            _docs_cache[lang_code] = docs
        return _docs_cache[lang_code]
    def _initializer(dsid, args, dlc_context=None):
        # args = (docs_lang, queries_lang, split); the latter two may be None
        # for corpus-only datasets.
        docs_lang, queries_lang, split = args
        docs = _docs_initializer(docs_lang)
        components = [docs]
        if queries_lang: # queries & split are optional
            dlc = _dlc().context(dlc_context, base_path)
            dlc_key = f'queries/{queries_lang}_{docs_lang}/{split}'
            # Queries and qrels are read from the same downloaded file.
            qrel_dlc = GzipExtract(dlc[dlc_key])
            qrels = CLIRMatrixQrels(qrel_dlc, QRELS_DEFS)
            queries = CLIRMatrixQueries(qrel_dlc, queries_lang)
            components += [queries, qrels]
        result = Dataset(*components)
        result = Dataset(MetadataComponent(dsid, result, metadata), result)
        return result
    def _multi8_initializer(dsid, args):
        return _initializer(dsid, args, 'clirmatrix_multi8')
    def _bi139_base_initializer(dsid, args):
        return _initializer(dsid, args, 'clirmatrix_bi139_base')
    def _bi139_full_initializer(dsid, args):
        return _initializer(dsid, args, 'clirmatrix_bi139_full')
    def _corpus_initializer(dsid, args):
        return _initializer(dsid, (args[0], None, None))
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{LANG_REGEX}$', _corpus_initializer)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{MULTI8_LANG_REGEX}/multi8/{MULTI8_LANG_REGEX}/(train|dev|test1|test2)$', _multi8_initializer)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{LANG_REGEX}/bi139-base/{LANG_REGEX}/(train|dev|test1|test2)$', _bi139_base_initializer)
    ir_datasets.registry.register_pattern(rf'^{NAME}/{LANG_REGEX}/bi139-full/{LANG_REGEX}/(train|dev|test1|test2)$', _bi139_full_initializer)
    return base
# Registration happens at import time.
collection = _init()
================================================
FILE: ir_datasets/datasets/clueweb09.py
================================================
import os
import codecs
from pathlib import Path
from typing import NamedTuple, Tuple
from glob import glob
import ir_datasets
from ir_datasets.util import GzipExtract, Lazy, DownloadConfig, TarExtract, Cache, Bz2Extract, ZipExtract, TarExtractAll
from ir_datasets.formats import TrecQrels, TrecDocs, TrecXmlQueries, WarcDocs, GenericDoc, GenericQuery, TrecQrel, TrecSubQrels, TrecSubQrel, TrecSubtopic, TrecPrel, TrecPrels, TrecColonQueries, BaseQrels
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.indices import Docstore, CacheDocstore
NAME = 'clueweb09'
# Graded judgments used by the TREC Web Track 2010-2012 qrels.
QREL_DEFS = {
    4: 'Nav: This page represents a home page of an entity directly named by the query; the user may be searching for this specific page or site.',
    3: 'Key: This page or site is dedicated to the topic; authoritative and comprehensive, it is worthy of being a top result in a web search engine.',
    2: 'HRel: The content of this page provides substantial information on the topic.',
    1: 'Rel: The content of this page provides some information on the topic, which may be minimal; the relevant information must be on that page, not just promising-looking anchor text pointing to a possibly useful page.',
    0: 'Non: The content of this page does not provide useful information on the topic, but may provide useful information on other topics, including other interpretations of the same query.',
    -2: 'Junk: This page does not appear to be useful for any reasonable purpose; it may be spam or junk',
}
# The 2009 track used a smaller 3-level scale (prels format).
QREL_DEFS_09 = {
    2: 'highly relevant',
    1: 'relevant',
    0: 'not relevant',
}
# Subtopic-level (diversity) judgments for 2009 are binary.
SQREL_DEFS_09 = {
    1: 'relevant',
    0: 'not relevant'
}
class TrecWebTrackQuery(NamedTuple):
    """A TREC Web Track topic: keyword query, description, and subtopics."""
    query_id: str
    query: str  # the keyword query
    description: str
    type: str  # topic 'type' attribute from the track XML — value set not shown here
    subtopics: Tuple[TrecSubtopic, ...]  # used by the diversity task
    def default_text(self):
        """
        query
        """
        return self.query
class ClueWeb09Docs(WarcDocs):
    """WARC-based access to the ClueWeb09 corpus, or a language subset of it.

    The corpus must already be available on disk (via docs_dlc); ``dirs``
    selects which top-level corpus directories are included.
    """
    def __init__(self, docs_dlc, chk_dlc, dirs=None, lang=None):
        super().__init__(warc_cw09=True, lang=lang)
        self.docs_dlc = docs_dlc
        self.chk_dlc = chk_dlc  # per-file checkpoint index used for fast lookups
        # All available languages
        self.dirs = dirs or ['ClueWeb09_Arabic_1', 'ClueWeb09_Chinese_1', 'ClueWeb09_Chinese_2', 'ClueWeb09_Chinese_3', 'ClueWeb09_Chinese_4', 'ClueWeb09_English_1', 'ClueWeb09_English_2', 'ClueWeb09_English_3', 'ClueWeb09_English_4', 'ClueWeb09_English_5', 'ClueWeb09_English_6', 'ClueWeb09_English_7', 'ClueWeb09_English_8', 'ClueWeb09_English_9', 'ClueWeb09_English_10', 'ClueWeb09_French_1', 'ClueWeb09_German_1', 'ClueWeb09_Italian_1', 'ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2', 'ClueWeb09_Korean_1', 'ClueWeb09_Portuguese_1', 'ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2']
        self._docs_warc_file_counts_cache = None
    def docs_path(self, force=True):
        return self.docs_dlc.path(force)
    def _docs_iter_source_files(self):
        # Yield every *.gz WARC file under the configured directories,
        # sorted within each directory for deterministic iteration order.
        files = []
        for d in self.dirs:
            files += sorted(glob(os.path.join(self.docs_dlc.path(), d, '*')))
        for source_dir in files:
            for source_file in sorted(glob(os.path.join(source_dir, '*.gz'))):
                yield source_file
    def _docs_id_to_source_file(self, doc_id):
        """Map a doc_id like clueweb09-<sec>-<part>-<doc> to its WARC file (or None)."""
        parts = doc_id.split('-')
        if len(parts) != 4:
            return None
        dataset, sec, part, doc = parts
        if dataset != 'clueweb09':
            return None
        # The language directory isn't encoded in the id, so glob across all of them.
        source_glob = os.path.join(self.docs_dlc.path(), f'ClueWeb09_*', sec, f'{part}.warc.gz')
        source_file = glob(source_glob)
        if len(source_file) == 0:
            return None
        if len(source_file) > 1:
            raise ValueError(f'doc_id {doc_id} found in multiple files: {source_file}')
        return source_file[0]
    def _docs_source_file_to_checkpoint(self, source_file):
        # Mirror the source file's relative path under the checkpoint directory.
        source_prefix = Path(self.docs_dlc.path())
        source_file = Path(source_file)
        index_prefix = Path(self.chk_dlc.path())
        result = index_prefix / source_file.relative_to(source_prefix)
        if result == source_file:
            return None
        return f'{result}.chk.lz4'
    def _docs_warc_file_counts(self):
        """Return (cached) mapping of WARC file path -> record count, from record_counts files."""
        if self._docs_warc_file_counts_cache is None:
            result = {}
            for d in self.dirs:
                counts_file = os.path.join(self.docs_dlc.path(), f'record_counts/{d}_counts.txt')
                with open(counts_file, 'rt') as f:
                    for line in f:
                        file, count = line.strip().split()
                        # Fixing bug in record_counts: en0054 is under ClueWeb09_English_4, not _5
                        if d == 'ClueWeb09_English_5' and 'en0054' in file:
                            file = os.path.join(self.docs_dlc.path(), 'ClueWeb09_English_4', file[3:])
                        else:
                            file = os.path.join(self.docs_dlc.path(), d, file[3:])
                        result[file] = int(count)
            self._docs_warc_file_counts_cache = result
        return self._docs_warc_file_counts_cache
    def docs_namespace(self):
        return NAME
class CatBQrelFilter(BaseQrels):
    """Restricts qrels from a wrapped handler to ClueWeb09 Category B docs.

    Category B consists of the first English segments (en0000-en0011) plus
    the Wikipedia segments (enwp00-enwp03).
    """
    def __init__(self, qrels_handler):
        self._qrels_handler = qrels_handler
    def qrels_iter(self):
        # Same membership as the literal set {'en0000', ..., 'enwp03'}.
        catb_segments = {f'en{i:04d}' for i in range(12)} | {f'enwp{i:02d}' for i in range(4)}
        for qrel in self._qrels_handler.qrels_iter():
            _, segment, _, _ = qrel.doc_id.split('-')
            if segment in catb_segments:
                yield qrel
    def qrels_defs(self):
        return self._qrels_handler.qrels_defs()
    def qrels_cls(self):
        return self._qrels_handler.qrels_cls()
    def qrels_path(self):
        return self._qrels_handler.qrels_path()
def _init():
    """Build and register the clueweb09 datasets: per-language corpora,
    TREC Web Track 2009-2012 (ad-hoc + diversity), and Million Query 2009.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    # Lookup checkpoints, extracted next to the corpus.
    chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')
    collection = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None) # multiple langs
    collection_ar = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Arabic_1'], lang='ar')
    collection_zh = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Chinese_1', 'ClueWeb09_Chinese_2', 'ClueWeb09_Chinese_3', 'ClueWeb09_Chinese_4'], lang='zh')
    collection_en = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1', 'ClueWeb09_English_2', 'ClueWeb09_English_3', 'ClueWeb09_English_4', 'ClueWeb09_English_5', 'ClueWeb09_English_6', 'ClueWeb09_English_7', 'ClueWeb09_English_8', 'ClueWeb09_English_9', 'ClueWeb09_English_10'], lang='en')
    collection_fr = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_French_1'], lang='fr')
    collection_de = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_German_1'], lang='de')
    collection_it = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Italian_1'], lang='it')
    collection_ja = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'], lang='ja')
    collection_ko = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Korean_1'], lang='ko')
    collection_pt = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Portuguese_1'], lang='pt')
    collection_es = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'], lang='es')
    # catb docs come from the ClueWeb09_English_1 directory; the catb qrels
    # below are additionally filtered by segment id via CatBQrelFilter.
    collection_catb = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1'], lang='en')
    base = Dataset(collection, documentation('_'))
    subsets['ar'] = Dataset(collection_ar, documentation('ar'))
    subsets['zh'] = Dataset(collection_zh, documentation('zh'))
    subsets['en'] = Dataset(collection_en, documentation('en'))
    subsets['fr'] = Dataset(collection_fr, documentation('fr'))
    subsets['de'] = Dataset(collection_de, documentation('de'))
    subsets['it'] = Dataset(collection_it, documentation('it'))
    subsets['ja'] = Dataset(collection_ja, documentation('ja'))
    subsets['ko'] = Dataset(collection_ko, documentation('ko'))
    subsets['pt'] = Dataset(collection_pt, documentation('pt'))
    subsets['es'] = Dataset(collection_es, documentation('es'))
    subsets['catb'] = Dataset(collection_catb, documentation('catb'))
    subsets['en/trec-web-2009'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09),
        documentation('trec-web-2009'))
    # NOTE: Contains positive (1) and negative (0) judgements at subtopic level
    subsets['en/trec-web-2009/diversity'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecSubQrels(GzipExtract(dlc['trec-web-2009/qrels.all']), SQREL_DEFS_09),
        documentation('trec-web-2009'))
    subsets['en/trec-web-2010'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2010'))
    subsets['en/trec-web-2010/diversity'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecSubQrels(dlc['trec-web-2010/qrels.all'], QREL_DEFS),
        documentation('trec-web-2010'))
    subsets['en/trec-web-2011'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2011'))
    subsets['en/trec-web-2011/diversity'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecSubQrels(dlc['trec-web-2011/qrels.all'], QREL_DEFS),
        documentation('trec-web-2011'))
    subsets['en/trec-web-2012'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2012'))
    subsets['en/trec-web-2012/diversity'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecSubQrels(dlc['trec-web-2012/qrels.all'], QREL_DEFS),
        documentation('trec-web-2012'))
    # Category B variants reuse the same queries/qrels, with qrels filtered to catb segments.
    subsets['catb/trec-web-2009'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09)),
        documentation('trec-web-2009'))
    subsets['catb/trec-web-2009/diversity'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecSubQrels(GzipExtract(dlc['trec-web-2009/qrels.all']), SQREL_DEFS_09)),
        documentation('trec-web-2009'))
    subsets['catb/trec-web-2010'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2010'))
    subsets['catb/trec-web-2010/diversity'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecSubQrels(dlc['trec-web-2010/qrels.all'], QREL_DEFS)),
        documentation('trec-web-2010'))
    subsets['catb/trec-web-2011'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2011'))
    subsets['catb/trec-web-2011/diversity'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecSubQrels(dlc['trec-web-2011/qrels.all'], QREL_DEFS)),
        documentation('trec-web-2011'))
    subsets['catb/trec-web-2012'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2012'))
    subsets['catb/trec-web-2012/diversity'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecSubQrels(dlc['trec-web-2012/qrels.all'], QREL_DEFS)),
        documentation('trec-web-2012'))
    subsets['trec-mq-2009'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2009/queries']), encoding='latin1', lang='en'),
        TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09),
        documentation('trec-mq-2009'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
# Registration happens at import time.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/clueweb12.py
================================================
import codecs
import io
import os
import gzip
import contextlib
from typing import NamedTuple, Tuple
from glob import glob
from pathlib import Path
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, TarExtractAll, Cache, Bz2Extract, ZipExtract, IterStream
from ir_datasets.formats import TrecQrels, TrecSubQrels, TrecDocs, TrecXmlQueries, WarcDocs, GenericDoc, GenericQuery, TrecQrel, TrecSubQrel, NtcirQrels, TrecSubtopic
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.indices import Docstore, CacheDocstore
_logger = ir_datasets.log.easy()
NAME = 'clueweb12'
# Graded judgments used by the TREC Web Track qrels over ClueWeb12.
QREL_DEFS = {
    4: 'Nav: This page represents a home page of an entity directly named by the query; the user may be searching for this specific page or site.',
    3: 'Key: This page or site is dedicated to the topic; authoritative and comprehensive, it is worthy of being a top result in a web search engine.',
    2: 'HRel: The content of this page provides substantial information on the topic.',
    1: 'Rel: The content of this page provides some information on the topic, which may be minimal; the relevant information must be on that page, not just promising-looking anchor text pointing to a possibly useful page.',
    0: 'Non: The content of this page does not provide useful information on the topic, but may provide useful information on other topics, including other interpretations of the same query.',
    -2: 'Junk: This page does not appear to be useful for any reasonable purpose; it may be spam or junk',
}
# NTCIR judgments: two annotators, levels derived from their agreement.
NTCIR_QREL_DEFS = {
    0: 'Two annotators rated as non-relevant',
    1: 'One annotator rated as relevant, one as non-relevant',
    2: 'Two annotators rated as relevant, OR one rates as highly relevant and one as non-relevant',
    3: 'One annotator rated as highly relevant, one as relevant',
    4: 'Two annotators rated as highly relevant',
}
MISINFO_QREL_DEFS = {
    0: 'Not relevant',
    1: 'Relevant',
    2: 'Highly relevant',
}
EHEALTH_QREL_DEFS = {
    0: 'Not relevant',
    1: 'Somewhat relevant',
    2: 'Highly relevant',
}
# XML-tag -> query-field mappings for the various topic formats.
ntcir_map = {'qid': 'query_id', 'content': 'title', 'description': 'description'}
misinfo_map = {'number': 'query_id', 'query': 'title', 'cochranedoi': 'cochranedoi', 'description': 'description', 'narrative': 'narrative'}
ehealth_map = {'id': 'query_id', 'title': 'text'}
class TrecWebTrackQuery(NamedTuple):
    """A TREC Web Track topic: keyword query, description, and subtopics."""
    query_id: str
    query: str  # the keyword query
    description: str
    type: str  # topic 'type' attribute from the track XML — value set not shown here
    subtopics: Tuple[TrecSubtopic, ...]  # used by the diversity task
    def default_text(self):
        """
        query
        """
        return self.query
class NtcirQuery(NamedTuple):
    """An NTCIR WWW topic (title + description)."""
    query_id: str
    title: str  # mapped from the <content> tag (see ntcir_map)
    description: str
    def default_text(self):
        """
        title
        """
        return self.title
class MisinfoQuery(NamedTuple):
    """A TREC Health Misinformation topic."""
    query_id: str
    title: str
    cochranedoi: str  # presumably the DOI of the associated Cochrane review — confirm against topic XML
    description: str
    narrative: str
    def default_text(self):
        """
        title
        """
        return self.title
class MisinfoQrel(NamedTuple):
    """Multi-aspect judgment for the misinformation track (see MsinfoQrels)."""
    query_id: str
    doc_id: str
    relevance: int
    effectiveness: int
    # NOTE(review): 'redibility' looks like a typo for 'credibility' (it is fed
    # from the credibility column), but the field name is part of the public
    # tuple interface, so renaming it would break callers.
    redibility: int
class EhealthQrel(NamedTuple):
    """Multi-aspect judgment for CLEF eHealth (see EhealthQrels)."""
    query_id: str
    doc_id: str
    relevance: int
    trustworthiness: int
    understandability: int
    iteration: str  # index of the source file-set the judgment was read from
class MsinfoQrels(TrecQrels):
    """Qrels reader for the misinformation track's 6-column format:
    qid, iteration, doc_id, relevance, effectiveness, credibility.
    """
    def qrels_iter(self):
        with self._qrels_dlc.stream() as raw:
            reader = codecs.getreader('utf8')(raw)
            for line in reader:
                # ignore blank lines
                if line == '\n':
                    continue
                fields = line.rstrip().split()
                if len(fields) != 6:
                    raise RuntimeError(f'expected 6 columns, got {len(fields)}')
                query_id, _iteration, doc_id, relevance, effectiveness, credibility = fields
                yield MisinfoQrel(query_id, doc_id, int(relevance), int(effectiveness), int(credibility))
    def qrels_cls(self):
        return MisinfoQrel
class EhealthQrels(TrecQrels):
    """Qrels reader for CLEF eHealth.

    Merges parallel relevance, trustworthiness, and understandability files
    (one triple per year) line-by-line into EhealthQrel records. The iteration
    field records which triple (year index) a judgment came from, and an
    optional suffix (e.g. '-cs') is appended to query ids for the translated
    query subsets.
    """
    def __init__(self, qrels_dlcs, qtrust_dlcs, qunder_dlcs, qrels_defs, query_id_suffix=''):
        super().__init__(None, qrels_defs)
        self._qrels_dlcs = qrels_dlcs
        self._qtrust_dlcs = qtrust_dlcs
        self._qunder_dlcs = qunder_dlcs
        self._query_id_suffix = query_id_suffix
    def qrels_iter(self):
        utf8_reader = codecs.getreader('utf8')
        dlc_triples = zip(self._qrels_dlcs, self._qtrust_dlcs, self._qunder_dlcs)
        for year_idx, (rel_dlc, trust_dlc, under_dlc) in enumerate(dlc_triples):
            with rel_dlc.stream() as raw_rel, trust_dlc.stream() as raw_trust, under_dlc.stream() as raw_under:
                rel_lines = utf8_reader(raw_rel)
                trust_lines = utf8_reader(raw_trust)
                under_lines = utf8_reader(raw_under)
                for rel_line, trust_line, under_line in zip(rel_lines, trust_lines, under_lines):
                    rel = rel_line.rstrip().split()
                    trust = trust_line.rstrip().split()
                    under = under_line.rstrip().split()
                    # The three files must be strictly parallel.
                    assert len(rel) == 4 and len(trust) == 4 and len(under) == 4
                    assert rel[0] == trust[0] and trust[0] == under[0]  # qid
                    assert rel[2] == trust[2] and trust[2] == under[2]  # did
                    yield EhealthQrel(rel[0] + self._query_id_suffix, rel[2], int(rel[3]), int(trust[3]), int(under[3]), str(year_idx))
    def qrels_cls(self):
        return EhealthQrel
class FixAmp:
    """Wraps a streamer and escapes bare ampersands so the query files, which
    are not well-formed XML as distributed, can be parsed by TrecXmlQueries.

    Only stand-alone ampersands (surrounded by spaces) are rewritten to
    ``&amp;``, so already-escaped entities are left untouched.
    """
    def __init__(self, streamer):
        self._streamer = streamer
    def stream(self):
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)
    def __iter__(self):
        with self._streamer.stream() as stream:
            for line in stream:
                # BUG FIX: this previously replaced b' & ' with itself (a
                # no-op), defeating the purpose of the class; the XML parser
                # requires the entity-escaped form.
                yield line.replace(b' & ', b' &amp; ')
class ClueWeb12Docs(WarcDocs):
    """WARC-based document collection for ClueWeb12 (all documents English)."""
    def __init__(self, docs_dlc, chk_dlc=None):
        super().__init__(lang='en')  # all CW12 are english
        self.docs_dlc = docs_dlc
        self.chk_dlc = chk_dlc  # optional checkpoint files for fast seeking
        self._docs_warc_file_counts_cache = None
    def docs_path(self, force=True):
        return self.docs_dlc.path(force)
    def _docs_iter_source_files(self):
        # Deterministic order: sorted section directories, then sorted .gz files.
        section_pattern = os.path.join(self.docs_dlc.path(), 'ClueWeb12_*', '*')
        for section_dir in sorted(glob(section_pattern)):
            yield from sorted(glob(os.path.join(section_dir, '*.gz')))
    def _docs_id_to_source_file(self, doc_id):
        # doc ids look like clueweb12-{sec}-{part}-{doc}; anything else -> None.
        components = doc_id.split('-')
        if len(components) != 4 or components[0] != 'clueweb12':
            return None
        sec, part = components[1], components[2]
        return os.path.join(self.docs_dlc.path(), f'ClueWeb12_{sec[:2]}', sec, f'{sec}-{part}.warc.gz')
    def _docs_source_file_to_checkpoint(self, source_file):
        if self.chk_dlc is None:
            return None
        source = Path(source_file)
        checkpoint = Path(self.chk_dlc.path()) / source.relative_to(Path(self.docs_dlc.path()))
        if checkpoint == source:
            return None
        return f'{checkpoint}.chk.lz4'
    def _docs_warc_file_counts(self):
        # Lazily parse the recordcounts/*.txt files (one per section dir) into
        # a {warc path: record count} map; cached after the first call.
        if self._docs_warc_file_counts_cache is None:
            counts = {}
            for counts_file in glob(os.path.join(self.docs_dlc.path(), 'recordcounts', '*.txt')):
                section = os.path.basename(counts_file)[:-len('_counts.txt')]
                with open(counts_file, 'rt') as fh:
                    for entry in fh:
                        fname, count = entry.strip().split()
                        # entries start with './'; strip it when joining.
                        counts[os.path.join(self.docs_dlc.path(), section, fname[2:])] = int(count)
            self._docs_warc_file_counts_cache = counts
        return self._docs_warc_file_counts_cache
    def docs_namespace(self):
        return NAME
class ClueWeb12b13Extractor:
    """Locates (or instructs the user how to build) the ClueWeb12-B13 subset.

    The subset itself must be produced by the official
    CreateClueWeb12B13Dataset.jar; this class only checks for its presence,
    raises with build instructions when missing, and builds the recordcounts
    cache files the first time the subset is used.
    """
    def __init__(self, docs_dlc, extract_jar_dlc):
        self.docs_dlc = docs_dlc  # download context of the full ClueWeb12 corpus
        self.extract_jar_dlc = extract_jar_dlc  # download context of the official extraction JAR
    def path(self, force=True):
        source_path = self.docs_dlc.path()
        path = f'{source_path}-b13'
        if not force:
            return path
        if os.path.exists(path):
            self._create_record_counts_if_needed(path)
            return path
        # Subset not built yet: tell the user how, and fail.
        extract_path = self.extract_jar_dlc.path()
        message = f'''clueweb12-b13 docs not found. Please either:
 (1) Link docs to {path} if b13 subset already built, or
 (2) Run the following command to build the b13 subset:
 java -jar {extract_path} {source_path}/ {path}/
'''
        _logger.info(message)
        raise RuntimeError(message)
    def _create_record_counts_if_needed(self, path):
        # The official JAR doesn't build up the recordcounts files that we use for jumping ahead.
        # So we will build them ourselves the first time. Luckily, the header of each WARC file
        # in CW12 contains a warc-number-of-documents header, which we can use (avoids reading
        # the entire file). It still takes a little time, but not super long.
        rc_dir = os.path.join(path, 'recordcounts')
        if len(os.listdir(rc_dir)) != 0:
            return  # cache already built
        warc = ir_datasets.lazy_libs.warc()
        with contextlib.ExitStack() as stack, _logger.pbar_raw(desc='building b13 document count cache', unit='file') as pbar:
            for d in glob(os.path.join(path, 'ClueWeb12_??')):
                d = os.path.basename(d)
                # One counts file per section directory, e.g. ClueWeb12_00_counts.txt.
                out = stack.enter_context(ir_datasets.util.finialized_file(f'{rc_dir}/{d}_counts.txt', 'wt'))
                for file in sorted(glob(os.path.join(path, d, '*', '*.warc.gz'))):
                    shortf = file[-24:]  # keep only the trailing '<sec>/<sec>-<part>.warc.gz' portion
                    with gzip.open(file, 'rb') as f, warc.WARCFile(fileobj=f) as warcf:
                        # First record's header carries the document count for the whole file.
                        num_docs = next(iter(warcf)).header['warc-number-of-documents']
                        out.write(f'./{shortf} {num_docs}\n')
                    pbar.update(1)
    def stream(self):
        # This source is directory-based; streaming the corpus is not supported.
        raise NotImplementedError
def _init():
    # Builds and registers the clueweb12 corpus, its B13 subset, and every
    # benchmark defined on top of them. Returns (base, subsets).
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')
    # JAR that produces the B13 subset from the full corpus (see ClueWeb12b13Extractor).
    b13_dlc = Bz2Extract(Cache(TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'), base_path/'CreateClueWeb12B13Dataset.jar'))
    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))
    base = Dataset(collection, documentation('_'))
    subsets['b13'] = Dataset(collection_b13, documentation('b13'))
    # TREC Web Track 2013/2014: ad-hoc qrels for the main subsets; the
    # /diversity variants share topics but use the sub-topic qrels.
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2013/diversity'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecSubQrels(dlc['trec-web-2013/qrels.all'], QREL_DEFS),
        documentation('trec-web-2013/diversity'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['trec-web-2014/diversity'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecSubQrels(dlc['trec-web-2014/qrels.all'], QREL_DEFS),
        documentation('trec-web-2014/diversity'))
    # NTCIR WWW 1-3 run over the B13 subset; WWW-3 has no qrels here.
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path/'ntcir-www-1'/'queries.xml'), qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path/'ntcir-www-2'/'queries.xml'), qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo-2019', lang='en'),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    # CLEF eHealth: one qrels set (2016 + 2017 merged) shared across all query
    # languages; translated subsets carry a language suffix on query ids.
    # NOTE(review): the 2017 understandability judgments come from a file key
    # named 'qreads' (readability) -- presumably the dimension was renamed
    # between 2016 and 2017; confirm against the CLEF eHealth distribution.
    subsets['b13/clef-ehealth'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='en'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS),
        documentation('clef-ehealth'))
    subsets['b13/clef-ehealth/cs'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/cs']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='cs'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-cs'),
        documentation('clef-ehealth/cs'))
    subsets['b13/clef-ehealth/de'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/de']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='de'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-de'),
        documentation('clef-ehealth/de'))
    subsets['b13/clef-ehealth/fr'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/fr']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='fr'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-fr'),
        documentation('clef-ehealth/fr'))
    subsets['b13/clef-ehealth/hu'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/hu']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='hu'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-hu'),
        documentation('clef-ehealth/hu'))
    subsets['b13/clef-ehealth/pl'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/pl']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='pl'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-pl'),
        documentation('clef-ehealth/pl'))
    subsets['b13/clef-ehealth/sv'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/sv']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='sv'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-sv'),
        documentation('clef-ehealth/sv'))
    # NOTE: the following datasets are defined in touche.py:
    #  - clueweb12/touche-2020-task-2
    #  - clueweb12/touche-2021-task-2
    #  - clueweb12/touche-2022-task-2
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/codec.py
================================================
import json
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import Lazy
from ir_datasets.formats import BaseQueries, TrecQrels, JsonlDocs
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQrels
_logger = ir_datasets.log.easy()
NAME = 'codec'  # dataset id used for registration and the local storage path
# Graded relevance definitions for CODEC (note grades 1 and 0 both describe
# non-valuable content; the distinction is on-topic background vs off-topic).
QREL_DEFS = {
    3: 'Very Valuable. Includes central topic-specific arguments, evidence, or knowledge. This does not include general definitions or background.',
    2: 'Somewhat Valuable. Includes valuable topic-specific arguments, evidence, or knowledge.',
    1: 'Not Valuable. Consists of definitions or background.',
    0: 'Not Relevant. Not useful or on topic.',
}
# Per-domain subsets; also used as query-id prefixes (see CodecQueries).
DOMAINS = ['economics', 'history', 'politics']
class CodecDoc(NamedTuple):
    # A CODEC document (title/text/url from the source JSONL corpus).
    doc_id: str
    title: str
    text: str
    url: str
    def default_text(self):
        return f'{self.title} {self.text}'
class CodecQuery(NamedTuple):
    # A CODEC topic; query ids are prefixed with the domain (see DOMAINS).
    query_id: str
    query: str
    domain: str  # one of DOMAINS
    guidelines: str  # annotation guidelines given for the topic
    def default_text(self):
        """
        query
        """
        return self.query
class CodecQueries(BaseQueries):
    """Reads CODEC topics from a single JSON file.

    When qid_filter is given, only queries whose ids start with that prefix
    are yielded (used for the per-domain subsets).
    """
    def __init__(self, streamer, qid_filter=None):
        super().__init__()
        self._streamer = streamer
        self._qid_filter = qid_filter
    def queries_iter(self):
        with self._streamer.stream() as stream:
            topics = json.load(stream)
            for query_id, topic in topics.items():
                if self._qid_filter is not None and not query_id.startswith(self._qid_filter):
                    continue
                yield CodecQuery(query_id, topic['Query'], topic['Domain'], topic['Guidelines'])
    def queries_cls(self):
        return CodecQuery
    def queries_namespace(self):
        return NAME
    def queries_lang(self):
        return 'en'
def filter_qids(domain, queries_handler):
    # Returns a Lazy set of the query ids produced by `queries_handler`.
    # `domain` is unused here: the handler passed in is already filtered to
    # that domain (see _init), so collecting its ids suffices.
    return Lazy(lambda: {q.query_id for q in queries_handler.queries_iter()})
def _init():
    # Wires up CODEC: one shared corpus, a base dataset over all topics, and a
    # query/qrel-filtered subset per domain. Returns (base, subsets).
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    corpus = JsonlDocs(dlc['documents'], doc_cls=CodecDoc, mapping={'doc_id': "id", "title": "title", "text": "contents", "url": "url"}, lang='en', count_hint=729824)
    base = Dataset(
        corpus,
        CodecQueries(dlc['topics']),
        TrecQrels(dlc['qrels'], QREL_DEFS),
        documentation('_'))
    subsets = {}
    for domain in DOMAINS:
        # Query ids are prefixed with the domain, so prefix-filtering the
        # queries and then restricting qrels to those ids yields the subset.
        queries_handler = CodecQueries(dlc['topics'], qid_filter=domain)
        subsets[domain] = Dataset(
            corpus,
            queries_handler,
            FilteredQrels(base.qrels_handler(), filter_qids(domain, queries_handler), mode='include'),
            documentation(domain))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/codesearchnet.py
================================================
import json
import csv
import gzip
from typing import NamedTuple
import io
import itertools
from pathlib import Path
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, ZipExtractCache
from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.datasets.base import Dataset, YamlDocumentation
NAME = 'codesearchnet'  # dataset id used for registration and the local storage path
# Synthetic qrels for the train/valid/test splits: each docstring-query is
# relevant to its own function (see CodeSearchNetQrels).
QREL_DEFS = {
    1: 'Matches docstring',
}
# Expert judgments for the CodeSearchNet Challenge annotations.
QREL_DEFS_CHALLENGE = {
    0: 'Irrelevant',
    1: 'Weak Match',
    2: 'String Match',
    3: 'Exact Match',
}
class CodeSearchNetDoc(NamedTuple):
    # A function from the CodeSearchNet corpus; ids are the function URLs.
    doc_id: str
    repo: str
    path: str  # file path within the repository
    func_name: str
    code: str
    language: str  # programming language, e.g. 'python', 'java'
class CodeSearchNetChallengeQrel(NamedTuple):
    # Expert annotation from the CodeSearchNet Challenge CSV.
    query_id: str
    doc_id: str
    relevance: str  # kept as str, taken verbatim from the CSV 'Relevance' column
    note: str
class CodeSearchNetDocs(BaseDocs):
    """Functions from the CodeSearchNet corpus, across all language archives."""
    def __init__(self, docs_dlcs):
        super().__init__()
        self.docs_dlcs = docs_dlcs  # one extracted archive per programming language
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # Walk each language archive and parse every gzipped JSONL file.
        for dlc in self.docs_dlcs:
            root = Path(dlc.path())
            for gz_file in sorted(root.glob('**/*.gz')):
                with gzip.open(gz_file, 'rt') as fh:
                    for line in fh:
                        record = json.loads(line)
                        yield CodeSearchNetDoc(
                            record['url'],  # doc_id = url
                            record['repo'],
                            record['path'],
                            record['func_name'],
                            record['code'],
                            record['language'],
                        )
    def docs_cls(self):
        return CodeSearchNetDoc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )
    def docs_count(self):
        # Only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return None # not natural languages
class CodeSearchNetQueries(BaseQueries):
    """Docstring queries for one CodeSearchNet split ('train'/'valid'/'test').

    Query ids are the function URLs, matching the synthetic qrels.
    """
    def __init__(self, queries_dlcs, split):
        super().__init__()
        self.queries_dlcs = queries_dlcs
        self.split = split
    def queries_iter(self):
        for dlc in self.queries_dlcs:
            root = Path(dlc.path())
            for gz_file in sorted(root.glob(f'**/{self.split}/*.gz')):
                with gzip.open(gz_file, 'rt') as fh:
                    for line in fh:
                        record = json.loads(line)
                        yield GenericQuery(
                            record['url'], # query_id = url
                            record['docstring'], # text = docstring
                        )
    def queries_cls(self):
        return GenericQuery
    def queries_namespace(self):
        return NAME
    def queries_lang(self):
        return 'en'
class CodeSearchNetQrels(BaseQrels):
    # Synthetic qrels: each function's docstring (the query) is relevant to the
    # function itself, so query_id == doc_id == the function's URL.
    def __init__(self, qrels_dlcs, split):
        super().__init__()
        self.qrels_dlcs = qrels_dlcs
        self.split = split  # 'train', 'valid', or 'test'
    def qrels_iter(self):
        for dlc in self.qrels_dlcs:
            base_path = Path(dlc.path())
            for file in sorted(base_path.glob(f'**/{self.split}/*.gz')):
                with gzip.open(file, 'rt') as f:
                    for line in f:
                        data = json.loads(line)
                        yield TrecQrel(
                            query_id=data['url'],
                            doc_id=data['url'],
                            relevance=1,
                            iteration='0',
                        )
    def qrels_cls(self):
        return TrecQrel
    def qrels_defs(self):
        return QREL_DEFS
    # NOTE(review): `queries_lang` looks like a copy/paste leftover from a
    # queries class (qrels handlers don't normally expose it); kept as-is in
    # case anything calls it.
    def queries_lang(self):
        return 'en'
class CodeSearchNetChallengeQueries(BaseQueries):
    """Natural-language queries for the CodeSearchNet Challenge.

    Query ids are 1-based line numbers within the query file; the first
    (header) line is skipped.
    """
    def __init__(self, queries_dlc):
        super().__init__()
        self.queries_dlc = queries_dlc
    def queries_path(self):
        return self.queries_dlc.path()
    def queries_iter(self):
        with self.queries_dlc.stream() as stream:
            text = io.TextIOWrapper(stream)
            numbered = enumerate(text)
            next(numbered, None)  # skip first (header) line
            for line_no, line in numbered:
                yield GenericQuery(str(line_no), line.rstrip())
    def queries_cls(self):
        return GenericQuery
    def queries_namespace(self):
        return NAME
class CodeSearchNetChallengeQrels(BaseQrels):
    """Expert annotations for the CodeSearchNet Challenge.

    The annotation CSV references queries by their text, not by id, so ids are
    recovered by mapping text back through the queries handler.
    """
    def __init__(self, qrels_dlc, queries_handler):
        super().__init__()
        self.qrels_dlc = qrels_dlc
        self._queries_handler = queries_handler
    def qrels_path(self):
        return self.qrels_dlc.path()
    def qrels_iter(self):
        text_to_qid = {}
        for query in self._queries_handler.queries_iter():
            text_to_qid[query.text] = query.query_id
        with self.qrels_dlc.stream() as stream:
            for row in csv.DictReader(io.TextIOWrapper(stream)):
                yield CodeSearchNetChallengeQrel(
                    query_id=text_to_qid[row['Query']],
                    doc_id=row['GitHubUrl'],
                    relevance=row['Relevance'],
                    note=row['Notes'])
    def qrels_cls(self):
        return CodeSearchNetChallengeQrel
    def qrels_defs(self):
        return QREL_DEFS_CHALLENGE
def _init():
    # Builds one shared corpus over all six language archives, plus
    # train/valid/test splits and the expert-annotated challenge subset.
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    langs = ['python', 'java', 'go', 'php', 'ruby', 'javascript']
    dlcs = {lang: ZipExtractCache(dlc[lang], base_path/lang) for lang in langs}
    all_dlcs = [dlcs[lang] for lang in langs]
    collection = CodeSearchNetDocs(all_dlcs)
    base = Dataset(
        collection,
        documentation('_'),
    )
    subsets['train'] = Dataset(
        collection,
        CodeSearchNetQueries(all_dlcs, 'train'),
        CodeSearchNetQrels(all_dlcs, 'train'),
        documentation('train'),
    )
    subsets['valid'] = Dataset(
        collection,
        CodeSearchNetQueries(all_dlcs, 'valid'),
        CodeSearchNetQrels(all_dlcs, 'valid'),
        documentation('valid'),
    )
    subsets['test'] = Dataset(
        collection,
        CodeSearchNetQueries(all_dlcs, 'test'),
        CodeSearchNetQrels(all_dlcs, 'test'),
        documentation('test'),
    )
    # The challenge qrels need the queries handler to map query text -> id.
    challenge_queries = CodeSearchNetChallengeQueries(dlc['challenge/queries'])
    subsets['challenge'] = Dataset(
        collection,
        challenge_queries,
        CodeSearchNetChallengeQrels(dlc['challenge/qrels'], challenge_queries),
        documentation('challenge'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/cord19.py
================================================
import io
import codecs
import json
import csv
import contextlib
import os
import shutil
import tarfile
from collections import defaultdict
from typing import NamedTuple, Tuple
from pathlib import Path
import ir_datasets
from ir_datasets.util import Lazy, DownloadConfig
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.formats import BaseDocs, TrecXmlQueries, TrecQrels, GenericQuery, GenericQrel
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
NAME = 'cord19'  # dataset id used for registration and the local storage path
_logger = ir_datasets.log.easy()
class Cord19Doc(NamedTuple):
    # Metadata-only CORD-19 record, parsed from metadata.csv.
    doc_id: str  # the cord_uid column
    title: str
    doi: str
    date: str  # the publish_time column
    abstract: str
    def default_text(self):
        """
        title + abstract
        """
        return f'{self.title} {self.abstract}'
class Cord19FullTextSection(NamedTuple):
    # One section of a parsed full-text body (from the body_text JSON entries).
    title: str  # the section heading
    text: str
class Cord19FullTextDoc(NamedTuple):
    # CORD-19 record including the parsed full-text body sections (empty tuple
    # when no parse is available for the document).
    doc_id: str
    title: str
    doi: str
    date: str
    abstract: str
    body: Tuple[Cord19FullTextSection, ...]
    def default_text(self):
        """
        title + abstract + body
        """
        body = ' '.join(f'{b.title} {b.text}' for b in self.body)
        return f'{self.title} {self.abstract} {body}'
# Graded relevance definitions for TREC-COVID.
QRELS_DEFS = {
    2: 'Relevant: the article is fully responsive to the information need as expressed by the topic, i.e. answers the Question in the topic. The article need not contain all information on the topic, but must, on its own, provide an answer to the question.',
    1: 'Partially Relevant: the article answers part of the question but would need to be combined with other information to get a complete answer.',
    0: 'Not Relevant: everything else.',
}
# Maps the TREC-COVID topic XML tags onto query fields.
QTYPE_MAP = {
    'query': 'title',
    'question': 'description',
    'narrative': 'narrative'
}
class Cord19Docs(BaseDocs):
    """Documents from a CORD-19 release.

    Depending on ``include_fulltext``, yields either lightweight Cord19Doc
    records (parsed directly from metadata.csv) or Cord19FullTextDoc records
    that additionally carry the parsed body sections from the document-parse
    archives (which requires extracting the full release archive to disk).
    """
    def __init__(self, streamer, extr_path, date, include_fulltext=False, count_hint=None):
        self._streamer = streamer  # metadata.csv (or the full release archive when include_fulltext)
        self._extr_path = Path(extr_path)  # where the full archive gets extracted
        self._date = date  # release date string, e.g. '2020-07-16'
        self._include_fulltext = include_fulltext
        self._count_hint = count_hint
    def docs_path(self, force=True):
        result = self._streamer.path(force)
        if self._include_fulltext:
            return f'{result}.fulltext'
        return result
    def docs_cls(self):
        return Cord19FullTextDoc if self._include_fulltext else Cord19Doc
    def docs_iter(self):
        return iter(self.docs_store())
    def _docs_iter(self):
        # Source iterator used to build the docstore (see docs_store()).
        if self._include_fulltext:
            # The full-text variant needs the release archive extracted first.
            if not os.path.exists(self._extr_path):
                try:
                    with self._streamer.stream() as stream:
                        mode = 'r|'
                        if self._streamer.path().endswith('.gz'):
                            mode += 'gz'
                        elif self._streamer.path().endswith('.bz2'):
                            mode += 'bz2'
                        with _logger.duration('extracting tarfile'):
                            with tarfile.open(fileobj=stream, mode=mode) as tarf:
                                # NOTE(review): extractall trusts member paths in
                                # the downloaded archive; acceptable for this
                                # curated source, but worth keeping in mind.
                                tarf.extractall(self._extr_path)
                # was a bare `except:`; BaseException keeps the same semantics
                # (clean up partial extraction even on KeyboardInterrupt) while
                # being explicit, and always re-raises.
                except BaseException:
                    shutil.rmtree(self._extr_path)
                    raise
        with contextlib.ExitStack() as ctxt:
            # Sometimes the document parses are in a single big file, sometimes in separate files.
            fulltexts = None
            if self._include_fulltext:
                bigfile = self._extr_path/self._date/'document_parses.tar.gz'
                if bigfile.exists():
                    fulltexts = tarfile.open(fileobj=ctxt.push(bigfile.open('rb')))
                else:
                    # Older releases ship one archive per source collection.
                    fulltexts = {
                        'biorxiv_medrxiv': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'biorxiv_medrxiv.tar.gz').open('rb'))),
                        'comm_use_subset': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'comm_use_subset.tar.gz').open('rb'))),
                        'noncomm_use_subset': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'noncomm_use_subset.tar.gz').open('rb'))),
                        'custom_license': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'custom_license.tar.gz').open('rb'))),
                    }
            if self._include_fulltext:
                csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt'))
            else:
                csv_reader = ctxt.enter_context(self._streamer.stream())
                csv_reader = codecs.getreader('utf8')(csv_reader)
            csv_reader = csv.DictReader(csv_reader)
            for record in csv_reader:
                did = record['cord_uid']
                title = record['title']
                doi = record['doi']
                abstract = record['abstract']
                date = record['publish_time']
                if self._include_fulltext:
                    body = None
                    # Sometimes the document parses are in a single big file, sometimes in separate files.
                    # The metadata format is also different in these cases.
                    if isinstance(fulltexts, dict):
                        # Per-collection archives: prefer the PMC XML parse over the PDF parse.
                        if record['has_pmc_xml_parse']:
                            path = os.path.join(record['full_text_file'], 'pmc_json', record['pmcid'] + '.xml.json')
                            body = json.load(fulltexts[record['full_text_file']].extractfile(path))
                        elif record['has_pdf_parse']:
                            path = os.path.join(record['full_text_file'], 'pdf_json', record['sha'].split(';')[0].strip() + '.json')
                            body = json.load(fulltexts[record['full_text_file']].extractfile(path))
                    elif fulltexts is not None:
                        if record['pmc_json_files']:
                            body = json.load(fulltexts.extractfile(record['pmc_json_files'].split(';')[0]))
                        elif record['pdf_json_files']:
                            body = json.load(fulltexts.extractfile(record['pdf_json_files'].split(';')[0]))
                    if body is not None:
                        if 'body_text' in body:
                            body = tuple(Cord19FullTextSection(b['section'], b['text']) for b in body['body_text'])
                        else:
                            body = tuple() # no body available
                    else:
                        body = tuple() # no body available
                    yield Cord19FullTextDoc(did, title, doi, date, abstract, body)
                else:
                    yield Cord19Doc(did, title, doi, date, abstract)
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )
    def docs_count(self):
        # Only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
def _init():
    # Registers the base CORD-19 corpus (2020-07-16 release), the full-text
    # variants, and the per-round TREC-COVID benchmarks, each tied to the
    # collection snapshot used for that round.
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = Cord19Docs(dlc['docs/2020-07-16/metadata'], base_path/'2020-07-16', '2020-07-16', count_hint=ir_datasets.util.count_hint(f'{NAME}'))
    collection_ft = Cord19Docs(dlc['docs/2020-07-16'], base_path/'2020-07-16.fulltext', '2020-07-16', include_fulltext=True, count_hint=ir_datasets.util.count_hint(f'{NAME}/fulltext'))
    queries = TrecXmlQueries(dlc['trec-covid/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en')
    qrels = TrecQrels(dlc['trec-covid/qrels'], QRELS_DEFS)
    base = Dataset(collection, documentation('_'))
    subsets['trec-covid'] = Dataset(queries, qrels, collection, documentation('trec-covid'))
    subsets['fulltext'] = Dataset(collection_ft, documentation('fulltext'))
    subsets['fulltext/trec-covid'] = Dataset(queries, qrels, collection_ft, documentation('fulltext/trec-covid'))
    # Rounds 1-4 each use the CORD-19 snapshot that was current for the round.
    subsets['trec-covid/round1'] = Dataset(
        Cord19Docs(dlc['docs/2020-04-10/metadata'], base_path/'2020-04-10', '2020-04-10', count_hint=ir_datasets.util.count_hint(f'{NAME}/round1')),
        TrecXmlQueries(dlc['trec-covid/round1/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-covid/round1/qrels'], QRELS_DEFS),
        documentation('trec-covid/round1'))
    subsets['trec-covid/round2'] = Dataset(
        Cord19Docs(dlc['docs/2020-05-01/metadata'], base_path/'2020-05-01', '2020-05-01', count_hint=ir_datasets.util.count_hint(f'{NAME}/round2')),
        TrecXmlQueries(dlc['trec-covid/round2/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-covid/round2/qrels'], QRELS_DEFS),
        documentation('trec-covid/round2'))
    subsets['trec-covid/round3'] = Dataset(
        Cord19Docs(dlc['docs/2020-05-19/metadata'], base_path/'2020-05-19', '2020-05-19', count_hint=ir_datasets.util.count_hint(f'{NAME}/round3')),
        TrecXmlQueries(dlc['trec-covid/round3/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-covid/round3/qrels'], QRELS_DEFS),
        documentation('trec-covid/round3'))
    subsets['trec-covid/round4'] = Dataset(
        Cord19Docs(dlc['docs/2020-06-19/metadata'], base_path/'2020-06-19', '2020-06-19', count_hint=ir_datasets.util.count_hint(f'{NAME}/round4')),
        TrecXmlQueries(dlc['trec-covid/round4/queries'], qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-covid/round4/qrels'], QRELS_DEFS),
        documentation('trec-covid/round4'))
    # Round 5 ran over the final (2020-07-16) snapshot with the full topic set.
    subsets['trec-covid/round5'] = Dataset(
        collection,
        queries,
        TrecQrels(dlc['trec-covid/round5/qrels'], QRELS_DEFS),
        documentation('trec-covid/round5'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/cranfield.py
================================================
import io
import codecs
import itertools
import ir_datasets
from typing import NamedTuple
from ir_datasets.util import DownloadConfig, TarExtract, Cache
from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.datasets.base import Dataset, YamlDocumentation
NAME = 'cranfield'  # dataset id used for registration and the local storage path
# Cranfield's original 5-point scale; note -1 (of no interest) rather than 0.
QREL_DEFS = {
    -1: 'References of no interest.',
    1: 'References of minimum interest, for example, those that have been included from an historical viewpoint.',
    2: 'References which were useful, either as general background to the work or as suggesting methods of tackling certain aspects of the work.',
    3: 'References of a high degree of relevance, the lack of which either would have made the research impracticable or would have resulted in a considerable amount of extra work.',
    4: 'References which are a complete answer to the question.',
}
class CranfieldDoc(NamedTuple):
    # A Cranfield abstract, parsed from the .I/.T/.A/.B/.W sentinel format.
    doc_id: str
    title: str
    text: str
    author: str
    bib: str  # bibliographic reference line (the .B field)
    def default_text(self):
        """
        title + text
        """
        return f'{self.title} {self.text}'
def prefix_sentinel_splitter(it, sentinel):
    """Split an iterable of lines into records delimited by sentinel lines.

    Each line starting with ``sentinel`` begins a new record; the record is
    the sentinel line (with the sentinel prefix removed) followed by all lines
    up to the next sentinel. Improvements over the previous version:
    lines before the first sentinel are ignored instead of raising TypeError;
    a trailing record with no body lines is still yielded; consecutive
    sentinel lines each start their own record; and only the leading
    occurrence of the sentinel is stripped.
    """
    lines = None
    for is_sentinel, group in itertools.groupby(it, lambda line: line.startswith(sentinel)):
        if is_sentinel:
            for sentinel_line in group:
                if lines is not None:
                    yield lines  # previous record had no body lines
                # strip only the leading sentinel, not later occurrences
                lines = [sentinel_line.replace(sentinel, '', 1)]
        elif lines is not None:
            lines += list(group)
            yield lines
            lines = None
        # else: content before the first sentinel -- ignore it
    if lines is not None:
        yield lines  # final record had no body lines
class CranfieldDocs(BaseDocs):
    """Documents of the Cranfield collection, parsed from the '.I/.T/.A/.B/.W'
    prefix-sentinel format of cran.all.1400."""
    def __init__(self, docs_dlc):
        super().__init__()
        self.docs_dlc = docs_dlc
    def docs_path(self, force=True):
        return self.docs_dlc.path(force)
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        with self.docs_dlc.stream() as stream:
            stream = io.TextIOWrapper(stream)
            for lines in prefix_sentinel_splitter(stream, sentinel='.I '):
                record = {'doc_id': '', 'title': '', 'author': '', 'bib': '', 'text': ''}
                field = 'doc_id'
                for line in lines:
                    # Section markers switch which field subsequent lines feed.
                    if line.startswith('.T'):
                        field = 'title'
                    elif line.startswith('.A'):
                        field = 'author'
                    elif line.startswith('.B'):
                        field = 'bib'
                    elif line.startswith('.W'):
                        field = 'text'
                    else:
                        record[field] += line
                record = {k: v.strip() for k, v in record.items()}
                yield CranfieldDoc(**record)
    def docs_cls(self):
        return CranfieldDoc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )
    def docs_count(self):
        # Only known once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()
    # BUG FIX: was `def docs_namespace():` (missing `self`), which raised
    # TypeError whenever called on an instance.
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
class CranfieldQueries(BaseQueries):
    """Queries of the Cranfield collection.

    The file's own .I numbers don't line up with the qrels, so query ids are
    renumbered sequentially (1-based) in file order.
    """
    def __init__(self, queries_dlc):
        super().__init__()
        self.queries_dlc = queries_dlc
    def queries_path(self):
        return self.queries_dlc.path()
    def queries_iter(self):
        with self.queries_dlc.stream() as stream:
            text = io.TextIOWrapper(stream)
            for seq, lines in enumerate(prefix_sentinel_splitter(text, sentinel='.I '), start=1):
                parts = {'query_id': '', 'text': ''}
                target = 'query_id'
                for line in lines:
                    if line.startswith('.W'):
                        target = 'text'
                    else:
                        parts[target] += line
                parts = {key: value.strip() for key, value in parts.items()}
                parts['query_id'] = str(seq) # overwrite query_id to match qrels
                yield GenericQuery(**parts)
    def queries_cls(self):
        return GenericQuery
    def queries_namespace(self):
        return NAME
    def queries_lang(self):
        return 'en'
class CranfieldQrels(BaseQrels):
    """Cranfield relevance judgments: whitespace-separated 'qid did score'
    lines (scores range -1..4; see QREL_DEFS)."""
    def __init__(self, qrels_dlc):
        self._qrels_dlc = qrels_dlc
    def qrels_path(self):
        return self._qrels_dlc.path()
    def qrels_iter(self):
        with self._qrels_dlc.stream() as raw:
            for line in codecs.getreader('utf8')(raw):
                parts = line.rstrip().split()
                if len(parts) != 3:
                    raise RuntimeError(f'expected 3 columns, got {len(parts)}')
                query_id, doc_id, score = parts
                yield TrecQrel(query_id, doc_id, int(score), '0')
    def qrels_cls(self):
        return TrecQrel
    def qrels_defs(self):
        return QREL_DEFS
def _init():
    """Wire up the cranfield dataset components and register them."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    main_dlc = dlc['main']
    # All three parts come from the same tar archive; each extracted member is
    # cached to its own file under the dataset's home directory.
    docs = CranfieldDocs(Cache(TarExtract(main_dlc, 'cran.all.1400'), base_path/'docs.txt'))
    queries = CranfieldQueries(Cache(TarExtract(main_dlc, 'cran.qry'), base_path/'queries.txt'))
    qrels = CranfieldQrels(Cache(TarExtract(main_dlc, 'cranqrel'), base_path/'qrels.txt'))
    base = Dataset(docs, queries, qrels, documentation('_'))
    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])
    return base, subsets

base, subsets = _init()
================================================
FILE: ir_datasets/datasets/csl.py
================================================
from typing import List, NamedTuple
from enum import Enum
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats.trec import TrecQrels
from ir_datasets.formats import JsonlDocs, ExctractedCCQueries, ExctractedCCNoReportQuery
from ir_datasets.util.fileio import TarExtract
NAME = 'csl'
class CslDoc(NamedTuple):
    """A record from the Chinese Scientific Literature (CSL) corpus."""
    doc_id: str
    title: str
    abstract: str
    keywords: List[str]
    category: str
    category_eng: str
    discipline: str
    discipline_eng: str

    def default_text(self):
        """Title and abstract, separated by a newline."""
        return f'{self.title}\n{self.abstract}'
# Graded relevance definitions; same scheme as the HC4 assessments in this
# repository (grade 2 is not used).
QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}
def _init():
    """Register the csl base corpus and its trec-2023 subset."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    docs = JsonlDocs(GzipExtract(dlc['docs']), doc_cls=CslDoc, namespace=NAME, lang='zh', count_hint=395927)
    base = Dataset(docs, documentation('_'))
    trec23_queries = ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang='zh', filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME)
    trec23_qrels = TrecQrels(TarExtract(dlc['trec-2023/qrels'], 'tech_final_qrels.txt'), QREL_DEFS)
    subsets["trec-2023"] = Dataset(
        docs,
        trec23_queries,
        trec23_qrels,
        documentation('trec-2023'),
    )
    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])
    return base, subsets

base, subsets = _init()
================================================
FILE: ir_datasets/datasets/disks45.py
================================================
import ir_datasets
from ir_datasets.util import GzipExtract, TarExtract, Lazy, DownloadConfig
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
NAME = 'disks45'

# Graded relevance used by TREC Robust 2004.
QREL_DEFS = {
    2: 'highly relevant',
    1: 'relevant',
    0: 'not relevant',
}

# TREC-7/8 ad-hoc judgments are binary.
QREL_DEFS_TREC78 = {
    1: 'relevant',
    0: 'not relevant',
}

# Fix: the second half of this string had lost the agreement URL entirely,
# leaving an empty continuation. Restored to the TREC disks 4&5 page.
DUA = ("Please confirm you agree to the TREC data usage agreement found at "
       "https://trec.nist.gov/data/cd45/index.html")
# folds from Huston & Croft 2014
ROBUST04_FOLDS = {
'fold1': {'302', '303', '309', '316', '317', '319', '323', '331', '336', '341', '356', '357', '370', '373', '378', '381', '383', '392', '394', '406', '410', '411', '414', '426', '428', '433', '447', '448', '601', '607', '608', '612', '617', '619', '635', '641', '642', '646', '647', '654', '656', '662', '665', '669', '670', '679', '684', '690', '692', '700'},
'fold2': {'301', '308', '312', '322', '327', '328', '338', '343', '348', '349', '352', '360', '364', '365', '369', '371', '374', '386', '390', '397', '403', '419', '422', '423', '424', '432', '434', '440', '446', '602', '604', '611', '623', '624', '627', '632', '638', '643', '651', '652', '663', '674', '675', '678', '680', '683', '688', '689', '695', '698'},
'fold3': {'306', '307', '313', '321', '324', '326', '334', '347', '351', '354', '358', '361', '362', '363', '376', '380', '382', '396', '404', '413', '415', '417', '427', '436', '437', '439', '444', '445', '449', '450', '603', '605', '606', '614', '620', '622', '626', '628', '631', '637', '644', '648', '661', '664', '666', '671', '677', '685', '687', '693'},
'fold4': {'320', '325', '330', '332', '335', '337', '342', '344', '350', '355', '368', '377', '379', '387', '393', '398', '402', '405', '407', '408', '412', '420', '421', '425', '430', '431', '435', '438', '616', '618', '625', '630', '633', '636', '639', '649', '650', '653', '655', '657', '659', '667', '668', '672', '673', '676', '682', '686', '691', '697'},
'fold5': {'304', '305', '310', '311', '314', '315', '318', '329', '333', '339', '340', '345', '346', '353', '359', '366', '367', '372', '375', '384', '385', '388', '389', '391', '395', '399', '400', '401', '409', '416', '418', '429', '441', '442', '443', '609', '610', '613', '615', '621', '629', '634', '640', '645', '658', '660', '681', '694', '696', '699'}
}
def _init():
    """Build and register the disks45 datasets (Robust04 folds, TREC-7/8 ad-hoc)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    # 'nocr' variant: the globs cover FBIS, FR94, FT and LA Times only --
    # i.e., Disks 4&5 without the Congressional Record.
    collection_nocr = TrecDocs(dlc['docs'],
        path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'],
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME),
        parser='sax',
        docstore_path=base_path/'corpus.nocr.pklz4')
    robust_queries = TrecQueries(GzipExtract(dlc['robust04-queries']), namespace=NAME, lang='en')
    robust_qrels = TrecQrels(dlc['robust04-qrels'], QREL_DEFS)
    base = Dataset(documentation('_'))
    subsets['nocr'] = Dataset(
        collection_nocr,
        documentation('nocr'))
    subsets['nocr/trec-robust-2004'] = Dataset(
        collection_nocr,
        robust_queries,
        robust_qrels,
        documentation('nocr/trec-robust-2004'))
    # Per-fold subsets filter the shared queries/qrels to the Huston & Croft
    # five-fold topic split (make_filter binds each fold lazily).
    for fold in ROBUST04_FOLDS:
        qid_filter = make_filter(fold)
        subsets[f'nocr/trec-robust-2004/{fold}'] = Dataset(
            collection_nocr,
            FilteredQueries(robust_queries, qid_filter),
            FilteredQrels(robust_qrels, qid_filter),
            documentation(f'nocr/trec-robust-2004/{fold}'))
    subsets['nocr/trec8'] = Dataset(
        collection_nocr,
        TrecQrels(TarExtract(dlc['trec8-qrels'], 'qrels.trec8.adhoc.parts1-5'), QREL_DEFS_TREC78),
        TrecQueries(GzipExtract(dlc['trec8-queries']), namespace=NAME, lang='en'),
        documentation('nocr/trec8'))
    subsets['nocr/trec7'] = Dataset(
        collection_nocr,
        # TREC-7 qrels ship as five gzipped parts inside a single tar archive.
        TrecQrels([
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part1.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part2.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part3.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part4.gz')),
            GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part5.gz')),
        ], QREL_DEFS_TREC78),
        TrecQueries(GzipExtract(dlc['trec7-queries']), namespace=NAME, lang='en'),
        documentation('nocr/trec7'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def make_filter(fold):
    """Return a Lazy provider of the qid set for one Robust04 fold.

    Defined as a function (rather than a lambda in the loop above) so that
    `fold` is captured by value for each subset.
    """
    def _fold_qids():
        return ROBUST04_FOLDS[fold]
    return Lazy(_fold_qids)

base, subsets = _init()
================================================
FILE: ir_datasets/datasets/dpr_w100.py
================================================
from typing import NamedTuple, Tuple
import contextlib
import itertools
import ir_datasets
from ir_datasets.util import GzipExtract
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import TsvDocs, BaseQueries, TrecQrels
_logger = ir_datasets.log.easy()

NAME = 'dpr-w100'

# Relevance levels written by DprW100Manager.build from the DPR context lists
# (positive / hard-negative / negative contexts).
QREL_DEFS = {
    2: 'marked by human annotator as containing the answer',
    1: 'contains the answer text and retrieved in the top BM25 results',
    0: '"hard" negative samples',
    -1: 'negative samples'
}
class DprW100Doc(NamedTuple):
    """A 100-word Wikipedia passage from the DPR corpus."""
    doc_id: str
    text: str
    title: str

    def default_text(self):
        """Page title followed by the passage text."""
        return f'{self.title} {self.text}'
class DprW100Query(NamedTuple):
    """A question with its reference answer strings."""
    query_id: str
    text: str
    answers: Tuple[str, ]

    def default_text(self):
        """The question text only (answers are not part of the query text)."""
        return self.text
class DprW100Manager:
    """Builds the queries/qrels files for one DPR retrieval subset.

    Streams the DPR training JSON once (via ijson) and writes a queries.tsv
    file (query_id, question, answer1, answer2, ...) plus a TREC-style qrels
    file derived from the positive/hard-negative/negative context lists.
    """
    def __init__(self, dlc, base_path, passage_id_key='passage_id'):
        self._dlc = dlc
        self._base_path = base_path
        self._base_path.mkdir(parents=True, exist_ok=True)
        # NQ source files use 'passage_id'; TriviaQA files use 'psg_id'
        # (see the manager constructions in _init below).
        self._passage_id_key = passage_id_key
    def build(self):
        """Parse the source JSON and write queries.tsv + qrels. Idempotent."""
        ijson = ir_datasets.lazy_libs.ijson()
        if (self._base_path/'queries.tsv').exists():
            return # already built
        with contextlib.ExitStack() as stack:
            f_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'queries.tsv', 'wt'))
            f_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'qrels', 'wt'))
            stream = stack.enter_context(self._dlc.stream())
            # query ids are simply sequential integers, assigned in file order
            qid_counter = itertools.count()
            for record in _logger.pbar(ijson.items(stream, 'item'), 'building dpr-w100', unit='record'):
                qid = str(next(qid_counter))
                # TSV row: qid, question, answer1, answer2, ...
                # (tabs inside the text are replaced to keep the TSV parseable)
                f_queries.write('\t'.join([
                    qid,
                    record['question'].replace('\t', ' ')
                ] + [
                    a.replace('\t', ' ') for a in record['answers']
                ]) + '\n')
                seen = set()
                # A passage may appear in several context lists for the same
                # query; only its first (highest-relevance) occurrence is kept.
                for ctxt in record['positive_ctxs']:
                    if ctxt[self._passage_id_key] not in seen:
                        seen.add(ctxt[self._passage_id_key])
                        # score == 1000 distinguishes rel-2 positives from
                        # rel-1 (BM25-retrieved) positives; see QREL_DEFS
                        rel = 2 if ctxt['score'] == 1000 else 1
                        f_qrels.write(f'{qid} 0 {ctxt[self._passage_id_key]} {rel}\n')
                for ctxt in record['hard_negative_ctxs']:
                    if ctxt[self._passage_id_key] not in seen:
                        seen.add(ctxt[self._passage_id_key])
                        f_qrels.write(f'{qid} 0 {ctxt[self._passage_id_key]} 0\n')
                for ctxt in record['negative_ctxs']:
                    if ctxt[self._passage_id_key] not in seen:
                        seen.add(ctxt[self._passage_id_key])
                        f_qrels.write(f'{qid} 0 {ctxt[self._passage_id_key]} -1\n')
    def file_ref(self, path):
        """Return a _ManagedDlc for a file under base_path (built on demand)."""
        return _ManagedDlc(self, self._base_path/path)
class _ManagedDlc:
def __init__(self, manager, path):
self._manager = manager
self._path = path
@contextlib.contextmanager
def stream(self):
self._manager.build()
with open(self._path, 'rb') as f:
yield f
def path(self, force=True):
if force:
self._manager.build()
return self._path
class DprW100Queries(BaseQueries):
    """Queries parsed from the manager-built queries.tsv file."""
    def __init__(self, dlc):
        self._dlc = dlc

    def queries_iter(self):
        with self._dlc.stream() as stream:
            for raw_line in stream:
                # columns: query_id, question text, answer1, answer2, ...
                fields = raw_line.decode().strip().split('\t')
                yield DprW100Query(fields[0], fields[1], tuple(fields[2:]))

    def queries_cls(self):
        return DprW100Query

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return 'en'
def _init():
    """Register dpr-w100 and its NQ / TriviaQA train and dev subsets."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TsvDocs(GzipExtract(dlc['docs']), doc_cls=DprW100Doc, namespace=NAME, lang='en', skip_first_line=True, docstore_size_hint=12827215492, count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(
        collection,
        documentation('_'))
    subsets = {}
    # All four subsets share the same build recipe; only the download key,
    # output directory, and the JSON key naming passage ids differ
    # (NQ uses 'passage_id', TriviaQA uses 'psg_id').
    for dlc_key, ds_key, id_key in [
        ('nq-dev', 'natural-questions/dev', 'passage_id'),
        ('nq-train', 'natural-questions/train', 'passage_id'),
        ('tqa-dev', 'trivia-qa/dev', 'psg_id'),
        ('tqa-train', 'trivia-qa/train', 'psg_id'),
    ]:
        manager = DprW100Manager(GzipExtract(dlc[dlc_key]), base_path/dlc_key, passage_id_key=id_key)
        subsets[ds_key] = Dataset(
            collection,
            DprW100Queries(manager.file_ref('queries.tsv')),
            TrecQrels(manager.file_ref('qrels'), QREL_DEFS),
            documentation(ds_key))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets

base, subsets = _init()
================================================
FILE: ir_datasets/datasets/gov.py
================================================
import re
import io
import os
import gzip
import codecs
from collections import Counter
from contextlib import contextmanager, ExitStack
from pathlib import Path
from typing import NamedTuple
from glob import glob
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract
from ir_datasets.formats import TrecQrels, TrecQueries, TrecColonQueries, BaseDocs, GenericQuery, BaseQrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import Docstore, PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()
NAME = 'gov'

# Binary relevance used by the ad-hoc / topic-distillation qrels.
QREL_DEFS = {
    1: 'Relevant',
    0: 'Not Relevant',
}

NAMED_PAGE_QREL_DEFS = {
    1: 'Name refers to this page',
}

# Topic-file field patterns -> query attributes for TrecQueries.
# Fix: the <num>/<title>/<desc> markup tags had been stripped from these
# regex keys (leaving patterns that cannot match TREC topic files); they are
# restored here based on the field each pattern feeds.
NAMED_PAGE_QTYPE_MAP = {
    '<num> *(Number:)? *NP': 'query_id', # Remove NP prefix from QIDs
    '<desc> *(Description:)?': 'text',
}
WEB03_QTYPE_MAP = {
    '<num> *(Number:)? *TD': 'query_id', # Remove TD prefix from QIDs
    '<title>': 'title',
    '<desc> *(Description:)?': 'description',
}
WEB04_QTYPE_MAP = {
    '<num> *(Number:)? *WT04-': 'query_id',
    '<title>': 'text',
}
class GovWeb02Query(NamedTuple):
    """A TREC Web topic with a title and a description."""
    query_id: str
    title: str
    description: str

    def default_text(self):
        """The topic title (the description is not part of the default text)."""
        return self.title
class GovDoc(NamedTuple):
    """A web document from the .GOV corpus (raw body plus HTTP metadata)."""
    doc_id: str
    url: str
    http_headers: str
    body: bytes
    body_content_type: str

    def default_text(self):
        """Title + body text extracted from the raw HTML via the SAX parser."""
        return ir_datasets.util.sax_html_parser(self.body, headers=self.http_headers, fields=[{'title', 'body'}])[0]
class GovDocs(BaseDocs):
    """Documents for the TREC .GOV web corpus.

    Source files are gzipped TREC-web-format bundles: each record is wrapped
    in <DOC>...</DOC> lines with a <DOCNO> id and a <DOCHDR> block carrying
    the URL and HTTP response headers, followed by the raw body bytes.
    """
    def __init__(self, docs_dlc):
        super().__init__()
        self.docs_dlc = docs_dlc

    def docs_path(self, force=True):
        return self.docs_dlc.path(force)

    def docs_iter(self):
        return iter(self.docs_store())

    def _docs_iter(self):
        # iterate source dirs/files in sorted order for a stable doc order
        dirs = sorted(Path(self.docs_dlc.path()).glob('G??'))
        for source_dir in dirs:
            for source_file in sorted(source_dir.glob('*.gz')):
                yield from self._docs_ctxt_iter_gov(source_file)

    def docs_cls(self):
        return GovDoc

    def _docs_ctxt_iter_gov(self, gov2f):
        with ExitStack() as stack:
            if isinstance(gov2f, (str, Path)):
                gov2f = stack.enter_context(gzip.open(gov2f, 'rb'))
            inp = bytearray()
            # incrementally read the input file with read1 -- this ends up being more than twice
            # as fast as reading the input line-by-line and searching for <DOC> and </DOC> lines
            inp.extend(gov2f.read1())
            # Fix: the record delimiters had lost their markup tags (leaving
            # bare b'\n' for both START and END, which cannot delimit records).
            START, END = b'<DOC>\n', b'</DOC>\n'
            while inp != b'':
                inp, next_doc = self._extract_next_block(inp, START, END)
                while next_doc is not None:
                    yield self._process_gov_doc(next_doc)
                    inp, next_doc = self._extract_next_block(inp, START, END)
                inp.extend(gov2f.read1())

    def _process_gov_doc(self, raw_doc):
        # read the file by exploiting the sequence of blocks in the document -- this ends
        # up being several times faster than reading line-by-line
        raw_doc, doc_id = self._extract_next_block(raw_doc, b'<DOCNO>', b'</DOCNO>\n')
        assert doc_id is not None
        doc_id = doc_id.strip().decode()
        doc_body, doc_hdr = self._extract_next_block(raw_doc, b'<DOCHDR>\n', b'</DOCHDR>\n')
        assert doc_hdr is not None
        # latin1 can never fail, so doc_url/doc_hdr are always bound below
        for encoding in ['utf8', 'ascii', 'latin1']:
            try:
                doc_url, doc_hdr = doc_hdr.decode(encoding).split('\n', 1)
                break
            except UnicodeDecodeError:
                continue
        content_type_match = re.search('^content-type:(.*)$', doc_hdr, re.I|re.M)
        content_type = 'text/html' # default to text/html
        if content_type_match:
            content_type = content_type_match.group(1)
        if ';' in content_type:
            # strip charset parameters, e.g. "text/html; charset=UTF-8"
            content_type, _ = content_type.split(';', 1)
        content_type = content_type.strip()
        return GovDoc(doc_id, doc_url, doc_hdr, bytes(doc_body), content_type)

    def _extract_next_block(self, inp, START, END):
        # if START and END appear in inp, then return (everything after END in inp, the content between START and END),
        # or if they don't appear, return (inp, None).
        i_start = inp.find(START)
        i_end = inp.find(END)
        if i_start == -1 or i_end == -1:
            return inp, None
        return inp[i_end+len(END):], inp[i_start+len(START):i_end]

    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )

    def docs_count(self):
        # None until the docstore has been built
        if self.docs_store().built():
            return self.docs_store().count()

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'
def _init():
    """Build and register the gov collection and its TREC Web 2002-2004 subsets."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))
    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002')
    )
    # named-page tasks use GenericQuery with the NP prefix stripped from qids
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page')
    )
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003')
    )
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page')
    )
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004')
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets

base, subsets = _init()
================================================
FILE: ir_datasets/datasets/gov2.py
================================================
import re
import io
import os
import gzip
import codecs
from collections import Counter
from contextlib import contextmanager, ExitStack
from pathlib import Path
from typing import NamedTuple
from glob import glob
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract
from ir_datasets.formats import TrecQrels, TrecQueries, TrecColonQueries, BaseDocs, GenericQuery, BaseQrels, TrecPrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import Docstore
_logger = ir_datasets.log.easy()
NAME = 'gov2'

# Graded relevance for the Terabyte ad-hoc tasks.
QREL_DEFS = {
    2: 'Highly Relevant',
    1: 'Relevant',
    0: 'Not Relevant',
}

NAMED_PAGE_QREL_DEFS = {
    1: 'Relevant',
    0: 'Not Relevant',
}

# Topic-file field patterns -> query attributes for TrecQueries.
# Fix: the <num>/<title> markup tags had been stripped from these regex keys
# (leaving patterns that cannot match TREC topic files); restored here.
NAMED_PAGE_QTYPE_MAP = {
    '<num> *(Number:)? *NP': 'query_id', # Remove NP prefix from QIDs
    '<title> *(Topic:)?': 'text',
}
EFF_MAP_05 = {'751': '1192', '752': '1330', '753': '5956', '754': '6303', '755': '6939', '756': '7553', '757': '8784', '758': '9121', '759': '9266', '760': '10359', '761': '10406', '762': '11597', '763': '12750', '764': '15502', '765': '16895', '766': '17279', '767': '17615', '768': '18050', '769': '18678', '770': '19280', '771': '19963', '772': '20766', '773': '21329', '774': '21513', '775': '23212', '776': '24289', '777': '24781', '778': '24813', '779': '26593', '780': '27428', '781': '28120', '782': '28627', '783': '29561', '784': '33379', '785': '33820', '786': '34135', '787': '35192', '788': '36242', '789': '36530', '790': '36616', '791': '36738', '792': '37111', '793': '41088', '794': '41192', '795': '41506', '796': '44506', '797': '45081', '798': '47993', '799': '48890', '800': '49462'}
EFF_MAP_06 = {'801': '62937', '802': '63569', '803': '63582', '804': '63641', '805': '64227', '806': '64266', '807': '64310', '808': '64642', '809': '64687', '810': '64704', '811': '64723', '812': '64741', '813': '64752', '814': '64938', '815': '65024', '816': '65070', '817': '65222', '818': '65335', '819': '65486', '820': '65504', '821': '65599', '822': '65821', '823': '65826', '824': '65950', '825': '66084', '826': '66409', '827': '66725', '828': '67326', '829': '67531', '830': '67550', '831': '67782', '832': '67961', '833': '68322', '834': '68492', '835': '68967', '836': '69028', '837': '69127', '838': '69401', '839': '69552', '840': '69564', '841': '69935', '842': '70033', '843': '70041', '844': '70285', '845': '70579', '846': '70707', '847': '70751', '848': '70815', '849': '70935', '850': '71136'}
class Gov2Doc(NamedTuple):
    """A web document from the GOV2 corpus (raw body plus HTTP metadata)."""
    doc_id: str
    url: str
    http_headers: str
    body: bytes
    body_content_type: str

    def default_text(self):
        """Title + body text extracted from the raw HTML via the SAX parser."""
        return ir_datasets.util.sax_html_parser(self.body, headers=self.http_headers, fields=[{'title', 'body'}])[0]
class Gov2DocIter:
    """Sliceable iterator over the GOV2 corpus.

    Uses the per-file document counts (gov2_docs._docs_file_counts) to skip
    whole source files when the slice starts past them, instead of
    decompressing every file from the beginning.
    """
    def __init__(self, gov2_docs, slice):
        self.gov2_docs = gov2_docs
        # remaining [start, stop, step) window, in global document indexes
        self.slice = slice
        # global index of the next doc the underlying file iterator will yield
        self.next_index = 0
        self.file_iter = gov2_docs._docs_iter_source_files()
        self.current_file = None
        self.current_file_start_idx = 0
        self.current_file_end_idx = 0
    def __next__(self):
        if self.slice.start >= self.slice.stop:
            raise StopIteration
        # advance until the underlying iterator is positioned at slice.start
        while self.next_index != self.slice.start or self.current_file is None or self.current_file_end_idx <= self.slice.start:
            if self.current_file is None or self.current_file_end_idx <= self.slice.start:
                # First iteration or no docs remaining in this file
                if self.current_file is not None:
                    self.current_file.close()
                    self.current_file = None
                # jump ahead to the file that contains the desired index
                first = True
                while first or self.current_file_end_idx < self.slice.start:
                    source_file = next(self.file_iter)
                    self.next_index = self.current_file_end_idx
                    self.current_file_start_idx = self.current_file_end_idx
                    self.current_file_end_idx = self.current_file_start_idx + self.gov2_docs._docs_file_counts()[source_file]
                    first = False
                self.current_file = self.gov2_docs._docs_ctxt_iter_gov2(source_file)
            else:
                for _ in zip(range(self.slice.start - self.next_index), self.current_file):
                    # The zip here will stop at after either as many docs we must advance, or however
                    # many docs remain in the file. In the latter case, we'll just drop out into the
                    # next iteration of the while loop and pick up the next file.
                    self.next_index += 1
        result = next(self.current_file)
        self.next_index += 1
        # move the window start forward by step for the next __next__ call
        self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
        return result
    def close(self):
        self.file_iter = None
    def __iter__(self):
        return self
    def __del__(self):
        self.close()
    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step] -- compose the sub-slice with the current window
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return Gov2DocIter(self.gov2_docs, new_slice)
        elif isinstance(key, int):
            # it[index] -- a one-element slice, evaluated eagerly
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = Gov2DocIter(self.gov2_docs, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError((self.slice, slice(key, key+1), new_slice))
        raise TypeError('key must be int or slice')
class Gov2Docs(BaseDocs):
    """Documents for the GOV2 web corpus.

    Source files are gzipped TREC-web-format bundles under GOV2_data/GX???:
    each record is wrapped in <DOC>...</DOC> lines with a <DOCNO> id and a
    <DOCHDR> block carrying the URL and HTTP response headers, followed by
    the raw body bytes.
    """
    def __init__(self, docs_dlc, doccount_dlc):
        super().__init__()
        self.docs_dlc = docs_dlc
        self._doccount_dlc = doccount_dlc
        self._docs_file_counts_cache = None

    def docs_path(self, force=True):
        return self.docs_dlc.path(force)

    def _docs_iter_source_files(self):
        # sorted order keeps global document indexes stable across runs
        dirs = sorted((Path(self.docs_dlc.path()) / 'GOV2_data').glob('GX???'))
        for source_dir in dirs:
            for source_file in sorted(source_dir.glob('*.gz')):
                yield str(source_file)

    def docs_iter(self):
        return Gov2DocIter(self, slice(0, self.docs_count()))

    def docs_cls(self):
        return Gov2Doc

    def _docs_ctxt_iter_gov2(self, gov2f):
        with ExitStack() as stack:
            if isinstance(gov2f, (str, Path)):
                gov2f = stack.enter_context(gzip.open(gov2f, 'rb'))
            inp = bytearray()
            # incrementally read the input file with read1 -- this ends up being more than twice
            # as fast as reading the input line-by-line and searching for <DOC> and </DOC> lines
            inp.extend(gov2f.read1())
            # Fix: the record delimiters had lost their markup tags (leaving
            # bare b'\n' for both START and END, which cannot delimit records).
            START, END = b'<DOC>\n', b'</DOC>\n'
            while inp != b'':
                inp, next_doc = self._extract_next_block(inp, START, END)
                while next_doc is not None:
                    yield self._process_gov2_doc(next_doc)
                    inp, next_doc = self._extract_next_block(inp, START, END)
                inp.extend(gov2f.read1())

    def _process_gov2_doc(self, raw_doc):
        # read the file by exploiting the sequence of blocks in the document -- this ends
        # up being several times faster than reading line-by-line
        raw_doc, doc_id = self._extract_next_block(raw_doc, b'<DOCNO>', b'</DOCNO>\n')
        assert doc_id is not None
        doc_id = doc_id.strip().decode()
        doc_body, doc_hdr = self._extract_next_block(raw_doc, b'<DOCHDR>\n', b'</DOCHDR>\n')
        assert doc_hdr is not None
        # latin1 can never fail, so doc_url/doc_hdr are always bound below
        for encoding in ['utf8', 'ascii', 'latin1']:
            try:
                doc_url, doc_hdr = doc_hdr.decode(encoding).split('\n', 1)
                break
            except UnicodeDecodeError:
                continue
        content_type_match = re.search('^content-type:(.*)$', doc_hdr, re.I|re.M)
        content_type = 'text/html' # default to text/html
        if content_type_match:
            content_type = content_type_match.group(1)
        if ';' in content_type:
            # strip charset parameters, e.g. "text/html; charset=UTF-8"
            content_type, _ = content_type.split(';', 1)
        content_type = content_type.strip()
        return Gov2Doc(doc_id, doc_url, doc_hdr, bytes(doc_body), content_type)

    def _extract_next_block(self, inp, START, END):
        # if START and END appear in inp, then return (everything after END in inp, the content between START and END),
        # or if they don't appear, return (inp, None).
        i_start = inp.find(START)
        i_end = inp.find(END)
        if i_start == -1 or i_end == -1:
            return inp, None
        return inp[i_end+len(END):], inp[i_start+len(START):i_end]

    def _docs_id_to_source_file(self, doc_id):
        # doc ids look like GX024-52-0546388 -> file GOV2_data/GX024/52.gz
        parts = doc_id.split('-')
        if len(parts) != 3:
            return None
        s_dir, file, doc = parts
        source_file = os.path.join(self.docs_dlc.path(), 'GOV2_data', s_dir, f'{file}.gz')
        return source_file

    def _docs_file_counts(self):
        # lazily load {source file path: doc count} from the doccounts sidecar
        if self._docs_file_counts_cache is None:
            result = {}
            with self._doccount_dlc.stream() as f:
                f = codecs.getreader('utf8')(f)
                for line in f:
                    path, count = line.strip().split()
                    file = os.path.join(self.docs_dlc.path(), 'GOV2_data', path)
                    result[file] = int(count)
            self._docs_file_counts_cache = result
        return self._docs_file_counts_cache

    def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTIONS):
        docstore = Gov2Docstore(self)
        return ir_datasets.indices.CacheDocstore(docstore, f'{self.docs_path(force=False)}.cache', options=options)

    def docs_count(self):
        return sum(self._docs_file_counts().values())

    def docs_namespace(self):
        return NAME

    def docs_lang(self):
        return 'en'
class Gov2Docstore(Docstore):
    """Docstore that fetches GOV2 docs directly from their source .gz files."""
    def __init__(self, gov2_docs, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(gov2_docs.docs_cls(), 'doc_id', options=options)
        self.gov2_docs = gov2_docs

    def get_many_iter(self, doc_ids):
        # Group the requested ids by the source file that contains them;
        # ids that don't map to a file are silently skipped.
        by_file = {}
        for doc_id in doc_ids:
            source_file = self.gov2_docs._docs_id_to_source_file(doc_id)
            if source_file is not None:
                by_file.setdefault(source_file, []).append(doc_id)
        for source_file, wanted in by_file.items():
            # NOTE(review): assumes docs appear in sorted-id order within a
            # file, so one linear scan matching only the head of the sorted
            # list finds them all -- confirm against the corpus layout.
            pending = sorted(wanted)
            for doc in self.gov2_docs._docs_ctxt_iter_gov2(source_file):
                if pending[0] == doc.doc_id:
                    yield doc
                    pending = pending[1:]
                    if not pending:
                        break # file finished
class RewriteQids(BaseQrels):
    """Wraps another qrels source, remapping selected query ids.

    Used for the Terabyte efficiency tasks, whose topic numbering differs
    from the ad-hoc qrels they reuse.
    """
    def __init__(self, base_qrels, qid_map):
        self._base_qrels = base_qrels
        self._qid_map = qid_map

    def qrels_iter(self):
        qrel_cls = self.qrels_cls()
        mapping = self._qid_map
        for qrel in self._base_qrels.qrels_iter():
            mapped_qid = mapping.get(qrel.query_id)
            if mapped_qid is not None:
                qrel = qrel_cls(mapped_qid, *qrel[1:])
            yield qrel

    def qrels_defs(self):
        return self._base_qrels.qrels_defs()

    def qrels_path(self):
        return self._base_qrels.qrels_path()

    def qrels_cls(self):
        return self._base_qrels.qrels_cls()
class Gov2DocCountFile:
    """Sidecar file mapping each GOV2 source .gz file to its document count.

    Built once from the GOV2_extras/url2id.gz listing; the counts let
    Gov2DocIter skip whole files when slicing.
    """
    def __init__(self, path, docs_dlc):
        self._path = path
        self._docs_dlc = docs_dlc
    def path(self, force=True):
        # Build the doccounts file on first request (only when force=True).
        if force and not os.path.exists(self._path):
            docs_urls_path = os.path.join(self._docs_dlc.path(), 'GOV2_extras/url2id.gz')
            result = Counter()
            with _logger.pbar_raw(desc='building doccounts file', total=25205179, unit='doc') as pbar:
                with gzip.open(docs_urls_path, 'rt') as fin:
                    for line in fin:
                        url, doc_id = line.rstrip().split()
                        d, f, i = doc_id.split('-') # formatted like: GX024-52-0546388
                        # count by relative source-file path, e.g. GX024/52.gz
                        file = f'{d}/{f}.gz'
                        result[file] += 1
                        pbar.update()
            # NOTE(review): finialized_file appears to guard against partial
            # writes -- see ir_datasets.util
            with ir_datasets.util.finialized_file(self._path, 'wt') as fout:
                # sorted output keeps the file deterministic
                for file in sorted(result):
                    fout.write(f'{file}\t{result[file]}\n')
        return self._path
    @contextmanager
    def stream(self):
        with open(self.path(), 'rb') as f:
            yield f
def _init():
    """Build and register gov2 plus its TREC Terabyte / Million Query subsets."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    # per-file counts enable efficient slicing in Gov2DocIter
    doccount_dlc = Gov2DocCountFile(os.path.join(base_path, 'corpus.doccounts'), docs_dlc)
    collection = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(collection, documentation('_'))
    # TREC Terabyte ad-hoc tasks (2004-2006)
    subsets['trec-tb-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2004/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004')
    )
    subsets['trec-tb-2005'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005')
    )
    subsets['trec-tb-2005/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page')
    )
    # efficiency tasks reuse the ad-hoc qrels via the EFF_MAP_* qid remapping
    subsets['trec-tb-2005/efficiency'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-tb-2005/efficiency/queries']), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency')
    )
    subsets['trec-tb-2006'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006')
    )
    subsets['trec-tb-2006/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page')
    )
    subsets['trec-tb-2006/efficiency'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.all'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency')
    )
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.10k'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/10k')
    )
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-1'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream1')
    )
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-2'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream2')
    )
    # stream3 is the only stream subset wired with qrels
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-3'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency/stream3')
    )
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-4'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream4')
    )
    # NOTE(review): 2007 queries omit namespace/lang, unlike 2008 -- verify upstream
    subsets['trec-mq-2007'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2007/queries']), encoding='latin1'),
        TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS),
        documentation('trec-mq-2007')
    )
    subsets['trec-mq-2008'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2008/queries']), encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'), QREL_DEFS),
        documentation('trec-mq-2008')
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets

base, subsets = _init()
================================================
FILE: ir_datasets/datasets/hc4.py
================================================
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries
NAME = 'hc4'
# Number of documents in each language's corpus, passed as exact counts to
# ExctractedCCDocs in _init below.
DOC_COUNTS = {
    'zh': 646305,
    'fa': 486486,
    'ru': 4721064
}
# Graded relevance levels; note that level 2 is not used in these judgments.
QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}
def _init():
    """Build and register the HC4 datasets: a per-language corpus (zh/fa/ru)
    plus train/dev/test splits with topics and qrels for each language."""
    home = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, home)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # Top-level dataset carries only documentation (no docs/queries/qrels).
    base = Dataset(documentation('_'))
    datasets = {}
    for language in ['zh', 'fa', 'ru']:
        corpus = ExctractedCCDocs(dlc[f'{language}/docs'], subset_lang=language, namespace=NAME, count=DOC_COUNTS[language])
        datasets[language] = Dataset(corpus, documentation(language))
        for split in ['train', 'dev', 'test']:
            key = f'{language}/{split}'
            datasets[key] = Dataset(
                corpus,
                ExctractedCCQueries(dlc[f'{split}/topics'], subset_lang=language, namespace=NAME),
                TrecQrels(dlc[f'{language}/{split}/qrels'], QREL_DEFS),
                documentation(key),
            )
    ir_datasets.registry.register(NAME, base)
    for ds_name in sorted(datasets):
        ir_datasets.registry.register(f'{NAME}/{ds_name}', datasets[ds_name])
    return base, datasets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/highwire.py
================================================
import codecs
from typing import NamedTuple, Tuple
from zipfile import ZipFile
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, BaseQrels
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()
# Journal identifiers; HighwireDocs opens one zip archive per entry, in order.
SOURCES = ['ajepidem', 'ajpcell', 'ajpendometa', 'ajpgastro', 'ajpheart', 'ajplung', 'ajprenal', 'alcohol', 'andrology', 'annonc', 'bjanast', 'bjp', 'blood', 'carcinogenesis', 'cercor', 'development', 'diabetes', 'endocrinology', 'euroheartj', 'glycobiology', 'humanrep', 'humolgen', 'ijepidem', 'intimm', 'jantichemo', 'jappliedphysio', 'jbc-1995', 'jbc-1996', 'jbc-1997', 'jbc-1998', 'jbc-1999', 'jbc-2000', 'jbc-2001', 'jbc-2002', 'jbc-2003', 'jbc-2004', 'jbc-2005', 'jcb', 'jclinicalendometa', 'jcs', 'jexpbio', 'jexpmed', 'jgenphysio', 'jgenviro', 'jhistocyto', 'jnci', 'jneuro', 'mcp', 'microbio', 'molbiolevol', 'molendo', 'molhumanrep', 'nar', 'nephrodiatransp', 'peds', 'physiogenomics', 'rheumatolgy', 'rna', 'toxsci']
# TREC Genomics 2006 relevance labels; qrels files store the label string,
# which HighwireQrels maps back to the numeric level.
QREL_DEFS_06 = {
    0: 'NOT',
    1: 'POSSIBLY',
    2: 'DEFINITELY'
}
# TREC Genomics 2007 relevance labels (binary).
QREL_DEFS_07 = {
    0: 'NOT_RELEVANT',
    1: 'RELEVANT',
}
NAME = 'highwire'
class HighwireSpan(NamedTuple):
    # A "legal span" within a document: character offset, length, and the
    # text extracted for that span.
    start: int
    length: int
    text: str
class HighwireDoc(NamedTuple):
    # A Highwire journal article, represented as its title plus legal spans.
    doc_id: str
    journal: str
    title: str
    spans: Tuple[HighwireSpan, ...]
    def default_text(self):
        """
        title + spans
        """
        parts = [self.title]
        parts.extend(span.text for span in self.spans)
        return ' '.join(parts)
class TrecGenomicsQrel(NamedTuple):
    # Raw TREC Genomics relevance record, including the judged passage span.
    query_id: str
    doc_id: str
    span_start: int
    span_len: int
    relevance: int
class HighwireQrel(NamedTuple):
    # Relevance record with the judged span (start offset and length) within
    # the document; yielded by HighwireQrels.qrels_iter.
    query_id: str
    doc_id: str
    start: int
    length: int
    relevance: int
class HighwireDocs(BaseDocs):
    """Documents for the Highwire (TREC Genomics 2006/2007) collection.

    Articles are HTML files stored in per-journal zip archives; a separate
    "legal spans" file lists, per document, the (start, length) spans that
    are eligible passages. Both are walked in lockstep.
    """
    def __init__(self, dlcs, legalspans_dlc):
        # dlcs: mapping of journal name -> zip download; legalspans_dlc: the
        # whitespace-separated "doc_id start length" span file.
        self._dlcs = dlcs
        self._legalspans_dlc = legalspans_dlc
    def docs_iter(self):
        return iter(self.docs_store())
    def _docs_iter(self):
        lxml_html = ir_datasets.lazy_libs.lxml_html()
        def _legalspans_iter():
            # Yields (doc_id, [(start, length), ...]) groups in file order;
            # lines for the same doc are assumed contiguous.
            with self._legalspans_dlc.stream() as f:
                prev_did, spans = None, None
                for line in codecs.getreader('utf8')(f):
                    doc_id, start_idx, length = line.split()
                    if prev_did != doc_id:
                        if prev_did is not None:
                            yield prev_did, spans
                        prev_did, spans = doc_id, []
                    spans.append((int(start_idx), int(length)))
                yield prev_did, spans
        legalspans_iter = _legalspans_iter()
        for source in SOURCES:
            with ZipFile(self._dlcs[source].path(), 'r') as zipf:
                for record in zipf.filelist:
                    doc_id = record.filename.split('/')[-1].split('.')[0]
                    doc_raw = zipf.open(record, 'r').read()
                    # The legal-spans file must list documents in the same
                    # order as the zip archives; the assert enforces this.
                    legalspans_did, legalspans = next(legalspans_iter, None)
                    assert legalspans_did == doc_id
                    spans = tuple(HighwireSpan(s, l, doc_raw[s:s+l]) for s, l in legalspans)
                    # the title should be in the first span inside a element
                    # NOTE(review): the b'' + ... + b'' concatenations below are
                    # no-ops as written; wrapper markup may have been intended
                    # here -- verify against upstream.
                    title = lxml_html.document_fromstring(b'' + spans[0].text + b'')
                    title = title.xpath("//h2")
                    title = title[0].text_content() if title else ''
                    # keep just the text content within each spans
                    spans = tuple(HighwireSpan(s, l, lxml_html.document_fromstring(b'' + t + b'').text_content()) for s, l, t in spans)
                    yield HighwireDoc(doc_id, source, title, spans)
    def docs_path(self, force=True):
        return ir_datasets.util.home_path()/NAME/'corpus'
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )
    def docs_cls(self):
        return HighwireDoc
    def docs_namespace(self):
        return NAME
    def docs_count(self):
        # Only known once the docstore has been fully built; otherwise None.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_lang(self):
        return 'en'
class TrecGenomicsQueries(BaseQueries):
    """Parses TREC Genomics topic files: one cp1252-encoded query per line,
    with bracketed category placeholder tokens replaced by plain text."""
    # Placeholder token -> plain-text replacement, applied to each query.
    # (Refactored from a 14-call .replace() chain; tokens are disjoint, so
    # sequential application is order-independent and equivalent.)
    _PLACEHOLDERS = [
        ('[ANTIBODIES]', 'antibodies'),
        ('[BIOLOGICAL SUBSTANCES]', 'biological substances'),
        ('[CELL OR TISSUE TYPES]', 'cell or tissue types'),
        ('[DISEASES]', 'diseases'),
        ('[DRUGS]', 'drugs'),
        ('[GENES]', 'genes'),
        ('[MOLECULAR FUNCTIONS]', 'molecular functions'),
        ('[MUTATIONS]', 'mutations'),
        ('[PATHWAYS]', 'pathways'),
        ('[PROTEINS]', 'proteins'),
        ('[SIGNS OR SYMPTOMS]', 'signs or symptoms'),
        ('[STRAINS]', 'strains'),
        ('[TOXICITIES]', 'toxicities'),
        ('[TUMOR TYPES]', 'tumor types'),
    ]
    def __init__(self, queries_dlc):
        self._queries_dlc = queries_dlc
    def queries_iter(self):
        """Yield GenericQuery records, skipping blank lines."""
        with self._queries_dlc.stream() as f:
            for line in codecs.getreader('cp1252')(f):
                if line.strip() == '':
                    continue
                # Characters 1-3 hold the topic id; the text starts at char 5.
                # (Fix: renamed from the misleading `doc_id` -- this is a query id.)
                query_id, text = line[1:4], line[5:].rstrip()
                for token, replacement in self._PLACEHOLDERS:
                    text = text.replace(token, replacement)
                yield GenericQuery(query_id, text)
    def queries_cls(self):
        return GenericQuery
    def queries_namespace(self):
        return 'trec-genomics'
    def queries_lang(self):
        return 'en'
class HighwireQrels(BaseQrels):
    """Qrels with passage spans (start offset and length within the doc).

    The qrels files store the relevance as a label string (e.g. 'DEFINITELY');
    it is mapped back to its numeric level via the reversed defs mapping.
    """
    def __init__(self, qrels_dlc, qrel_defs):
        self._qrels_dlc = qrels_dlc
        self._qrel_defs = qrel_defs
    def qrels_iter(self):
        # Fix: renamed from the typo'd `rev_devs` -- this is the reverse of
        # the qrel *defs* mapping (label string -> numeric level).
        rev_defs = {v: k for k, v in self._qrel_defs.items()}
        with self._qrels_dlc.stream() as f:
            for line in codecs.getreader('utf8')(f):
                if line.startswith('#') or line.strip() == '':
                    continue
                cols = line.split()
                if len(cols) == 6: # 2006 format (5th column unused)
                    query_id, doc_id, start, length, _, rel_str = cols
                elif len(cols) == 5: # 2007 format (fix: was mislabeled "2006")
                    query_id, doc_id, start, length, rel_str = cols
                else:
                    # Fix: include the offending column count in the error.
                    raise RuntimeError(f'error parsing qrels file: expected 5 or 6 columns, got {len(cols)}')
                yield HighwireQrel(query_id, doc_id, int(start), int(length), rev_defs[rel_str])
    def qrels_defs(self):
        return self._qrel_defs
    def qrels_path(self):
        return self._qrels_dlc.path()
    def qrels_cls(self):
        return HighwireQrel
def _init():
    """Build and register the highwire datasets: the shared document
    collection plus the TREC Genomics 2006 and 2007 benchmarks."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    # Both years share the same document collection and legal-spans file.
    collection = HighwireDocs(dlc, dlc['legalspans'])
    base = Dataset(collection, documentation('_'))
    subsets['trec-genomics-2006'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2006/queries']),
        HighwireQrels(dlc['trec-genomics-2006/qrels'], QREL_DEFS_06),
        documentation('trec-genomics-2006'),
    )
    subsets['trec-genomics-2007'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2007/queries']),
        HighwireQrels(dlc['trec-genomics-2007/qrels'], QREL_DEFS_07),
        documentation('trec-genomics-2007'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/istella22.py
================================================
import json
import codecs
from typing import NamedTuple, Dict, List
import ir_datasets
from ir_datasets.util import TarExtract, TarExtractAll, RelativePath, GzipExtract, Lazy
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredQrels
from ir_datasets.formats import JsonlDocs, JsonlQueries, TrecQrels
from ir_datasets.indices import PickleLz4FullStore
_logger = ir_datasets.log.easy()
class Istella22Doc(NamedTuple):
    # A document from the Istella22 collection, with a language tag and the
    # percentage of the document detected to be in that language.
    doc_id: str
    title: str
    url: str
    text: str
    extra_text: str
    lang: str
    lang_pct: int
    def default_text(self):
        """
        title + text + extra_text
        """
        return ' '.join((self.title, self.text, self.extra_text))
NAME = 'istella22'
# Graded relevance from 1 (least) to 4 (perfectly relevant); note no level 0.
QREL_DEFS = {1: 'Least relevant', 2: 'Somewhat relevant', 3: 'Mostly relevant', 4: 'Perfectly relevant'}
# Data usage agreement shown to users before download.
# NOTE(review): the licence URL at the end of this string is empty -- it
# appears to have been lost; it should point to the Istella22 Licence Agreement.
DUA = ("To use the Istella22 dataset, you must read and accept the Istella22 Licence Agreement, found here: "
       "")
def _init():
    """Build and register the Istella22 datasets: the document collection,
    the test topics/qrels, and five pre-defined folds of the test queries."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_dlc = TarExtractAll(dlc['source'], base_path/'istella22_extracted')
    docs = JsonlDocs(GzipExtract(RelativePath(base_dlc, 'istella22/docs.jsonl.gz')), doc_cls=Istella22Doc, lang=None, count_hint=8421456)
    test_queries = JsonlQueries(GzipExtract(RelativePath(base_dlc, 'istella22/queries.test.jsonl.gz')), lang='it')
    test_qrels = TrecQrels(GzipExtract(RelativePath(base_dlc, 'istella22/qrels.test.gz')), QREL_DEFS)
    base = Dataset(
        docs,
        documentation('_'))
    subsets = {}
    subsets['test'] = Dataset(
        docs,
        test_queries,
        test_qrels,
        documentation('test'))
    for fold in ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']:
        # Fold membership is resolved lazily, only when the subset is used.
        fold_qids = Lazy(fold_qids_factory(fold, base_dlc))
        subsets[f'test/{fold}'] = Dataset(
            docs,
            FilteredQueries(test_queries, fold_qids, mode='include'),
            FilteredQrels(test_qrels, fold_qids, mode='include'),
            # NOTE(review): folds reuse the 'test' documentation key rather
            # than a per-fold key -- confirm this is intentional.
            documentation('test'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def fold_qids_factory(fold, base_dlc):
    """Return a no-argument callable that loads the query ids of one test fold
    from the folds tarball (one id per line)."""
    def wrapped():
        member_name = f'./test.queries.{fold}'
        source = TarExtract(RelativePath(base_dlc, 'istella22/queries.test.folds.tar.gz'), member_name)
        with source.stream() as fh:
            # ids are newline-separated; strip whitespace and leading zeros so
            # they match the query ids used elsewhere
            return [raw.decode().strip().lstrip('0') for raw in fh]
    return wrapped
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/kilt.py
================================================
import json
import codecs
from typing import NamedTuple, Tuple
import ir_datasets
from ir_datasets.util import TarExtractAll, Cache, RelativePath, Lazy, Migrator
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQrels
from ir_datasets.formats import BaseDocs, TrecQrels
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.datasets import codec
_logger = ir_datasets.log.easy()
NAME = 'kilt'
# Graded relevance levels for the CODEC entity judgments over this corpus.
CODEC_QREL_DEFS = {
    3: 'Very Valuable. It is absolutely critical to understand what this entity is for understanding this topic.',
    2: 'Somewhat valuable. It is important to understand what this entity is for understanding this topic.',
    1: 'Not Valuable. It is useful to understand what this entity is for understanding this topic.',
    0: 'Not Relevant. This entity is not useful or on topic.',
}
class KiltDocAnchor(NamedTuple):
    # A hyperlink within a document: anchor text, target, and its position
    # (paragraph index plus character start/end offsets within it).
    text: str
    href: str
    paragraph_id: int
    start: int
    end: int
class KiltDoc(NamedTuple):
    # A Wikipedia page from the KILT knowledge source.
    doc_id: str  # the wikipedia_id
    title: str
    text: str  # full text with Section::::/BULLET::::- markers stripped
    text_pieces: Tuple[str, ...]  # the original, unmodified text pieces
    anchors: Tuple[KiltDocAnchor, ...]
    categories: Tuple[str, ...]
    wikidata_id: str  # empty string when no wikidata_info is present
    history_revid: str
    history_timestamp: str
    history_parentid: str
    history_pageid: str
    history_url: str
    def default_text(self):
        """
        title + text
        """
        return f'{self.title} {self.text}'
def strip_markup(text):
    """Convert KILT section/bullet markers in a text piece to plain text.

    Section headings lose their marker and have remaining colons turned into
    spaces; bullet markers become a plain dash; other text passes through.
    """
    section_marker = 'Section::::'
    bullet_marker = 'BULLET::::-'
    if text.startswith(section_marker):
        return text.replace(section_marker, '').replace(':', ' ')
    if text.startswith(bullet_marker):
        return text.replace(bullet_marker, '-')
    return text
class KiltDocs(BaseDocs):
    """Documents from the KILT Wikipedia knowledge source (JSONL)."""
    def __init__(self, streamer, count_hint=None):
        # streamer: download providing the raw JSONL knowledge source
        super().__init__()
        self._streamer = streamer
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        for doc in self.docs_kilt_raw_iter():
            yield KiltDoc(
                doc['wikipedia_id'],
                doc['wikipedia_title'],
                # full text with section/bullet markers stripped
                ''.join(strip_markup(t) for t in doc['text']),
                tuple(doc['text']),
                tuple(KiltDocAnchor(
                    a['text'],
                    a['href'],
                    a['paragraph_id'],
                    a['start'],
                    a['end']) for a in doc['anchors']),
                tuple(doc['categories'].split(',')),
                # wikidata_info may be absent; default to an empty id
                doc.get('wikidata_info', {}).get('wikidata_id', ''),
                str(doc['history']['revid']),
                doc['history']['timestamp'],
                str(doc['history']['parentid']),
                str(doc['history']['pageid']),
                doc['history']['url'],
            )
    def docs_cls(self):
        return KiltDoc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )
    def docs_count(self):
        # Only known once the docstore has been fully built; otherwise None.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
    def docs_kilt_raw_iter(self):
        """Yield the raw (un-converted) JSON records of the knowledge source."""
        with self._streamer.stream() as stream:
            for doc in stream:
                yield json.loads(doc)
def _init():
    """Build and register the KILT datasets: the Wikipedia knowledge source
    plus the CODEC entity-ranking judgments (overall and per domain)."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    corpus = KiltDocs(dlc['knowledgesource'], count_hint=5903530)
    base = Dataset(
        corpus,
        documentation('_'))
    subsets = {}
    # CODEC queries come from the codec module; only the entity qrels are
    # specific to this corpus.
    subsets['codec'] = Dataset(
        corpus,
        codec.base.queries_handler(),
        TrecQrels(dlc['codec/qrels'], CODEC_QREL_DEFS),
        documentation('codec'))
    for domain in codec.DOMAINS:
        queries_handler = codec.subsets[domain]
        # Per-domain subsets filter the shared qrels to that domain's queries.
        subsets[f'codec/{domain}'] = Dataset(
            corpus,
            queries_handler,
            FilteredQrels(subsets['codec'].qrels_handler(), codec.filter_qids(domain, queries_handler), mode='include'),
            documentation(f'codec/{domain}'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/lotte.py
================================================
import json
import codecs
from typing import NamedTuple, Dict, List
import ir_datasets
from ir_datasets.util import TarExtractAll, Cache, RelativePath, Lazy, Migrator
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.formats import TsvDocs, TsvQueries, BaseQrels, GenericDoc, GenericQuery, TrecQrel
from ir_datasets.indices import PickleLz4FullStore
_logger = ir_datasets.log.easy()
NAME = 'lotte'
# Single positive relevance level; LotteQrels emits one level-1 judgment per
# answer passage id listed in the qas JSONL files.
QRELS_DEFS = {1: 'Answer upvoted or accepted on stack exchange'}
class LotteQrels(BaseQrels):
    """Qrels derived from LoTTE's JSONL question files: each answer passage id
    listed for a question becomes a relevance-1 judgment."""
    def __init__(self, qrels_dlc):
        self._qrels_dlc = qrels_dlc
    def qrels_path(self):
        return self._qrels_dlc.path()
    def qrels_iter(self):
        with self._qrels_dlc.stream() as fobj:
            for raw_line in fobj:
                record = json.loads(raw_line)
                qid = str(record['qid'])
                for answer_pid in record['answer_pids']:
                    yield TrecQrel(qid, str(answer_pid), 1, "0")
    def qrels_cls(self):
        return TrecQrel
    def qrels_defs(self):
        return QRELS_DEFS
def _init():
    """Build and register the LoTTE datasets: per-domain/split corpora with
    search- and forum-style query sets and their derived qrels."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_dlc = TarExtractAll(dlc['source'], base_path/'lotte_extracted')
    # Top-level dataset carries only documentation.
    base = Dataset(documentation('_'))
    subsets = {}
    # Fix: was a list of 1-tuples unpacked via `for (domain,) in domains`;
    # a plain list of strings expresses the same thing directly.
    domains = ['lifestyle', 'recreation', 'science', 'technology', 'writing', 'pooled']
    for domain in domains:
        for split in ['dev', 'test']:
            corpus = TsvDocs(RelativePath(base_dlc, f'lotte/{domain}/{split}/collection.tsv'), lang='en')
            subsets[f'{domain}/{split}'] = Dataset(
                corpus,
                documentation(f'{domain}/{split}')
            )
            for qtype in ['search', 'forum']:
                subsets[f'{domain}/{split}/{qtype}'] = Dataset(
                    corpus,
                    TsvQueries(RelativePath(base_dlc, f'lotte/{domain}/{split}/questions.{qtype}.tsv'), lang='en'),
                    LotteQrels(RelativePath(base_dlc, f'lotte/{domain}/{split}/qas.{qtype}.jsonl')),
                    documentation(f'{domain}/{split}/{qtype}')
                )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/medline.py
================================================
import codecs
import itertools
import io
import gzip
from contextlib import ExitStack
import itertools
from typing import NamedTuple, Tuple
import tarfile
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, TrecQrels, TrecXmlQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from .highwire import TrecGenomicsQueries
_logger = ir_datasets.log.easy()
# Relevance levels shared by the TREC Genomics and TREC PM tasks below.
QREL_DEFS = {
    0: 'not relevant',
    1: 'possibly relevant',
    2: 'definitely relevant'
}
# Maps TREC Genomics 2004 topic XML tags to TrecGenomicsQuery field names.
TREC04_XML_MAP = {
    'ID': 'query_id',
    'TITLE': 'title',
    'NEED': 'need',
    'CONTEXT': 'context',
}
NAME = 'medline'
class MedlineDoc(NamedTuple):
    # A Medline citation: the PMID plus the article title and abstract.
    doc_id: str
    title: str
    abstract: str
    def default_text(self):
        """
        title + abstract
        """
        return ' '.join((self.title, self.abstract))
class TrecGenomicsQuery(NamedTuple):
    # A TREC Genomics 2004 topic: id, title, information need, and context
    # (field names populated via TREC04_XML_MAP).
    query_id: str
    title: str
    need: str
    context: str
    def default_text(self):
        """
        title
        """
        return self.title
class TrecPm2017Query(NamedTuple):
    # A TREC Precision Medicine 2017 topic; unlike 2018, it includes an
    # "other" field.
    query_id: str
    disease: str
    gene: str
    demographic: str
    other: str
    def default_text(self):
        """
        disease, gene, demographic, and other
        """
        return f'{self.disease} {self.gene} {self.demographic} {self.other}'
class TrecPmQuery(NamedTuple):
    # A TREC Precision Medicine (2018) topic: disease, gene, demographic.
    query_id: str
    disease: str
    gene: str
    demographic: str
    def default_text(self):
        """
        disease, gene, and demographic
        """
        return ' '.join((self.disease, self.gene, self.demographic))
class ConcatFile:
    """
    Simulates a sequence of file-like objects that are cat'd.
    Only supports read operations.
    """
    def __init__(self, files):
        # Accept any iterable of file-like objects (iter() is a no-op on an
        # iterator, so existing callers passing iterators are unaffected).
        self.file_iter = iter(files)
        # Fix: use the 2-arg next() so an empty sequence yields an exhausted
        # ConcatFile instead of raising StopIteration from the constructor.
        self.file = next(self.file_iter, None)
    def read(self, count=None):
        """Read up to `count` bytes (all remaining if None), advancing to the
        next file when the current one is exhausted; b'' signals overall EOF."""
        result = b''
        while not result and self.file is not None:
            result = self.file.read(count)
            if not result:
                self.file = next(self.file_iter, None)
        return result
class MedlineDocs(BaseDocs):
    """Medline citations parsed from the XML distributions.

    Supports two editions with different packaging ('2004' and '2017'); see
    docs_iter for how each is unpacked.
    """
    def __init__(self, name, dlcs, count_hint=None):
        # name: edition id ('2004' or '2017'); dlcs: sequence of downloads
        self._name = name
        self._dlcs = dlcs
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        ET = ir_datasets.lazy_libs.xml_etree()
        with ExitStack() as stack:
            if self._name == '2004':
                # The files for 2004 are a large XML file that's split internally.
                # Simulate one big file for the parser below.
                EOF = io.BytesIO(b'\n')
                files = [ConcatFile(itertools.chain(
                    (stack.enter_context(dlc.stream()) for dlc in self._dlcs),
                    (EOF,)
                ))]
            elif self._name == '2017':
                # The files for 2017 are individual files in a big tar file. Generate
                # a file for each.
                def _files():
                    for dlc in self._dlcs:
                        with dlc.stream() as f:
                            tarf = stack.enter_context(tarfile.open(fileobj=f, mode=f'r|gz'))
                            for r in tarf:
                                if r.isfile() and r.name.endswith('.gz'):
                                    yield gzip.GzipFile(fileobj=tarf.extractfile(r), mode='r')
                files = _files()
            else:
                raise ValueError(f'unknown {self._name}')
            for file in files:
                # Incremental parse keeps memory bounded over very large files.
                for _, el in ET.iterparse(file, events=['end']):
                    if el.tag == 'MedlineCitation':
                        doc_id = el.find('.//PMID').text
                        title = el.find('.//ArticleTitle')
                        abstract = el.find('.//AbstractText')
                        # Title/abstract may be absent; fall back to ''.
                        yield MedlineDoc(doc_id, title.text if title is not None else '', abstract.text if abstract is not None else '')
                    if el.tag in ('PubmedArticle', 'MedlineCitation'):
                        el.clear() # so we don't need to keep it all in memory
    def docs_path(self, force=True):
        return ir_datasets.util.home_path()/NAME/self._name/'corpus'
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            size_hint=15900069519,
            count_hint=self._count_hint,
            options=options
        )
    def docs_cls(self):
        return MedlineDoc
    def docs_namespace(self):
        return NAME
    def docs_count(self):
        # Only known once the docstore has been fully built; otherwise None.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_lang(self):
        return 'en'
class AacrAscoDocs(BaseDocs):
    """Extra AACR/ASCO abstracts for the 2017 collection, read from a tarball
    of one plain-text file per document (meeting line, Title: block, abstract).
    """
    def __init__(self, dlc):
        self._dlc = dlc
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # Fix: mode was the pointless f-string f'r|gz'; 'r|gz' streams the
        # gzipped tar sequentially without extracting to disk.
        with self._dlc.stream() as f, tarfile.open(fileobj=f, mode='r|gz') as tarf:
            for file in tarf:
                if not file.isfile():
                    continue
                file_reader = tarf.extractfile(file)
                file_reader = codecs.getreader('utf8')(file_reader)
                doc_id = file.name.split('/')[-1].split('.')[0]
                meeting = next(file_reader)  # first line: meeting info (not stored)
                # The title runs until a blank line (accumulated text ends '\n\n').
                title = ''
                for line in file_reader:
                    title += line
                    if title.endswith('\n\n'):
                        break
                assert title.startswith('Title:')
                title = title[len('Title:'):].strip()
                # The remainder of the file is the abstract.
                abstract = file_reader.read().strip()
                yield MedlineDoc(doc_id, title, abstract)
    def docs_path(self, force=True):
        return ir_datasets.util.home_path()/NAME/'2017'/'corpus'
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            options=options
        )
    def docs_cls(self):
        return MedlineDoc
    def docs_namespace(self):
        return NAME
    def docs_count(self):
        # Only known once the docstore has been fully built; otherwise None.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_lang(self):
        return 'en'
class ConcatDocs(BaseDocs):
    """Presents several BaseDocs sources as one concatenated collection."""
    def __init__(self, docs, count_hint=None):
        # docs: sequence of BaseDocs; the first provides cls/namespace/lang/path
        self._docs = docs
        self._count_hint = count_hint
    # Fix: the original defined docs_iter twice; the first (undecorated)
    # definition was immediately shadowed by this one and has been removed.
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        for docs in self._docs:
            yield from docs.docs_iter()
    def docs_path(self, force=True):
        return f'{self._docs[0].docs_path(force)}.concat'
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path(force=False)}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options
        )
    def docs_cls(self):
        return self._docs[0].docs_cls()
    def docs_namespace(self):
        return self._docs[0].docs_namespace()
    def docs_lang(self):
        return self._docs[0].docs_lang()
    def docs_count(self):
        # Only known once the docstore has been fully built; otherwise None.
        if self.docs_store().built():
            return self.docs_store().count()
def _init():
    """Build and register the medline datasets: the 2004 and 2017 editions
    plus their TREC Genomics / Precision Medicine benchmarks."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    base = Dataset(documentation('_'))
    collection04 = MedlineDocs('2004', [GzipExtract(dlc['2004/a']), GzipExtract(dlc['2004/b']), GzipExtract(dlc['2004/c']), GzipExtract(dlc['2004/d'])], count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))
    subsets['2004'] = Dataset(collection04, documentation('2004'))
    subsets['2004/trec-genomics-2004'] = Dataset(
        collection04,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics', lang='en'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['2004/trec-genomics-2005'] = Dataset(
        collection04,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )
    # The 2017 collection is medline plus the extra AACR/ASCO abstracts.
    collection17 = ConcatDocs([
        AacrAscoDocs(dlc['2017/aacr_asco_extra']),
        MedlineDocs('2017', [dlc['2017/part1'], dlc['2017/part2'], dlc['2017/part3'], dlc['2017/part4'], dlc['2017/part5']]),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(collection17, documentation('2017'))
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2017/queries'], qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en'),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017'),
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2018/queries'], qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en'),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/miracl.py
================================================
import ir_datasets
from typing import NamedTuple
from ir_datasets.util import DownloadConfig, GzipExtract
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import JsonlDocs, TsvQueries, TrecQrels, TrecScoredDocs
NAME = 'miracl'
_logger = ir_datasets.log.easy()
# Binary relevance levels used by all MIRACL qrels.
QRELS_DEFS = {
    0: 'Not Relevant',
    1: 'Relevant',
}
class MiraclDoc(NamedTuple):
    # A MIRACL passage: article title plus passage text.
    doc_id: str
    title: str
    text: str
    def default_text(self):
        """title + text"""
        return ' '.join((self.title, self.text))
def _init():
    """Build and register the MIRACL datasets: one corpus per language, with
    train/dev and (where released) test-a/test-b topic sets."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    # (language code, number of corpus shard files, available topic splits)
    langs = [
        ('ar', 5, {'train', 'dev', 'test-a', 'test-b'}),
        ('bn', 1, {'train', 'dev', 'test-a', 'test-b'}),
        ('de', 32, {'dev', 'test-b'}),
        ('en', 66, {'train', 'dev', 'test-a', 'test-b'}),
        ('es', 21, {'train', 'dev', 'test-b'}),
        ('fa', 5, {'train', 'dev', 'test-b'}),
        ('fi', 4, {'train', 'dev', 'test-a', 'test-b'}),
        ('fr', 30, {'train', 'dev', 'test-b'}),
        ('hi', 2, {'train', 'dev', 'test-b'}),
        ('id', 3, {'train', 'dev', 'test-a', 'test-b'}),
        ('ja', 14, {'train', 'dev', 'test-a', 'test-b'}),
        ('ko', 3, {'train', 'dev', 'test-a', 'test-b'}),
        ('ru', 20, {'train', 'dev', 'test-a', 'test-b'}),
        ('sw', 1, {'train', 'dev', 'test-a', 'test-b'}),
        ('te', 2, {'train', 'dev', 'test-a', 'test-b'}),
        ('th', 2, {'train', 'dev', 'test-a', 'test-b'}),
        ('yo', 1, {'dev', 'test-b'}),
        ('zh', 10, {'train', 'dev', 'test-b'}),
    ]
    for lang, n_doc_files, topic_sets in langs:
        collection = JsonlDocs(
            [GzipExtract(dlc[f'v1.0/{lang}/corpus/{i}']) for i in range(n_doc_files)],
            doc_cls=MiraclDoc,
            mapping={'doc_id': 'docid', 'title': 'title', 'text': 'text'},
            namespace=f'{NAME}/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'),
            docstore_path=base_path/'v1.0'/lang/'docs.pklz4')
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        if 'train' in topic_sets:
            subsets[f'{lang}/train'] = Dataset(
                collection,
                TsvQueries(dlc[f'v1.0/{lang}/train/topics'], namespace=f'{NAME}/{lang}', lang=lang),
                TrecQrels(dlc[f'v1.0/{lang}/train/qrels'], QRELS_DEFS),
                documentation(f'{lang}/train'))
        if 'dev' in topic_sets:
            subsets[f'{lang}/dev'] = Dataset(
                collection,
                TsvQueries(dlc[f'v1.0/{lang}/dev/topics'], namespace=f'{NAME}/{lang}', lang=lang),
                TrecQrels(dlc[f'v1.0/{lang}/dev/qrels'], QRELS_DEFS),
                documentation(f'{lang}/dev'))
        # The test splits provide topics only (no public qrels).
        if 'test-a' in topic_sets:
            subsets[f'{lang}/test-a'] = Dataset(
                collection,
                TsvQueries(dlc[f'v1.0/{lang}/test-a/topics'], namespace=f'{NAME}/{lang}', lang=lang),
                documentation(f'{lang}/test-a'))
        if 'test-b' in topic_sets:
            subsets[f'{lang}/test-b'] = Dataset(
                collection,
                TsvQueries(dlc[f'v1.0/{lang}/test-b/topics'], namespace=f'{NAME}/{lang}', lang=lang),
                documentation(f'{lang}/test-b'))
    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    # NOTE(review): `collection` here is whatever the last loop iteration
    # bound ('zh'), so the module-level `collection` is only that language's
    # corpus -- confirm this is intended.
    return collection, subsets
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/mmarco.py
================================================
import io
import codecs
import re
import ir_datasets
from ir_datasets.util import DownloadConfig, Lazy
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.datasets import msmarco_passage
from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TsvDocPairs, TrecScoredDocs
NAME = 'mmarco'
_logger = ir_datasets.log.easy()
# Single positive relevance level (the qrels are shared with msmarco-passage).
QRELS_DEFS = {
    1: 'Labeled by crowd worker as relevant'
}
def _init():
    """Build and register the mMARCO datasets: machine-translated MS MARCO
    passage collections and queries (v1 and v2) that reuse the English
    qrels/doc-pairs (query and doc ids match the English msmarco-passage ids).
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    # Language-independent components, shared across all translations.
    train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler()
    # NOTE(review): 'docparis' looks like a typo for 'docpairs' (local name only).
    train_docparis = TsvDocPairs(dlc['train/triples'])
    dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS)
    dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS)
    # The "small" dev subset is defined by the query ids present in its qrels.
    small_dev_qids = Lazy(lambda: {q.query_id for q in dev_small_qrels.qrels_iter()})
    for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']:
        collection = TsvDocs(dlc[f'{lang}/docs'], namespace=f'mmarco/{lang}', lang=lang, count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        subsets[f'{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/train'], namespace=f'mmarco/{lang}', lang=lang),
            train_qrels,
            train_docparis,
            documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/dev'], namespace=f'mmarco/{lang}', lang=lang),
            dev_qrels,
            documentation(f'{lang}/dev'))
        subsets[f'{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'{lang}/dev'].queries_handler(), small_dev_qids, mode='include'),
            dev_small_qrels,
            # zh/pt have no v1 scoreddocs for dev/small
            TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev']) if lang not in ('zh', 'pt') else None,
            documentation(f'{lang}/dev/small'))
        if lang in ('zh', 'pt'):
            # zh and pt additionally have corrected v1.1 query translations.
            subsets[f'{lang}/dev/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/dev/v1.1'], namespace=f'mmarco/{lang}', lang=lang),
                dev_qrels,
                documentation(f'{lang}/dev/v1.1'))
            subsets[f'{lang}/dev/small/v1.1'] = Dataset(
                collection,
                FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(), small_dev_qids, mode='include'),
                dev_small_qrels,
                TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']),
                # NOTE(review): documentation key is '{lang}/dev/v1.1', not
                # '{lang}/dev/small/v1.1' -- confirm whether intentional.
                documentation(f'{lang}/dev/v1.1'))
        if lang in ('pt',):
            subsets[f'{lang}/train/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/train/v1.1'], namespace=f'mmarco/{lang}', lang=lang),
                train_qrels,
                train_docparis,
                documentation(f'{lang}/train/v1.1'))
    # NOTE(review): 'dt' below is an unusual language code (Dutch is 'nl') --
    # verify against the v2 release's naming.
    for lang in ['ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru', 'es', 'vi']:
        collection = TsvDocs(dlc[f'v2/{lang}/docs'], namespace=f'mmarco/{lang}', lang=lang, count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}'))
        subsets[f'v2/{lang}'] = Dataset(collection, documentation(f'v2/{lang}'))
        subsets[f'v2/{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/train'], namespace=f'mmarco/v2/{lang}', lang=lang),
            train_qrels,
            train_docparis,
            documentation(f'v2/{lang}/train'))
        subsets[f'v2/{lang}/dev'] = Dataset(
            collection,
            # NOTE(review): namespace 'v2/mmarco/{lang}' is inconsistent with
            # 'mmarco/v2/{lang}' used for train above -- confirm intended.
            TsvQueries(dlc[f'v2/{lang}/queries/dev'], namespace=f'v2/mmarco/{lang}', lang=lang),
            dev_qrels,
            documentation(f'v2/{lang}/dev'))
        subsets[f'v2/{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'v2/{lang}/dev'].queries_handler(), small_dev_qids, mode='include'),
            dev_small_qrels,
            TrecScoredDocs(dlc[f'v2/{lang}/scoreddocs/dev'], negate_score=True),
            documentation(f'v2/{lang}/dev/small'))
    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    # NOTE(review): `collection` is whatever the last loop iteration bound
    # (v2 'vi'); the module-level `collection` is only that one -- confirm.
    return collection, subsets
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/mr_tydi.py
================================================
import json
import codecs
from typing import NamedTuple, Dict
import ir_datasets
from ir_datasets.util import TarExtractAll, RelativePath, GzipExtract, Migrator
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.formats import TsvQueries, BaseDocs, TrecQrels, GenericDoc
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()
NAME = 'mr-tydi'
QREL_DEFS = {
1: "Passage identified within Wikipedia article from top Google search results"
}
class MrTydiDocs(BaseDocs):
    """Mr. TyDi passage collection for one language, read from a gzipped JSONL file."""
    def __init__(self, dlc, lang, count_hint=None):
        super().__init__()
        self._dlc = dlc
        self._count_hint = count_hint
        self._lang = lang
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # One JSON record per line with 'id' and 'contents' keys.
        with self._dlc.stream() as fin:
            for raw_line in fin:
                record = json.loads(raw_line)
                yield GenericDoc(record['id'], record['contents'])
    def docs_cls(self):
        return GenericDoc
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        # Lazily-built LZ4-compressed docstore keyed on doc_id, one per language.
        store_path = f'{ir_datasets.util.home_path()/NAME/self._lang}.pklz4'
        return PickleLz4FullStore(
            path=store_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options,
        )
    def docs_count(self):
        # A count is only available once the docstore has been materialized.
        store = self.docs_store()
        return store.count() if store.built() else None
    def docs_namespace(self):
        return f'{NAME}/{self._lang}'
    def docs_lang(self):
        return self._lang
def _init():
    """Wire up the mr-tydi datasets (one per language, each with train/dev/test
    splits) and register them all with ir_datasets. Returns the base dataset and
    the dict of named subsets."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    subsets = {}
    langs = {
        'ar': 'mrtydi-v1.0-arabic',
        'bn': 'mrtydi-v1.0-bengali',
        'en': 'mrtydi-v1.0-english',
        'fi': 'mrtydi-v1.0-finnish',
        'id': 'mrtydi-v1.0-indonesian',
        'ja': 'mrtydi-v1.0-japanese',
        'ko': 'mrtydi-v1.0-korean',
        'ru': 'mrtydi-v1.0-russian',
        'sw': 'mrtydi-v1.0-swahili',
        'te': 'mrtydi-v1.0-telugu',
        'th': 'mrtydi-v1.0-thai',
    }
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/lang for lang in langs],
        message='Migrating mr-tydi (restructuring directory)')
    for lang, file_name in langs.items():
        dlc_ds = TarExtractAll(dlc[lang], f'{base_path/lang}.data')
        docs = migrator(MrTydiDocs(
            GzipExtract(RelativePath(dlc_ds, f'{file_name}/collection/docs.jsonl.gz')),
            lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}')))
        # Full per-language dataset (all topics/qrels).
        subsets[lang] = Dataset(
            docs,
            TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.tsv'), lang=lang),
            TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.txt'), QREL_DEFS),
            documentation(lang),
        )
        # The three official splits share the same structure; only the file suffix differs.
        for split in ('train', 'dev', 'test'):
            subsets[f'{lang}/{split}'] = Dataset(
                docs,
                TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.{split}.tsv'), lang=lang),
                TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.{split}.txt'), QREL_DEFS),
                documentation(f'{lang}/{split}'),
            )
    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])
    return base, subsets
# Module-level side effect: build and register all mr-tydi datasets at import time.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/msmarco_document.py
================================================
from typing import NamedTuple, List
import json
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
from ir_datasets.formats import TrecDocs, TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
NAME = 'msmarco-document'
_logger = ir_datasets.log.easy()
QRELS_DEFS = {
1: 'Document contains a passage labeled as relevant in msmarco-passage'
}
TREC_DL_QRELS_DEFS = {
3: "Perfectly relevant: Document is dedicated to the query, it is worthy of being a top result "
"in a search engine.",
2: "Highly relevant: The content of this document provides substantial information on the query.",
1: "Relevant: Document provides some information relevant to the query, which may be minimal.",
0: "Irrelevant: Document does not provide any useful information about the query",
}
ORCAS_QLRES_DEFS = {
1: "User click",
}
class MsMarcoDocument(NamedTuple):
    """A document from the MS MARCO document collection."""
    doc_id: str
    url: str
    title: str
    body: str
    def default_text(self):
        """
        title + body
        """
        return ' '.join((self.title, self.body))
# Use the TREC-formatted docs so we get all the available formatting (namely, line breaks)
class MsMarcoTrecDocs(TrecDocs):
    """MS MARCO document collection parsed from the TREC-formatted source file.

    The TREC source preserves line breaks, which lets the URL/title/body
    structure of each record be recovered from the raw text.
    """
    def __init__(self, docs_dlc):
        super().__init__(docs_dlc, parser='text', lang='en', docstore_size_hint=14373971970, count_hint=ir_datasets.util.count_hint(NAME))
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        """Yield MsMarcoDocument records, reformatting raw parser output as needed."""
        for doc in super().docs_iter():
            if isinstance(doc, MsMarcoDocument):
                # It's coming from the docstore (already reformatted)
                yield doc
            else:
                # It's coming from the TrecDocs parser... Do a little more reformatting:
                # The first two lines are the URL and page title
                url, title, *body = doc.text.lstrip('\n').split('\n', 2)
                # split('\n', 2) may yield fewer than 3 parts when the body is empty
                body = body[0] if body else ''
                yield MsMarcoDocument(doc.doc_id, url, title, body)
    def docs_cls(self):
        return MsMarcoDocument
    def docs_namespace(self):
        return NAME
class MsMarcoAnchorTextDocument(NamedTuple):
    """An MS MARCO document represented by the anchor text of links pointing at it."""
    doc_id: str
    text: str
    anchors: List[str]
    def default_text(self):
        """
        text + anchors
        """
        joined_anchors = ' '.join(self.anchors)
        return f'{self.text} {joined_anchors}'
class MsMarcoAnchorTextDocs(BaseDocs):
    """Anchor-text view of the MS MARCO document collection, read from JSONL."""
    def __init__(self, dlc, count_hint):
        super().__init__()
        self._dlc = dlc
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # One JSON record per line: {"id": ..., "anchors": [...]}
        with self._dlc.stream() as fin:
            for raw_line in fin:
                record = json.loads(raw_line)
                anchors = record['anchors']
                yield MsMarcoAnchorTextDocument(record['id'], ' '.join(anchors), anchors)
    def docs_cls(self):
        return MsMarcoAnchorTextDocument
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        # Lazily-built LZ4-compressed docstore keyed on doc_id.
        store_path = f'{ir_datasets.util.home_path()}/{NAME}/anchor-text.pklz4'
        return PickleLz4FullStore(
            path=store_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options,
        )
    def docs_count(self):
        # A count is only available once the docstore has been materialized.
        store = self.docs_store()
        return store.count() if store.built() else None
    def docs_namespace(self):
        return f'{NAME}/anchor-text'
    def docs_lang(self):
        return 'en'
def _init():
    """Build the msmarco-document dataset family and register it with ir_datasets.

    Returns the shared document collection and the dict of named subsets.
    Runs once at import time; downloads are deferred until data is actually used.
    """
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))
    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )
    # eval has no public qrels -- only queries and scoreddocs.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )
    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )
    # "judged" variants restrict queries/scoreddocs to query ids with at least one qrel.
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )
    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
        affected_files=[base_path/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard queries are drawn from the 2019 and 2020 DL query files.
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )
    # The five DL-Hard folds are structurally identical, so build them in a loop
    # instead of copy-pasting. The fold's qid set is bound via a default argument
    # so each Lazy captures its own fold (avoids the late-binding-closure pitfall).
    for fold, fold_qids in sorted(DL_HARD_QIDS_BYFOLD.items()):
        lazy_fold_qids = Lazy(lambda qids=fold_qids: qids)
        subsets[f'trec-dl-hard/fold{fold}'] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, lazy_fold_qids),
            FilteredQrels(subsets['trec-dl-hard'], lazy_fold_qids),
            documentation(f'trec-dl-hard/fold{fold}')
        )
    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=1703834
        ),
        documentation('anchor-text')
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
# Module-level side effect: build and register all msmarco-document datasets at import time.
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/msmarco_document_v2.py
================================================
import contextlib
import gzip
import io
from pathlib import Path
import json
from typing import NamedTuple, Tuple, List
import tarfile
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
from ir_datasets.datasets.msmarco_document import TREC_DL_QRELS_DEFS
_logger = ir_datasets.log.easy()
NAME = 'msmarco-document-v2'
QRELS_DEFS = {
1: 'Document contains a passage labeled as relevant in msmarco-passage'
}
class MsMarcoV2Document(NamedTuple):
    """A document from the MS MARCO v2 document collection."""
    doc_id: str
    url: str
    title: str
    headings: str
    body: str
    def default_text(self):
        """
        title + headings + body
        """
        return ' '.join((self.title, self.headings, self.body))
class MsMarcoV2Docs(BaseDocs):
    """MS MARCO v2 document collection, read by streaming the distributed tar of
    gzipped JSONL shards."""
    def __init__(self, dlc):
        super().__init__()
        self._dlc = dlc
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        """Iterate documents by streaming the tar (mode='r|') and decompressing each .gz member."""
        with self._dlc.stream() as stream, \
             tarfile.open(fileobj=stream, mode='r|') as tarf:
            for record in tarf:
                # Skip any tar member that isn't a gzipped data shard.
                if not record.name.endswith('.gz'):
                    continue
                file = tarf.extractfile(record)
                with gzip.open(file) as file:
                    for line in file:
                        data = json.loads(line)
                        yield MsMarcoV2Document(
                            data['docid'],
                            data['url'],
                            data['title'],
                            data['headings'],
                            data['body'])
    def docs_cls(self):
        return MsMarcoV2Document
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        # NOTE: the MS MARCO v2 documents have this really neat quality that they contain the offset
        # position in the source file: .
        # Unfortunately, it points to the position in the *uncompressed* file, so for this to work, we'd
        # need to decompress the source files, inflating the size ~3.3x. The options would be to:
        #  1) Always de-compress the source files, costing everybody ~3.3x the storage. Ouch.
        #  2) De-compress the source files the first time that the docstore is requested. This would
        #     only cost the users who use the docstore 3.3x, but increases the complexity of the
        #     iteration code to handle both compressed and non-compressed versions. Would also need code
        #     to handle stuff like fancy slicing, which wouldn't be trivial. Would we also keep
        #     the original source file around? If so, it actually ends up being 4.3x.
        #  3) Build a PickleLz4FullStore on demand, as normal. This would only cost the users who use
        #     the docstore ~2.7x (accounting for worse lz4 compression rate and keeping around original
        #     copy of the data), but is also slightly slower because of the O(log n) position lookups and
        #     decompression. (This may be offset because pickle parsing is faster than json though.)
        #     It also reduces the complexity of the code, as it does not require a new docstore
        #     implementation for this dataset, and is just doing the normal procedure.
        return PickleLz4FullStore(
            path=f'{self._dlc.path(force=False)}.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            key_field_prefix='msmarco_doc_', # cut down on storage by removing prefix in lookup structure
            size_hint=66500029281,
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )
        # return MsMArcoV2DocStore(self)
    def docs_count(self):
        # A count is only available once the docstore has been built.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
class MsMarcoV2AnchorTextDocument(NamedTuple):
    """An MS MARCO v2 document represented by the anchor text of links pointing at it."""
    doc_id: str
    text: str
    anchors: List[str]
    def default_text(self):
        """
        text + anchors
        """
        joined_anchors = ' '.join(self.anchors)
        return f'{self.text} {joined_anchors}'
class MsMarcoV2AnchorTextDocs(BaseDocs):
    """Anchor-text view of the MS MARCO v2 document collection, read from JSONL."""
    def __init__(self, dlc, count_hint):
        super().__init__()
        self._dlc = dlc
        self._count_hint = count_hint
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # One JSON record per line: {"id": ..., "anchors": [...]}
        with self._dlc.stream() as fin:
            for raw_line in fin:
                record = json.loads(raw_line)
                anchors = record['anchors']
                yield MsMarcoV2AnchorTextDocument(record['id'], ' '.join(anchors), anchors)
    def docs_cls(self):
        return MsMarcoV2AnchorTextDocument
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        # Lazily-built LZ4-compressed docstore keyed on doc_id.
        store_path = f'{ir_datasets.util.home_path()}/{NAME}/anchor-text.pklz4'
        return PickleLz4FullStore(
            path=store_path,
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
            options=options,
        )
    def docs_count(self):
        # A count is only available once the docstore has been materialized.
        store = self.docs_store()
        return store.count() if store.built() else None
    def docs_namespace(self):
        return f'{NAME}/anchor-text'
    def docs_lang(self):
        return 'en'
def _init():
    """Build the msmarco-document-v2 dataset family and register it with ir_datasets.

    Returns the shared document collection and the dict of named subsets.
    Runs once at import time; downloads are deferred until data is actually used.
    """
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])
    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    # TREC DL 2019/2020 use different download keys and have no scoreddocs here,
    # so they are built explicitly rather than in the 2021-2023 loop below.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    # "judged" variants restrict queries to query ids with at least one qrel.
    dl19_v2_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    # TREC DL 2021-2023 share identical structure; build them in a loop instead of
    # copy-pasting. `year` is bound via a default argument so each Lazy captures its
    # own year (avoids the late-binding-closure pitfall).
    for year in ('2021', '2022', '2023'):
        subsets[f'trec-dl-{year}'] = Dataset(
            collection,
            TsvQueries(dlc[f'trec-dl-{year}/queries'], namespace='msmarco', lang='en'),
            TrecQrels(dlc[f'trec-dl-{year}/qrels'], TREC_DL_QRELS_DEFS),
            TrecScoredDocs(GzipExtract(dlc[f'trec-dl-{year}/scoreddocs'])),
        )
        judged_qids = Lazy(lambda year=year: {q.query_id for q in subsets[f'trec-dl-{year}'].qrels_iter()})
        subsets[f'trec-dl-{year}/judged'] = Dataset(
            FilteredQueries(subsets[f'trec-dl-{year}'].queries_handler(), judged_qids),
            FilteredScoredDocs(subsets[f'trec-dl-{year}'].scoreddocs_handler(), judged_qids),
            subsets[f'trec-dl-{year}'],
        )
    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=4821244
        ),
        documentation('anchor-text')
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
# Module-level side effect: build and register all msmarco-document-v2 datasets at import time.
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/msmarco_passage.py
================================================
import hashlib
import io
import codecs
import re
import ir_datasets
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig, Migrator
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TrecScoredDocs, TsvDocPairs
_logger = ir_datasets.log.easy()
NAME = 'msmarco-passage'
DUA = ("Please confirm you agree to the MSMARCO data usage agreement found at "
"")
QRELS_DEFS = {
1: 'Labeled by crowd worker as relevant'
}
TREC_DL_QRELS_DEFS = {
3: "Perfectly relevant: The passage is dedicated to the query and contains the exact answer.",
2: "Highly relevant: The passage has some answer for the query, but the answer may be a bit "
"unclear, or hidden amongst extraneous information.",
1: "Related: The passage seems related to the query but does not answer it.",
0: "Irrelevant: The passage has nothing to do with the query.",
}
SPLIT200_QIDS = {'484694', '836399', '683975', '428803', '1035062', '723895', '267447', '325379', '582244', '148817', '44209', '1180950', '424238', '683835', '701002', '1076878', '289809', '161771', '807419', '530982', '600298', '33974', '673484', '1039805', '610697', '465983', '171424', '1143723', '811440', '230149', '23861', '96621', '266814', '48946', '906755', '1142254', '813639', '302427', '1183962', '889417', '252956', '245327', '822507', '627304', '835624', '1147010', '818560', '1054229', '598875', '725206', '811871', '454136', '47069', '390042', '982640', '1174500', '816213', '1011280', '368335', '674542', '839790', '270629', '777692', '906062', '543764', '829102', '417947', '318166', '84031', '45682', '1160562', '626816', '181315', '451331', '337653', '156190', '365221', '117722', '908661', '611484', '144656', '728947', '350999', '812153', '149680', '648435', '274580', '867810', '101999', '890661', '17316', '763438', '685333', '210018', '600923', '1143316', '445800', '951737', '1155651', '304696', '958626', '1043094', '798480', '548097', '828870', '241538', '337392', '594253', '1047678', '237264', '538851', '126690', '979598', '707766', '1160366', '123055', '499590', '866943', '18892', '93927', '456604', '560884', '370753', '424562', '912736', '155244', '797512', '584995', '540814', '200926', '286184', '905213', '380420', '81305', '749773', '850038', '942745', '68689', '823104', '723061', '107110', '951412', '1157093', '218549', '929871', '728549', '30937', '910837', '622378', '1150980', '806991', '247142', '55840', '37575', '99395', '231236', '409162', '629357', '1158250', '686443', '1017755', '1024864', '1185054', '1170117', '267344', '971695', '503706', '981588', '709783', '147180', '309550', '315643', '836817', '14509', '56157', '490796', '743569', '695967', '1169364', '113187', '293255', '859268', '782494', '381815', '865665', '791137', '105299', '737381', '479590', '1162915', '655989', '292309', '948017', '1183237', '542489', '933450', '782052', 
'45084', '377501', '708154'}
# from on 30 April 2021
DL_HARD_QIDS_BYFOLD = {
"1": {'915593', '451602', '966413', '1056204', '182539', '655914', '67316', '883915', '1049519', '174463'},
"2": {'794429', '588587', '1114646', '537817', '1065636', '144862', '443396', '332593', '1103812', '19335'},
"3": {'177604', '1108939', '264403', '86606', '1133485', '1117817', '705609', '315637', '673670', '1105792'},
"4": {'801118', '507445', '87452', '88495', '554515', '166046', '730539', '1108100', '1109707', '1056416'},
"5": {'190044', '527433', '489204', '877809', '1106007', '47923', '1136769', '1112341', '1103153', '273695'},
}
DL_HARD_QIDS = set.union(*DL_HARD_QIDS_BYFOLD.values())
# Rewrites "top1000" MS run files ("QID DID QText DText") down to "QID DID" pairs,
# dropping the query/document text columns that duplicate the query and corpus files.
class ExtractQidPid:
    def __init__(self, streamer):
        self._streamer = streamer
    def stream(self):
        # Expose the generated pairs as a buffered binary stream.
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)
    def __iter__(self):
        with self._streamer.stream() as fin:
            for row in _logger.pbar(fin, desc='extracting QID/PID pairs', unit='pair'):
                # Each row must have exactly four tab-separated fields.
                qid, did, _qtext, _dtext = row.split(b'\t')
                yield qid + b'\t' + did + b'\n'
# The encoding of the MS MARCO passage collection is... weird...
# Some characters are properly utf8-encoded, while others are not, even within the same passage.
# So, this custom-built streaming class aims to fix that. What it does is find "suspicious"
# characters, basically anything in the 128-255 range. Once found, it will pick 2-4 characters
# around it and try to encode them as latin-1 and decode them as utf8.
class FixEncoding:
    """Streamer wrapper that repairs the mixed encoding of the MS MARCO passage file.

    Some characters in the source are proper utf8 while others appear to be utf8
    byte sequences that were decoded as latin-1, even within the same passage.
    This class scans for suspicious characters (code points 128-255) and re-encodes
    small windows around them as latin-1 / re-decodes as utf8 to recover the
    intended character.
    """
    def __init__(self, streamer):
        # streamer: any object exposing .stream() -> binary file-like context manager
        self._streamer = streamer
    def stream(self):
        # Present the fixed lines as a buffered binary stream.
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)
    def __iter__(self):
        SUS = '[\x80-\xff]'
        # Find sequences of up to 4 characters that contain a suspicious character.
        # We'll attempt to interpret these as latin1 characters and then decode them back to UTF8.
        # With this technique, we get 100% matches with MS MARCO QnA passages (which do not have this encoding issue)
        # This approach is more than twice as fast as using ftfy
        regexes = [
            re.compile(f'(...{SUS}|..{SUS}.|.{SUS}..|{SUS}...)'),
            re.compile(f'(..{SUS}|.{SUS}.|{SUS}..)'),
            re.compile(f'(.{SUS}|{SUS}.)'),
        ]
        with self._streamer.stream() as stream, \
             _logger.pbar_raw(desc='fixing encoding', unit='B', unit_scale=True) as pbar:
            # NOTE: codecs.getreader is subtly broken here; it sometimes splits lines between special characters (and it's unclear why)
            for line in stream:
                pbar.update(len(line))
                line = line.decode('utf8')
                for regex in regexes:
                    pos = 0
                    while pos < len(line):
                        match = regex.search(line, pos=pos)
                        if not match:
                            break
                        try:
                            fixed = match.group().encode('latin1').decode('utf8')
                            # Only apply the fix when the whole window collapses to a
                            # single character; otherwise the match likely included
                            # unrelated neighboring characters.
                            if len(fixed) == 1:
                                line = line[:match.start()] + fixed + line[match.end():]
                        except UnicodeError:
                            # Window wasn't a valid latin1->utf8 round trip; leave it as-is.
                            pass
                        # Advance just past the match start so overlapping windows are retried.
                        pos = match.start() + 1
                yield line.encode()
# Converts "small triples" MS files to "qid/pos_did/neg_did" format to remove tons of redundant storage.
class MapSmallTriplesQidPid:
    """Streamer wrapper that maps the "triples.train.small" file (which stores full
    query/passage text) down to ID triples by hashing the text back to its IDs.

    Text-to-ID maps are built from truncated md5 digests of the corpus passages
    (7 bytes) and training queries (6 bytes); per the inline comments these
    truncations are collision-free on this data while reducing memory.
    """
    def __init__(self, streamer, corpus_stream, queries_handler):
        self._streamer = streamer
        self._corpus_stream = corpus_stream # note: must use raw topics here because this file also includes the broken text found in the corpus file
        self._queries_handler = queries_handler
    def stream(self):
        # Present the generated ID triples as a buffered binary stream.
        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)
    def __iter__(self):
        # Strangely, in this file, the query text is mangled, even though the query source file isn't.
        # So we need to apply the encoding fix that's normally applied to the docs to the queries here.
        SUS = '[\x80-\xff]'
        regexes = [
            re.compile(f'(...{SUS}|..{SUS}.|.{SUS}..|{SUS}...)'),
            re.compile(f'(..{SUS}|.{SUS}.|{SUS}..)'),
            re.compile(f'(.{SUS}|{SUS}.)'),
        ]
        # Step 1: map truncated hash of each passage's text -> integer doc id.
        passagehash_did_map = {}
        with self._corpus_stream.stream() as fin:
            for line in _logger.pbar(fin, desc='build d text lookup (step 1 of 3)', total=8841823):
                did, contents = line.rstrip(b'\n').split(b'\t')
                content_hash = hashlib.md5(contents).digest()[:7] # 7 byte version results in no collisions & reduces memory
                assert content_hash not in passagehash_did_map
                passagehash_did_map[bytes(content_hash)] = int(did) # int did reduces memory
        # Step 2: map truncated hash of each query's text -> integer query id.
        queryhash_qid_map = {}
        for query in _logger.pbar(self._queries_handler.queries_iter(), desc='build q text lookup (step 2 of 3)', total=808731):
            query_hash = hashlib.md5(query.text.encode()).digest()[:6] # 6 byte version results in no collisions & reduces memory
            assert query_hash not in queryhash_qid_map
            queryhash_qid_map[bytes(query_hash)] = int(query.query_id) # int qid reduces memory
        # Step 3: translate each (query text, pos text, neg text) triple into IDs.
        with self._streamer.stream() as fin:
            for line in _logger.pbar(fin, desc='map d/q text to IDs (step 3 of 3)', total=39780811):
                query, doc1, doc2 = line.rstrip(b'\n').split(b'\t')
                query = query.decode('utf8')
                # Apply the same windowed latin1->utf8 repair used for the corpus text.
                for regex in regexes:
                    pos = 0
                    while pos < len(query):
                        match = regex.search(query, pos=pos)
                        if not match:
                            break
                        try:
                            fixed = match.group().encode('latin1').decode('utf8')
                            if len(fixed) == 1:
                                query = query[:match.start()] + fixed + query[match.end():]
                        except UnicodeError:
                            pass
                        pos = match.start() + 1
                # Truncation lengths must match those used when building the maps above.
                query_hash = hashlib.md5(query.encode()).digest()[:6]
                doc1_hash = hashlib.md5(doc1).digest()[:7]
                doc2_hash = hashlib.md5(doc2).digest()[:7]
                yield f'{queryhash_qid_map[query_hash]}\t{passagehash_did_map[doc1_hash]}\t{passagehash_did_map[doc2_hash]}\n'.encode()
def _init():
documentation = YamlDocumentation(f'docs/{NAME}.yaml')
base_path = ir_datasets.util.home_path()/NAME
dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
migrator = Migrator(base_path/'irds_version.txt', 'v2',
affected_files=[base_path/'collection.tsv', base_path/'collection.tsv.pklz4'],
message=f'Migrating {NAME} (fixing passage encoding)')
collection = TsvDocs(Cache(FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')), base_path/'collection.tsv'), namespace='msmarco', lang='en', docstore_size_hint=14373971970, count_hint=ir_datasets.util.count_hint(NAME))
collection = migrator(collection)
subsets = {}
subsets['train'] = Dataset(
collection,
TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path/'train/queries.tsv'), namespace='msmarco', lang='en'),
TrecQrels(dlc['train/qrels'], QRELS_DEFS),
TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')), base_path/'train/ms.run')),
)
subsets['train/triples-v2'] = Dataset(
collection,
subsets['train'].queries_handler(),
subsets['train'].qrels_handler(),
TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
subsets['train'].scoreddocs_handler(),
)
subsets['train/triples-small'] = Dataset(
collection,
subsets['train'].queries_handler(),
subsets['train'].qrels_handler(),
TsvDocPairs(Cache(MapSmallTriplesQidPid(TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'), TarExtract(dlc['collectionandqueries'], 'collection.tsv'), subsets['train'].queries_handler()), base_path/'train/small.triples.qidpid.tsv')),
subsets['train'].scoreddocs_handler(),
)
subsets['dev'] = Dataset(
collection,
TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path/'dev/queries.tsv'), namespace='msmarco', lang='en'),
TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
)
subsets['dev/small'] = Dataset(
collection,
TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'), base_path/'dev/small/queries.tsv'), namespace='msmarco', lang='en'),
TrecQrels(Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'), base_path/'dev/small/qrels'), QRELS_DEFS),
TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')), base_path/'dev/ms.run')),
)
subsets['eval'] = Dataset(
collection,
TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path/'eval/queries.tsv'), namespace='msmarco', lang='en'),
)
subsets['eval/small'] = Dataset(
collection,
TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'), base_path/'eval/small/queries.tsv'), namespace='msmarco', lang='en'),
TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')), base_path/'eval/ms.run')),
)
subsets['trec-dl-2019'] = Dataset(
collection,
TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'), namespace='msmarco', lang='en'),
TrecScoredDocs(Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])), base_path/'trec-dl-2019/ms.run')),
)
subsets['trec-dl-2020'] = Dataset(
collection,
TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
TrecScoredDocs(Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])), base_path/'trec-dl-2020/ms.run')),
)
# A few subsets that are contrainted to just the queries/qrels/docpairs that have at least
# 1 relevance assessment
train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
subsets['train/judged'] = Dataset(
FilteredQueries(subsets['train'].queries_handler(), train_judged),
FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
subsets['train'],
)
dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
subsets['dev/judged'] = Dataset(
FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
subsets['dev'],
)
dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
subsets['trec-dl-2019/judged'] = Dataset(
FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
subsets['trec-dl-2019'],
)
dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
subsets['trec-dl-2020/judged'] = Dataset(
FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
subsets['trec-dl-2020'],
)
# split200 -- 200 queries held out from the training data for validation
split200 = Lazy(lambda: SPLIT200_QIDS)
subsets['train/split200-train'] = Dataset(
FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
subsets['train'],
)
subsets['train/split200-valid'] = Dataset(
FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
subsets['train'],
)
dev2_qids = Lazy(lambda: {q.query_id for q in ir_datasets.load('msmarco-passage-v2/dev2').queries})
subsets['dev/2'] = Dataset(
FilteredQueries(subsets['dev'].queries_handler(), dev2_qids),
FilteredQrels(subsets['dev'].qrels_handler(), dev2_qids),
subsets['dev'],
)
# Medical subset
def train_med():
with dlc['medmarco_ids'].stream() as stream:
stream = codecs.getreader('utf8')(stream)
return {l.rstrip() for l in stream}
train_med = Lazy(train_med)
subsets['train/medical'] = Dataset(
FilteredQueries(subsets['train'].queries_handler(), train_med),
FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
FilteredQrels(subsets['train'].qrels_handler(), train_med),
subsets['train'],
)
# DL-Hard
dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v3',
affected_files=[base_path/'trec-dl-hard'/'qrels'],
message='Updating trec-dl-hard qrels')
hard_qids = Lazy(lambda: DL_HARD_QIDS)
dl_hard_base_queries = TsvQueries([
Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
subsets['trec-dl-hard'] = Dataset(
collection,
FilteredQueries(dl_hard_base_queries, hard_qids),
dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
documentation('trec-dl-hard')
)
hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
subsets['trec-dl-hard/fold1'] = Dataset(
collection,
FilteredQueries(dl_hard_base_queries, hard_qids),
FilteredQrels(subsets['trec-dl-hard'], hard_qids),
documentation('trec-dl-hard/fold1')
)
hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
subsets['trec-dl-hard/fold2'] = Dataset(
collection,
FilteredQueries(dl_hard_base_queries, hard_qids),
FilteredQrels(subsets['trec-dl-hard'], hard_qids),
documentation('trec-dl-hard/fold2')
)
hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
subsets['trec-dl-hard/fold3'] = Dataset(
collection,
FilteredQueries(dl_hard_base_queries, hard_qids),
FilteredQrels(subsets['trec-dl-hard'], hard_qids),
documentation('trec-dl-hard/fold3')
)
hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
subsets['trec-dl-hard/fold4'] = Dataset(
collection,
FilteredQueries(dl_hard_base_queries, hard_qids),
FilteredQrels(subsets['trec-dl-hard'], hard_qids),
documentation('trec-dl-hard/fold4')
)
hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
subsets['trec-dl-hard/fold5'] = Dataset(
collection,
FilteredQueries(dl_hard_base_queries, hard_qids),
FilteredQrels(subsets['trec-dl-hard'], hard_qids),
documentation('trec-dl-hard/fold5')
)
ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
for s in sorted(subsets):
ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
return collection, subsets
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/msmarco_passage_v2.py
================================================
import re
import os
import contextlib
import gzip
import io
from pathlib import Path
import json
from typing import NamedTuple, Tuple
import tarfile
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS, FileAccess
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS, TREC_DL_QRELS_DEFS
# Relevance level definitions for msmarco-passage-v2 qrels. Judgments were not
# re-collected for v2; they are mapped over from the v1 judgments.
QRELS_DEFS = {
    1: 'Based on mapping from v1 of MS MARCO'
}
_logger = ir_datasets.log.easy()  # module-level logger (also provides pbar helpers)
NAME = 'msmarco-passage-v2'  # registry prefix for all datasets defined in this file
class MsMarcoV2Passage(NamedTuple):
    """A single passage record from the MS MARCO v2 passage corpus."""
    doc_id: str  # passage identifier (parsed from the 'pid' field of the source JSON)
    text: str  # passage text (source JSON 'passage' field)
    spans: Tuple[Tuple[int, int], ...]  # (start, end) pairs parsed from the 'spans' field; presumably offsets into the source document -- TODO confirm
    msmarco_document_id: str  # id of the corresponding msmarco-document-v2 record (source JSON 'docid' field)
    def default_text(self):
        # NOTE: the docstring below is intentionally terse; ir_datasets appears to
        # surface default_text docstrings in generated documentation, so it is left unchanged.
        """
        text
        """
        return self.text
def parse_msmarco_passage(line):
    """Parse one JSON line of the MS MARCO v2 passage corpus into a MsMarcoV2Passage.

    The 'spans' field is encoded as a string like "(123,456),(789,101123)";
    it is decoded here into a tuple of (start, end) int pairs.
    """
    record = json.loads(line)
    raw_spans = re.findall(r'\((\d+),(\d+)\)', record['spans'])
    span_pairs = tuple((int(start), int(end)) for start, end in raw_spans)
    return MsMarcoV2Passage(record['pid'], record['passage'], span_pairs, record['docid'])
class MsMarcoV2Passages(BaseDocs):
    """Document (passage) collection for msmarco-passage-v2.

    The corpus ships as a tar of gzipped JSONL bundle files; iteration either
    streams the tar directly or goes through the docstore (see docs_iter).
    """
    def __init__(self, dlc, pos_dlc=None):
        super().__init__()
        self._dlc = dlc  # download for the tar of gzipped JSONL bundles
        self._pos_dlc = pos_dlc  # optional download of per-bundle position (offset) files
    @ir_datasets.util.use_docstore
    def docs_iter(self):
        if self._pos_dlc is not None:
            # the shortcut only applies if the default pos
            # files are used (i.e., no filtering is applied)
            # NOTE(review): the comment above reads as if the shortcut should apply in the
            # *default* case, yet this branch triggers when a pos_dlc IS supplied -- confirm
            # the intended condition against MsMarcoV2DocStore's iterator behavior.
            yield from self.docs_store()
        else:
            # Stream the tar sequentially ('r|' = non-seekable stream mode) and
            # decompress each .gz member on the fly.
            with self._dlc.stream() as stream, \
                 tarfile.open(fileobj=stream, mode='r|') as tarf:
                for record in tarf:
                    if not record.name.endswith('.gz'):
                        continue  # skip directories / non-bundle members
                    file = tarf.extractfile(record)
                    with gzip.open(file) as file:
                        for line in file:
                            yield parse_msmarco_passage(line)
    def docs_cls(self):
        return MsMarcoV2Passage
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        assert field == 'doc_id'
        # Unlike for msmarco-document-v2, using the docstore actually hurts performance.
        return MsMarcoV2DocStore(self, options=options)
    def docs_count(self):
        # Only report a count once the docstore has been built (None otherwise).
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
    def docs_path(self, force=True):
        return self._dlc.path(force)
class MsMarcoV2DocStore(ir_datasets.indices.Docstore):
    def __init__(self, docs_handler, options=DEFAULT_DOCSTORE_OPTIONS):
        """Docstore over the extracted per-bundle files of msmarco-passage-v2.

        docs_handler: the MsMarcoV2Passages instance this store serves; its
        dlc/pos_dlc downloads are captured for later extraction and lookups.
        """
        super().__init__(docs_handler.docs_cls(), 'doc_id', options=options)
        self.np = ir_datasets.lazy_libs.numpy()  # numpy loaded lazily (used for .pos memmaps)
        self.docs_handler = docs_handler
        self.dlc = docs_handler._dlc
        self.pos_dlc = docs_handler._pos_dlc
        # Bundles are extracted next to the source download, under "<path>.extracted".
        self.base_path = docs_handler.docs_path(force=False) + '.extracted'
        if not os.path.exists(self.base_path):
            os.makedirs(self.base_path)
        self.size_hint = 60880127751  # approximate extracted size in bytes (disk-space hint)
        if options.file_access != FileAccess.FILE:
            # This store reads directly from extracted files on disk, so only FILE access works.
            _logger.warning(f"MsMarcoV2 passage only allows FILE access (requested {options.file_access})")
def get_many_iter(self, keys):
self.build()
# adapted from
bundles = {}
for key in keys:
if not key.count('_') == 3:
continue
(string1, string2, bundlenum, position) = key.split('_')
assert string1 == 'msmarco' and string2 == 'passage'
if bundlenum not in bundles:
bundles[bundlenum] = []
bundles[bundlenum].append(int(position))
for bundlenum, positions in bundles.items():
positions = sorted(positions)
file = f'{self.base_path}/msmarco_passage_{bundlenum}'
if not os.path.exists(file):
# invalid doc_id -- doesn't point to a real bundle
continue
if self.docs_handler._pos_dlc is not None:
# check the positions are valid for these doc_ids -- only return valid ones
mmp = self.np.memmap(os.path.join(self.pos_dlc.path(), f'msmarco_passage_{bundlenum}.pos'), dtype='= self.slice.stop:
raise StopIteration
while self.next_index != self.slice.start or self.current_file is None or self.current_file_end_idx <= self.slice.start or self.current_pos_mmap[self.slice.start - self.current_file_start_idx] != self.current_file.tell():
if self.current_file is None or self.current_file_end_idx <= self.slice.start:
# First iteration or no docs remaining in this file
if self.current_file is not None:
self.current_file.close()
self.current_file = None
# jump ahead to the file that contains the desired index
first = True
while first or self.current_file_end_idx < self.slice.start:
source_file = next(self.file_iter)
self.next_index = self.current_file_end_idx
self.current_file_start_idx = self.current_file_end_idx
pos_file = source_file + '.pos'
if self.docstore.pos_dlc is not None:
pos_file = os.path.join(self.docstore.pos_dlc.path(), source_file.split('/')[-1] + '.pos')
self.current_file_end_idx = self.current_file_start_idx + (os.path.getsize(pos_file) // 4)
first = False
self.current_file = open(source_file, 'rb')
self.current_pos_mmap = self.np.memmap(pos_file, dtype='")
# Relevance level definitions for the msmarco-qna qrels (derived from the
# per-passage "is_selected" flags in the source data).
QRELS_DEFS = {
    1: 'Marked by annotator as a contribution to their answer',
    0: 'Not marked by annotator as a contribution to their answer',
}
# Literal string the source data uses when a query has no answer; stripped out
# when building the query files (see MsMarcoQnAManager.build).
NO_ANSWER_PLACEHOLDER = 'No Answer Present.'
class MsMarcoQnAQuery(NamedTuple):
    """A train/dev query from MS MARCO QnA, including its annotated answers."""
    query_id: str
    text: str  # query text
    type: str  # query type label from the source data (e.g. values of "query_type")
    answers: Tuple[str, ...]  # annotator answers; "no answer" placeholders are removed at build time
    def default_text(self):
        """
        text
        """
        return self.text
class MsMarcoQnAEvalQuery(NamedTuple):
    """An eval-split query from MS MARCO QnA (no answers are released for eval)."""
    query_id: str
    text: str  # query text
    type: str  # query type label from the source data
    def default_text(self):
        """
        text
        """
        return self.text
class MsMarcoQnADoc(NamedTuple):
    """A passage from MS MARCO QnA, linked back to the v1 passage/document corpora."""
    doc_id: str  # "<passage_id>-<url_index>" assigned during build (see MsMarcoQnAManager.build)
    text: str  # passage text
    url: str  # source URL reported for this passage
    msmarco_passage_id: str  # matching msmarco-passage doc_id (matched by text hash)
    msmarco_document_id: str  # matching msmarco-document doc_id (matched by URL hash); may be None
    def default_text(self):
        """
        text
        """
        return self.text
# The MS MARCO QnA data files are in a super inconvenient format. They have a script to convert it
# to JSONL format, but it involves loading the entire collection into memory and doing merging via
# pandas, which is a non-starter. So we'll incrementally process the dataset using ijson.
# Format:
# {
# "answers": {
# "XXX": ["", ""],
# ...
# },
# "passages": {
# "XXX": {
# "is_selected": 0,
# "passage_text": "",
# "url": ""
# },
# ...
# },
# "query": {"XXX": "", ...},
# "query_type": {"XXX": "", ...},
# "query_id": {"XXX": 0, ...}
# }
# Where XXX is an ID used only for linking the records here in this file. Luckily, they are sorted
# so we don't actually need to deal with them.
# What's worse is that "passages" can be repeated and they don't have an ID. So we'll assign one
# in the order that they appear in the file, skipping duplicates.
# To find duplicates, we'll hash the text and url and keep that in a lookup. It's not ideal, but
# better than keeping a copy of all the passage texts in memory. I found that I can use a shorter
# version of the hashes that do not end up colliding. This reduces the memory overhead.
# The process ends up building out a collection-wide docstore and id/query/type/answers/qrels files
# for each split, that then get merged into query and qrel TSV files.
class MsMarcoQnAManager:
    """Builds the msmarco-qna docstore plus per-split query/qrel/run files.

    The three source downloads (train/dev/eval) are processed incrementally with
    ijson (see the long comment above the class for the source format), producing
    a collection-wide docstore and temporary per-split column files that are then
    merged into queries.tsv / qrels / run files.
    """
    def __init__(self, train_dlc, dev_dlc, eval_dlc, base_path):
        self._train_dlc = train_dlc
        self._dev_dlc = dev_dlc
        self._eval_dlc = eval_dlc
        self._docs_store = None  # lazily constructed in _internal_docs_store
        self._base_path = base_path
    def docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS):
        """Return the docstore, building all derived files first if needed."""
        self.build()
        return self._internal_docs_store(options)
    def _internal_docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS):
        # NOTE(review): `options` is accepted but not forwarded to PickleLz4FullStore
        # (and the store is cached across calls) -- confirm whether this is intentional.
        if self._docs_store is None:
            self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, MsMarcoQnADoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME))
        return self._docs_store
    def build(self):
        """Process the raw QnA files into the docstore and per-split files (idempotent)."""
        ijson = ir_datasets.lazy_libs.ijson()
        docs_store = self._internal_docs_store()
        if docs_store.built():
            return # already built
        # Map md5(text)[:8] -> (msmarco-passage id, {md5(url)[:8] -> url index}).
        # 8-byte hash prefixes are used to keep memory down; asserts below verify no collisions.
        dochash_lookup = {}
        for doc in _logger.pbar(ir_datasets.load('msmarco-passage').docs_iter(), desc='building msmarco-passage lookup', total=ir_datasets.load('msmarco-passage').docs_count(), unit='doc'):
            dochash = bytes(hashlib.md5(doc.text.encode()).digest()[:8])
            assert dochash not in dochash_lookup
            dochash_lookup[dochash] = (int(doc.doc_id), {})
        # Map md5(url)[:8] -> msmarco-document id, for linking passages to documents.
        urlhash_lookup = {}
        for doc in _logger.pbar(ir_datasets.load('msmarco-document').docs_iter(), desc='building msmarco-document lookup', total=ir_datasets.load('msmarco-document').docs_count(), unit='doc'):
            urlhash = bytes(hashlib.md5(doc.url.encode()).digest()[:8])
            assert urlhash not in urlhash_lookup
            urlhash_lookup[urlhash] = doc.doc_id
        nil_doc = MsMarcoQnADoc(None, None, None, None, None)
        current_doc = nil_doc  # passage record being accumulated from ijson events
        # ijson event prefixes for each top-level key of the source JSON
        prefix_passages = re.compile(r'^passages\.\d+\.item$')
        prefix_answers = re.compile(r'^answers\.\d+\.item$')
        prefix_type = re.compile(r'^query_type\.\d+$')
        prefix_text = re.compile(r'^query\.\d+$')
        prefix_id = re.compile(r'^query_id\.\d+$')
        pbar_postfix = {'file': None, 'missing_urls': 0, 'key': None}
        with contextlib.ExitStack() as outer_stack:
            docs_trans = outer_stack.enter_context(docs_store.lookup.transaction())
            pbar = outer_stack.enter_context(_logger.pbar_raw(desc='processing qna', postfix=pbar_postfix, unit='item'))
            for dlc, file_str in [(self._train_dlc, 'train'), (self._dev_dlc, 'dev'), (self._eval_dlc, 'eval')]:
                pbar_postfix['file'] = file_str
                last_ans_prefix = None  # tracks answer grouping: same prefix => same query
                last_psg_prefix = None  # tracks passage grouping: same prefix => same query
                is_selected = None
                with contextlib.ExitStack() as inner_stack:
                    stream = inner_stack.enter_context(dlc.stream())
                    parser = ijson.parse(stream)
                    # one temp column file per field; merged after all splits are processed
                    out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt'))
                    out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt'))
                    out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt'))
                    if file_str != 'eval':
                        out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt'))
                        # NOTE(review): 'wt+' opens read/write; only writes happen here -- presumably 'wt' was meant
                        out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+'))
                        out_seq = None
                    else:
                        # eval has no answers/selections; only the passage sequence is recorded
                        out_qrels, out_answer = None, None
                        out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt'))
                    for prefix, event, data in parser:
                        pbar_postfix['key'] = prefix
                        pbar.set_postfix(pbar_postfix, refresh=False)
                        pbar.update()
                        if prefix_passages.match(prefix):
                            if event == 'end_map':
                                # End of one passage object: resolve its IDs and record it.
                                assert current_doc.text is not None and current_doc.url is not None
                                dochash = bytes(hashlib.md5(current_doc.text.encode()).digest()[:8])
                                assert dochash in dochash_lookup, "doc_id lookup failed; passage text not found in msmarco-passage"
                                pid = dochash_lookup[dochash][0]
                                urlhash = bytes(hashlib.md5(current_doc.url.encode()).digest()[:8])
                                add = False
                                # same text may appear under several URLs; each (text, url) pair gets its own url index
                                if urlhash not in dochash_lookup[dochash][1]:
                                    urlidx = len(dochash_lookup[dochash][1])
                                    dochash_lookup[dochash][1][urlhash] = urlidx
                                    add = True
                                else:
                                    urlidx = dochash_lookup[dochash][1][urlhash]
                                msm_doc_id = urlhash_lookup.get(urlhash)
                                if msm_doc_id is None:
                                    pbar_postfix['missing_urls'] += 1
                                did = f'{pid}-{urlidx}'
                                current_doc = current_doc._replace(doc_id=did, msmarco_passage_id=str(pid), msmarco_document_id=msm_doc_id)
                                if add:
                                    docs_trans.add(current_doc)
                                # passages of one query are tab-separated; a new query starts a new line
                                if out_qrels is not None:
                                    if last_psg_prefix == prefix:
                                        out_qrels.write(f'\t{did} {is_selected}')
                                    elif last_psg_prefix is None:
                                        out_qrels.write(f'{did} {is_selected}')
                                    else:
                                        out_qrels.write(f'\n{did} {is_selected}')
                                    last_psg_prefix = prefix
                                if out_seq is not None:
                                    if last_psg_prefix == prefix:
                                        out_seq.write(f'\t{did}')
                                    elif last_psg_prefix is None:
                                        out_seq.write(f'{did}')
                                    else:
                                        out_seq.write(f'\n{did}')
                                    last_psg_prefix = prefix
                                is_selected = None
                                current_doc = nil_doc
                            elif event == 'map_key':
                                # Pull the key's value directly from the parser stream.
                                key = data
                                value = next(parser)[2]
                                if key == 'is_selected':
                                    is_selected = str(value)
                                elif key == 'passage_text':
                                    current_doc = current_doc._replace(text=value)
                                elif key == 'url':
                                    current_doc = current_doc._replace(url=value)
                        elif prefix_answers.match(prefix):
                            # a little more annoying because there can be multiple answers (but there's always at least 1)
                            text = str(data).replace("\n", " ").replace("\t", " ")
                            if last_ans_prefix == prefix:
                                out_answer.write(f'\t{text}')
                            elif last_ans_prefix is None:
                                out_answer.write(text)
                            else:
                                out_answer.write(f'\n{text}')
                            last_ans_prefix = prefix
                        elif prefix_text.match(prefix):
                            text = str(data).replace("\n", " ")
                            out_text.write(f'{text}\n')
                        elif prefix_id.match(prefix):
                            text = str(data).replace("\n", " ")
                            out_id.write(f'{text}\n')
                        elif prefix_type.match(prefix):
                            text = str(data).replace("\n", " ")
                            out_type.write(f'{text}\n')
                    # terminate the last (unterminated) line of the tab-grouped files
                    if file_str != 'eval':
                        out_answer.write('\n')
                        out_qrels.write('\n')
                    else:
                        out_seq.write('\n')
        # Merge files
        for file_str in ['train', 'dev', 'eval']:
            with contextlib.ExitStack() as stack:
                f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt'))
                f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt'))
                f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt'))
                f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt'))
                f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt'))
                in_files = [f_qid, f_type, f_text]
                if file_str != 'eval':
                    f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt'))
                    f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt'))
                    f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt'))
                    in_files += [f_selections, f_answers]
                else:
                    f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt'))
                    in_files += [f_seq]
                # The column files are line-aligned (one line per query), so zip merges them.
                for columns in _logger.pbar(zip(*in_files), desc=f'merging {file_str} files', unit='doc'):
                    columns = [x.strip() for x in columns]
                    qid, typ, text = columns[:3]
                    if file_str != 'eval':
                        selections, answers = columns[3:]
                        # Remove the "no answer" placeholder
                        answers = answers.replace(NO_ANSWER_PLACEHOLDER, '')
                        if answers:
                            answers = f'\t{answers}'
                        f_queries.write(f'{qid}\t{text}\t{typ}{answers}\n')
                        for i, qrel in enumerate(selections.split('\t')):
                            did, label = qrel.split()
                            f_qrels.write(f'{qid} 0 {did} {label}\n')
                            # run file preserves the original passage order (score = -rank)
                            f_run.write(f'{qid} Q0 {did} {i} {-i} qna\n')
                    else:
                        seq, = columns[3:]
                        f_queries.write(f'{qid}\t{text}\t{typ}\n')
                        for i, did in enumerate(seq.split('\t')):
                            f_run.write(f'{qid} Q0 {did} {i} {-i} qna\n')
            # clean up temp files
            # NOTE(review): the eval '.seq' file is not removed here -- confirm whether intentional
            (self._base_path/f'{file_str}.query_id').unlink()
            (self._base_path/f'{file_str}.query_type').unlink()
            (self._base_path/f'{file_str}.query_text').unlink()
            if file_str != 'eval':
                (self._base_path/f'{file_str}.selections').unlink()
                (self._base_path/f'{file_str}.query_answer').unlink()
    def file_ref(self, path):
        """Return a lazily-built reference to a file under the dataset's base path."""
        return _ManagedDlc(self, self._base_path/path)
class _ManagedDlc:
    """File reference that triggers the manager's build before granting access.

    Used for files that only exist after MsMarcoQnAManager.build() has run.
    """
    def __init__(self, manager, path):
        self._manager = manager
        self._path = path
    @contextlib.contextmanager
    def stream(self):
        # Ensure the file has been materialized before opening it.
        self._manager.build()
        handle = open(self._path, 'rb')
        try:
            yield handle
        finally:
            handle.close()
    def path(self, force=True):
        if not force:
            return self._path
        self._manager.build()
        return self._path
def _init():
    """Wire up the msmarco-qna collection and its train/dev/eval subsets.

    Returns (collection, subsets) and registers each dataset in the global registry.
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    # v2 migration: earlier builds assigned incorrect doc_ids; invalidate derived files
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[
            base_path/'docs.pklz4',
            base_path/'train.run', base_path/'train.qrels',
            base_path/'dev.run', base_path/'dev.qrels',
            base_path/'eval.run',
        ],
        message='Migrating msmarco-qna (correcting doc_ids)')
    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('train.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('train.run'))),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('dev.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('dev.run'))),
    )
    # eval has no qrels (answers are not released for the eval split)
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
collection, subsets = _init()  # module-level registration side effect (runs on import)
================================================
FILE: ir_datasets/datasets/nano_beir.py
================================================
import ir_datasets
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import (
BaseDocs,
BaseQrels,
BaseQueries,
GenericDoc,
GenericQuery,
TrecQrel,
)
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()  # module-level logger
NAME = "nano-beir"  # registry prefix for all datasets defined in this file
def _map_field(field, data):
    """Map a NamedTuple field name to its value in a parquet record dict.

    Both doc_id and query_id come from the record's "_id" column; "text"
    maps directly. Raises ValueError for any other field name.
    """
    source_key = {"doc_id": "_id", "query_id": "_id", "text": "text"}.get(field)
    if source_key is None:
        raise ValueError(f"unknown field: {field}")
    return data[source_key]
def parquet_iter(path):
    """Yield rows of a parquet file one dict at a time.

    Reads in small record batches so the whole file is never held in memory.
    pyarrow is loaded lazily since it is an optional dependency.
    """
    pq = ir_datasets.lazy_libs.pyarrow_parquet()
    # https://stackoverflow.com/a/77150113
    with pq.ParquetFile(path) as parquet_file:
        for record_batch in parquet_file.iter_batches(batch_size=64):
            yield from record_batch.to_pylist()
class NanoBeirDocs(BaseDocs):
    """Document collection for one NanoBEIR benchmark, backed by a parquet download."""
    def __init__(self, name, dlc, doc_type):
        super().__init__()
        self._name = name  # benchmark name (e.g. "fiqa"); used in paths and namespaces
        self._dlc = dlc  # parquet file download
        self._doc_type = doc_type  # NamedTuple class to materialize rows into
    def docs_iter(self):
        # Iterate via the docstore so iteration is backed by the local pickle store.
        return iter(self.docs_store())
    def _docs_iter(self):
        # Raw iteration over the parquet source; used to seed the docstore.
        for d in parquet_iter(self._dlc.path()):
            yield self._doc_type(*(_map_field(f, d) for f in self._doc_type._fields))
    def docs_cls(self):
        return self._doc_type
    def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f"{ir_datasets.util.home_path()/NAME/self._name}/docs.pklz4",
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=["doc_id"],
            count_hint=ir_datasets.util.count_hint(f"{NAME}/{self._name}"),
            options=options
        )
    def docs_count(self):
        # Only report a count once the docstore has been built (None otherwise).
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return f"{NAME}/{self._name}"
    def docs_lang(self):
        return "en"
class NanoBeirQueries(BaseQueries):
    """Queries for one NanoBEIR benchmark, read directly from a parquet download."""
    def __init__(self, name, dlc, query_type):
        super().__init__()
        self._name = name
        self._dlc = dlc
        self._query_type = query_type
    def queries_iter(self):
        query_cls = self._query_type
        fields = query_cls._fields
        for record in parquet_iter(self._dlc.path()):
            yield query_cls(*(_map_field(field, record) for field in fields))
    def queries_cls(self):
        return self._query_type
    def queries_namespace(self):
        return f"{NAME}/{self._name}"
    def queries_lang(self):
        return "en"
class NanoBeirQrels(BaseQrels):
    """Qrels for one NanoBEIR benchmark.

    Every (query, doc) pair present in the parquet file is treated as
    relevant (relevance 1, iteration "0").
    """
    def __init__(self, qrels_dlc, qrels_defs):
        self._qrels_dlc = qrels_dlc
        self._qrels_defs = qrels_defs
    def qrels_path(self):
        return self._qrels_dlc.path()
    def qrels_iter(self):
        for record in parquet_iter(self.qrels_path()):
            yield TrecQrel(record["query-id"], record["corpus-id"], 1, "0")
    def qrels_cls(self):
        return TrecQrel
    def qrels_defs(self):
        return self._qrels_defs
def _init():
    """Register the nano-beir base dataset and one subset per benchmark.

    Returns (base, subsets); runs at import time via the module-level call below.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    base = Dataset(documentation("_"))  # top-level entry is documentation-only
    subsets = {}
    # The 13 NanoBEIR benchmarks; each has docs/queries/qrels parquet downloads.
    benchmarks = [
        "climate-fever",
        "dbpedia-entity",
        "fever",
        "fiqa",
        "hotpotqa",
        "msmarco",
        "nfcorpus",
        "nq",
        "quora",
        "scidocs",
        "arguana",
        "scifact",
        "webis-touche2020",
    ]
    for ds in benchmarks:
        docs = NanoBeirDocs(ds, dlc[f"{ds}/docs"], GenericDoc)
        queries = NanoBeirQueries(ds, dlc[f"{ds}/queries"], GenericQuery)
        qrels = NanoBeirQrels(dlc[f"{ds}/qrels"], qrels_defs={1: 'relevant'})
        subsets[ds] = Dataset(
            docs,
            queries,
            qrels,
            documentation(ds),
        )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f"{NAME}/{s}", subsets[s])
    return base, subsets
base, subsets = _init()  # module-level registration side effect (runs on import)
================================================
FILE: ir_datasets/datasets/natural_questions.py
================================================
from typing import NamedTuple, List
import json
import contextlib
import ir_datasets
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import DocstoreBackedDocs, TsvQueries, BaseQrels, BaseScoredDocs, GenericScoredDoc
from ir_datasets.indices import DocstoreOptions, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()  # module-level logger (provides pbar helpers)
NAME = 'natural-questions'  # registry prefix for all datasets defined in this file
class NqPassageDoc(NamedTuple):
    """A long-answer candidate passage from the Natural Questions corpus."""
    doc_id: str # a sequentially-assigned document ID (unique based on URL) + the index of the passage
    text: str # tokenized text of the passage, with all HTML tokens removed
    html: str # raw HTML of the passage
    start_byte: int # the following are from the `long_answer_candidates` objects and may be useful for something
    end_byte: int
    start_token: int
    end_token: int
    document_title: str # from document itself
    document_url: str # from document itself
    parent_doc_id: str # doc_id of the largest passage it's under (e.g., a sentence under a paragraph), or None if it's a top-level passage
    def default_text(self):
        """
        document_title and text
        """
        return f'{self.document_title} {self.text}'
class NqQrel(NamedTuple):
    """A relevance judgment for Natural Questions, carrying the short answers too."""
    query_id: str
    doc_id: str
    relevance: int # always 1
    short_answers: List[str] # the **string** representations of the answers (original comment said "similar to how DPH evaluates" -- presumably a typo for DPR, which matches by answer string)
    yes_no_answer: str
class NqManager:
    """Builds the natural-questions docstore and per-split query/qrel/scoreddoc files.

    The raw NQ train/dev files (gzipped JSONL) are streamed once; each document's
    long-answer candidates become passage docs, and annotations become qrels.
    """
    def __init__(self, dlcs, base_path):
        self._dlcs = dlcs  # DownloadConfig context covering all NQ source files
        self._docs_store = None  # lazily constructed in _internal_docs_store
        self._base_path = base_path
    def docs_store(self, options: DocstoreOptions = DEFAULT_DOCSTORE_OPTIONS):
        """Return the docstore, building all derived files first if needed."""
        self.build()
        return self._internal_docs_store(options)
    def _internal_docs_store(self, options: DocstoreOptions = DEFAULT_DOCSTORE_OPTIONS):
        # FIX: `options` was previously accepted but silently ignored; it is now
        # forwarded to PickleLz4FullStore (which supports it -- see NanoBeirDocs).
        # Note the store is cached, so only the first call's options take effect.
        if self._docs_store is None:
            self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, NqPassageDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME), options=options)
        return self._docs_store
    def build(self):
        """Process the raw NQ files into the docstore and split files (idempotent)."""
        docs_store = self._internal_docs_store()
        if docs_store.built():
            return # already built
        pbar_postfix = {'file': None}
        doc_url_to_id = {}  # document_url -> sequentially-assigned document ID
        with contextlib.ExitStack() as stack:
            docs_trans = stack.enter_context(docs_store.lookup.transaction())
            pbar = stack.enter_context(_logger.pbar_raw(desc='processing nq', postfix=pbar_postfix, unit='question'))
            train_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.queries.tsv', 'wt'))
            train_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.qrels.jsonl', 'wt'))
            train_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.scoreddocs.tsv', 'wt'))
            dev_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.queries.tsv', 'wt'))
            dev_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.qrels.jsonl', 'wt'))
            dev_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.scoreddocs.tsv', 'wt'))
            for file_name in sorted(self._dlcs.contents().keys()):
                pbar_postfix['file'] = file_name
                pbar.set_postfix(pbar_postfix)
                if 'train' in file_name:
                    f_queries, f_qrels, f_scoreddocs = train_queries, train_qrels, train_scoreddocs
                elif 'dev' in file_name:
                    f_queries, f_qrels, f_scoreddocs = dev_queries, dev_qrels, dev_scoreddocs
                else:
                    # FIX: previously a file matching neither split would fall through and
                    # reuse (or fail on) the handles from the previous iteration.
                    continue
                with ir_datasets.util.GzipExtract(self._dlcs[file_name]).stream() as stream:
                    for line in stream:
                        data = json.loads(line)
                        qid = str(data['example_id'])
                        # docs: emit this document's passages only the first time its URL is seen
                        if data['document_url'] not in doc_url_to_id:
                            did = str(len(doc_url_to_id))
                            doc_url_to_id[data['document_url']] = did
                            last_end_idx, last_did = -1, None
                            for idx, cand in enumerate(data['long_answer_candidates']):
                                text = ' '.join(t['token'] for t in data['document_tokens'][cand['start_token']:cand['end_token']] if not t['html_token'])
                                html = ' '.join(t['token'] for t in data['document_tokens'][cand['start_token']:cand['end_token']])
                                # a candidate starting inside the previous top-level candidate is nested under it
                                parent_doc_id = last_did if cand['start_token'] < last_end_idx else None
                                doc = NqPassageDoc(
                                    f'{did}-{idx}',
                                    text,
                                    html,
                                    cand['start_byte'],
                                    cand['end_byte'],
                                    cand['start_token'],
                                    cand['end_token'],
                                    data['document_title'],
                                    data['document_url'],
                                    parent_doc_id,
                                )
                                docs_trans.add(doc)
                                if parent_doc_id is None:
                                    last_end_idx, last_did = cand['end_token'], doc.doc_id
                        else:
                            did = doc_url_to_id[data['document_url']]
                        # queries
                        f_queries.write('{}\t{}\n'.format(qid, data['question_text'].replace('\t', ' ')))
                        # qrels: one per annotated long-answer passage, merging duplicate annotations
                        qrels = {}
                        for ann in data['annotations']:
                            if ann['long_answer']['candidate_index'] == -1:
                                continue  # annotator found no long answer
                            passage_id = '{}-{}'.format(did, ann['long_answer']['candidate_index'])
                            short_answers = [' '.join(t['token'] for t in data['document_tokens'][s['start_token']:s['end_token']] if not t['html_token']) for s in ann['short_answers']]
                            if passage_id in qrels:
                                qrel = qrels[passage_id]
                                # BUG FIX: this previously filtered short_answers against itself
                                # (`s not in short_answers`), which always yields [], so short
                                # answers from repeated annotations were silently dropped.
                                # De-duplicate against the answers already recorded instead.
                                short_answers = [s for s in short_answers if s not in qrel.short_answers]
                                qrel.short_answers.extend(short_answers)
                            else:
                                qrel = NqQrel(
                                    qid,
                                    passage_id,
                                    1,
                                    short_answers,
                                    ann['yes_no_answer'],
                                )
                                qrels[passage_id] = qrel
                        for qrel in qrels.values():
                            json.dump(qrel._asdict(), f_qrels)
                            f_qrels.write('\n')
                        # scoreddocs: record the candidate count; expanded to doc_ids on read
                        count = len(data['long_answer_candidates'])
                        f_scoreddocs.write(f'{qid}\t{did}\t{count}\n')
                        pbar.update(1)
    def file_ref(self, path):
        """Return a lazily-built reference to a file under the dataset's base path."""
        return _ManagedDlc(self, self._base_path/path)
class _ManagedDlc:
    """Reference to a file produced by NqManager.build(); builds on first access."""
    def __init__(self, manager, path):
        self._manager = manager
        self._path = path
    @contextlib.contextmanager
    def stream(self):
        # The file only exists after the manager has built the dataset.
        self._manager.build()
        fileobj = open(self._path, 'rb')
        try:
            yield fileobj
        finally:
            fileobj.close()
    def path(self, force=True):
        if force:
            self._manager.build()
        return self._path
class NqQrels(BaseQrels):
    """Qrels stored as JSONL, one serialized NqQrel per line."""
    def __init__(self, dlc):
        super().__init__()
        self.dlc = dlc
    def qrels_iter(self):
        with self.dlc.stream() as stream:
            for raw_line in stream:
                yield NqQrel(**json.loads(raw_line))
    def qrels_cls(self):
        return NqQrel
    def qrels_defs(self):
        return {1: 'passage marked by annotator as a "long" answer to the question'}
class NqScoredDocs(BaseScoredDocs):
    """Scored docs expanded from compact "<qid>\\t<did>\\t<count>" records.

    Each record expands to one scored doc per long-answer candidate of the
    query's document ("<did>-0" .. "<did>-<count-1>"), all with score 0.
    """
    def __init__(self, dlc):
        super().__init__()
        self.dlc = dlc
    def scoreddocs_iter(self):
        with self.dlc.stream() as stream:
            for raw_line in stream:
                qid, did, count = raw_line.decode().strip().split('\t')
                for idx in range(int(count)):
                    yield GenericScoredDoc(qid, f'{did}-{idx}', 0.)
    def scoreddocs_cls(self):
        return GenericScoredDoc
def _init():
    """Wire up the natural-questions collection and its train/dev subsets.

    Returns (base, subsets); runs at import time via the module-level call below.
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    manager = NqManager(dlc, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # Docs are served from the manager-built docstore (built lazily on first access).
    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=NqPassageDoc, namespace=NAME, lang='en')
    base = Dataset(
        collection,
        documentation('_'))
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), namespace=NAME, lang='en'),
        NqQrels(manager.file_ref('train.qrels.jsonl')),
        NqScoredDocs(manager.file_ref('train.scoreddocs.tsv')),
        documentation('train'),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), namespace=NAME, lang='en'),
        NqQrels(manager.file_ref('dev.qrels.jsonl')),
        NqScoredDocs(manager.file_ref('dev.scoreddocs.tsv')),
        documentation('dev'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()  # module-level registration side effect (runs on import)
================================================
FILE: ir_datasets/datasets/neuclir.py
================================================
import gzip
import json
from functools import lru_cache
import ir_datasets
from ir_datasets.util import DownloadConfig, Lazy
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.formats.trec import TrecQrels
from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries, ExctractedCCNoReportQuery, ExctractedCCNoReportNoHtNarQuery, ExctractedCCMultiMtQuery
from ir_datasets.datasets.hc4 import NAME as HC4_NAME
from ir_datasets.util.fileio import GzipExtract, TarExtract
NAME = 'neuclir'  # registry prefix for all datasets defined in this file
# Known document counts for NeuCLIR Collection 1, per language subset.
DOC_COUNTS = {
    'zh': 3179209,
    'fa': 2232016,
    'ru': 4627543
}
@lru_cache(maxsize=3) # three languages
def get_ids(dlcs):
    """Return the set of doc ids listed in the given gzipped-JSONL download(s).

    `dlcs` may be a single download or a list/tuple of downloads (it must be
    hashable for lru_cache, hence tuples for multi-file id lists).
    """
    sources = dlcs if isinstance(dlcs, (list, tuple)) else [dlcs]
    doc_ids = set()
    for source in sources:
        with GzipExtract(source).stream() as f:
            doc_ids.update(json.loads(line)['id'] for line in f)
    return doc_ids
class FilteredExctractedCCDocs(ExctractedCCDocs):
    """An ExctractedCC collection restricted to an allow-list of doc ids."""
    def __init__(self, docs_dlc, subset_lang, include_doc_id_dlc, filter_name=None, namespace=None, count=None):
        super().__init__(docs_dlc, subset_lang, namespace, count)
        self._filter_name = filter_name or "filtered"
        self._include_doc_id_dlc = include_doc_id_dlc
    def _doc_store_path(self):
        # Give each filtered subset its own docstore so filters don't collide.
        return self.docs_path(force=False) + f".{self._filter_name}"
    def _internal_docs_iter(self):
        allowed_ids = get_ids(self._include_doc_id_dlc)
        for doc in super()._internal_docs_iter():
            if doc.doc_id in allowed_ids:
                yield doc
class FilteredTrecQrels(TrecQrels):
    """TREC qrels restricted to an allow-list of doc ids."""
    def __init__(self, qrels_dlc, qrels_defs, include_doc_id_dlc, format_3col=False):
        super().__init__(qrels_dlc, qrels_defs, format_3col)
        self._include_doc_id_dlc = include_doc_id_dlc
    def qrels_iter(self):
        allowed_ids = get_ids(self._include_doc_id_dlc)
        for qrel in super().qrels_iter():
            if qrel.doc_id in allowed_ids:
                yield qrel
class LangFilteredTrecQrels(TrecQrels):
    """TREC qrels restricted to one language.

    The qrels file covers all languages; the iteration column carries the
    language tag, which is matched against the requested language here.
    """
    def __init__(self, qrels_dlc, qrels_defs, lang, format_3col=False):
        super().__init__(qrels_dlc, qrels_defs, format_3col)
        self._lang = lang
    def qrels_iter(self):
        all_qrels = super().qrels_iter()
        yield from (qrel for qrel in all_qrels if qrel.iteration == self._lang)
# Graded relevance definitions shared by the neuclir qrels (note: grade 2 is
# not defined in this scale).
QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}
def _init():
    """Build and register all neuclir datasets: per-language Collection 1
    subsets (trec-2022, trec-2023, hc4-filtered) and the combined
    multi-language collection. Returns (base, subsets)."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    # The hc4-filtered subsets re-use downloads from the hc4 dataset.
    hc4_dlc = DownloadConfig.context(HC4_NAME, ir_datasets.util.home_path()/HC4_NAME)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_')) # dummy top level ds
    subsets["1"] = Dataset(documentation('1')) # dummy year level ds
    qrels2022 = dlc['trec-2022/qrels']
    # For NeuCLIR Collection 1
    for lang in ['zh', 'fa', 'ru']:
        lang3 = {'fa': 'fas', 'zh': 'zho', 'ru': 'rus'}[lang]
        lang_docs = ExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
        subsets[f"1/{lang}"] = Dataset(
            lang_docs,
            documentation(f"1/{lang}")
        )
        # The 2022 qrels file covers all languages; keep only this language's
        # judgments (selected via the qrel's iteration column).
        qrels = LangFilteredTrecQrels(qrels2022, QREL_DEFS, lang3)
        subsets[f"1/{lang}/trec-2022"] = Dataset(
            lang_docs,
            # Only expose queries that actually have judgments.
            FilteredQueries(ExctractedCCQueries(dlc['trec-2022/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
            qrels,
            documentation(f"1/{lang}/trec-2022"),
        )
        subsets[f"1/{lang}/trec-2023"] = Dataset(
            lang_docs,
            # NOTE(review): the 2023 queries are filtered by the *2022* qrels'
            # query IDs (`qrels` above, built from qrels2022) — confirm this
            # is intentional and that the ID spaces overlap as expected.
            FilteredQueries(ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportNoHtNarQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
            TrecQrels(TarExtract(dlc['trec-2023/qrels'], f'qrels.final.gains.{lang3}'), QREL_DEFS),
            documentation(f"1/{lang}/trec-2023"),
        )
        # HC4 doc-ID lists: ru is split across 8 files; zh/fa are single files.
        include_doc_id_dlc = hc4_dlc[f'{lang}/docs/ids'] if lang != 'ru' else tuple([ hc4_dlc[f'{lang}/docs/ids/{i}'] for i in range(8) ])
        subsets[f"1/{lang}/hc4-filtered"] = Dataset(
            FilteredExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, include_doc_id_dlc=include_doc_id_dlc),
            ExctractedCCQueries([hc4_dlc['dev/topics'], hc4_dlc['test/topics']], subset_lang=lang, namespace=NAME),
            FilteredTrecQrels([ hc4_dlc[f'{lang}/dev/qrels'], hc4_dlc[f'{lang}/test/qrels'] ], QREL_DEFS, include_doc_id_dlc=include_doc_id_dlc),
            documentation(f"1/{lang}/hc4-filtered")
        )
    # Combined collection over all three languages, with its own doc store.
    multi_docs = ExctractedCCDocs([GzipExtract(dlc[f'1/{lang}/docs']) for lang in ['zh', 'fa', 'ru']], namespace=NAME, count=sum(DOC_COUNTS.values()), docstore_path=base_path/'1'/'multi')
    subsets['1/multi'] = Dataset(
        multi_docs,
        documentation("1/multi")
    )
    subsets['1/multi/trec-2023'] = Dataset(
        multi_docs,
        ExctractedCCQueries(dlc['trec-2023/queries'], filter_lwq=False, cls=ExctractedCCMultiMtQuery, namespace=NAME),
        TrecQrels(TarExtract(dlc['trec-2023/qrels'], 'qrels.final.gains'), QREL_DEFS),
        documentation("1/multi/trec-2023")
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _lazy_qids_set(qrels):
    """Return a Lazy that, when evaluated, yields the set of query_ids
    appearing in the given qrels handler (used to filter queries down to
    those with judgments)."""
    def _build():
        return {qrel.query_id for qrel in qrels.qrels_iter()}
    return Lazy(_build)
# Build and register all neuclir datasets at import time.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/neumarco.py
================================================
import io
import codecs
import re
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, Cache
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.datasets import msmarco_passage
from ir_datasets.formats import TsvDocs
NAME = 'neumarco'

def _init():
    """Build and register the neumarco datasets: machine-translated MS MARCO
    passage collections (fa/zh/ru) paired with the queries, qrels, and
    docpairs handlers borrowed from the corresponding msmarco-passage
    subsets. Returns (collection, subsets)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    # Only the documents are translated; every other handler is re-used
    # directly from the registered msmarco-passage subsets.
    subsets_from_msmarco = {
        'train': [
            ir_datasets.registry['msmarco-passage/train'].queries_handler(),
            ir_datasets.registry['msmarco-passage/train'].qrels_handler(),
            ir_datasets.registry['msmarco-passage/train'].docpairs_handler(),
        ],
        'train/judged': [
            ir_datasets.registry['msmarco-passage/train/judged'].queries_handler(),
            ir_datasets.registry['msmarco-passage/train/judged'].qrels_handler(),
            ir_datasets.registry['msmarco-passage/train/judged'].docpairs_handler(),
        ],
        'dev': [
            ir_datasets.registry['msmarco-passage/dev'].queries_handler(),
            ir_datasets.registry['msmarco-passage/dev'].qrels_handler(),
        ],
        'dev/small': [
            ir_datasets.registry['msmarco-passage/dev/small'].queries_handler(),
            ir_datasets.registry['msmarco-passage/dev/small'].qrels_handler(),
        ],
        'dev/judged': [
            ir_datasets.registry['msmarco-passage/dev/judged'].queries_handler(),
            ir_datasets.registry['msmarco-passage/dev/judged'].qrels_handler(),
        ]
    }
    base_dlc = dlc['main']
    # All three translations ship in one archive; extract each language's TSV
    # and cache it locally.
    for lang3, lang2 in [('fas', 'fa'), ('zho', 'zh'), ('rus', 'ru')]:
        corpus_dlc = Cache(TarExtract(base_dlc, f'eng-{lang3}/msmarco.collection.20210731-scale21-sockeye2-tm1.tsv'), base_path/f'{lang2}.tsv')
        collection = TsvDocs(corpus_dlc, namespace=f'{NAME}/{lang2}', lang=lang2, count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang2}'))
        subsets[f'{lang2}'] = Dataset(collection, documentation(f'{lang2}'))
        for s, items in subsets_from_msmarco.items():
            subsets[f'{lang2}/{s}'] = Dataset(
                collection,
                *items,
                documentation(f'{lang2}/{s}'))
    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    # NOTE(review): `collection` is whichever language the loop ended on (ru),
    # so the module-level `collection` below is only that one language —
    # confirm nothing depends on it being anything else.
    return collection, subsets

collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/nfcorpus.py
================================================
import io
import codecs
import re
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TrecScoredDocs, TsvDocPairs, BaseQueries
NAME = 'nfcorpus'

_logger = ir_datasets.log.easy()

# Relevance grade definitions from the NFCorpus distribution
# (the *.3-2-1.qrel files used below).
QRELS_DEFS = {
    2: "A direct link from the query to the document the cited sources section of a page.",
    1: "A link exists from the query to another query that directly links to the document.",
    0: "Marginally relevant, based on topic containment.",
}
class NfCorpusDoc(NamedTuple):
    """A document from the NFCorpus collection."""
    doc_id: str
    url: str
    title: str
    abstract: str

    def default_text(self):
        """
        title and abstract
        """
        return ' '.join((self.title, self.abstract))
class NfCorpusQuery(NamedTuple):
    """An NFCorpus query: the short title plus the concatenated 'all' text.

    (Field name `all` mirrors the source data's *.all.queries files.)
    """
    query_id: str
    title: str
    all: str

    def default_text(self):
        """
        title
        """
        return self.title
class NfCorpusVideoQuery(NamedTuple):
    """An NFCorpus video query: a title plus a longer description."""
    query_id: str
    title: str
    desc: str

    def default_text(self):
        """
        title
        """
        return self.title
class ZipQueries(BaseQueries):
    """Merges several aligned query sources field-wise into one query type.

    `idxs` is a sequence of (source_index, field_index) pairs: output field k
    is taken from field `field_index` of the record yielded by source
    `source_index`. All sources must yield the same query_ids in the same
    order.
    """
    def __init__(self, queries, idxs, qtype):
        self._queries = queries
        self._idxs = idxs
        self._qtype = qtype

    def queries_iter(self):
        streams = [src.queries_iter() for src in self._queries]
        for records in zip(*streams):
            # Aligned sources must agree on the query ID at every position.
            assert len({rec.query_id for rec in records}) == 1
            fields = tuple(records[src][fld] for src, fld in self._idxs)
            yield self._qtype(*fields)

    def queries_cls(self):
        return self._qtype

    def queries_path(self):
        return self._queries[0].queries_path()

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return self._queries[0].queries_lang()
def _init():
    """Build and register all nfcorpus datasets: train/dev/test splits, each
    with nontopic and video query variants. Returns (collection, subsets)."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']
    collection = TsvDocs(Cache(TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'), base_path/'collection.tsv'), doc_cls=NfCorpusDoc, namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    subsets = {}
    def read_lines(file):
        # Extract an ID-list file from the main archive, cache it, and return
        # its lines (one ID per line) as a set.
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path/file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    # Lazy query-ID sets used to slice the qrels into the nontopic/video
    # variants below (only read when those subsets are iterated).
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))
    subsets['train'] = Dataset(
        collection,
        # Combine the "titles" and "all" query files field-wise:
        # query_id and title from file 0, the "all" text from file 1.
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.titles.queries'), base_path/'train/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.all.queries'), base_path/'train/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'), base_path/'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )
    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path/'train/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        # Re-use the train qrels, restricted to the nontopic query IDs.
        FilteredQrels(subsets['train'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('train/nontopic'),
    )
    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'), base_path/'train/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'), base_path/'train/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['train'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('train/video'),
    )
    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'), base_path/'dev/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'), base_path/'dev/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'), base_path/'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )
    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path/'dev/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['dev'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('dev/nontopic'),
    )
    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'), base_path/'dev/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'), base_path/'dev/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['dev'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('dev/video'),
    )
    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.titles.queries'), base_path/'test/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'), base_path/'test/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'), base_path/'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )
    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path/'test/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['test'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('test/nontopic'),
    )
    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'), base_path/'test/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'), base_path/'test/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['test'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('test/video'),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return collection, subsets

# Build and register all nfcorpus datasets at import time.
collection, subsets = _init()
================================================
FILE: ir_datasets/datasets/nyt.py
================================================
import io
import tarfile
from typing import NamedTuple
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from ir_datasets.util import Lazy, DownloadConfig, Migrator
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels, GenericQuery, GenericQrel, TrecQueries, TrecQrels
NAME = 'nyt'

# Qrels for the weak-supervision (wksup) subsets below: each headline-query
# is relevant to its own article body.
QREL_DEFS = {
    1: 'title is associated with article body',
}

# Graded relevance levels used by the TREC Core 2017 qrels.
CORE_QREL_DEFS = {
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
VALID_IDS = {'1206388', '46335', '1223589', '1642970', '144845', '420493', '1186325', '564166', '1092844', '1232733', '243508', '946470', '1147459', '84957', '87385', '1298633', '1327402', '1482333', '1069716', '1575477', '1110091', '655579', '1562062', '541298', '1571257', '639395', '1341710', '663400', '1174700', '1406944', '1368755', '1315376', '1609162', '1746895', '1447812', '193348', '882027', '213652', '126658', '799474', '1677212', '1254313', '43743', '250901', '426439', '1803638', '1111630', '1220244', '1142672', '944176', '860862', '342011', '1556809', '1574691', '292048', '855559', '1473717', '157893', '252570', '305646', '198014', '1444467', '1842149', '161276', '455333', '146910', '1414339', '1413851', '1352725', '509114', '563685', '1738087', '1115555', '639541', '427073', '1435887', '862324', '476212', '870108', '315852', '144389', '684154', '845724', '117999', '35935', '716125', '1818546', '551762', '687923', '1817616', '135841', '618338', '1597113', '1549790', '1292666', '147051', '1778945', '1347630', '1337511', '299371', '1384273', '388274', '938995', '263847', '195638', '303927', '646946', '1620311', '1455534', '325463', '1380230', '1038853', '1040633', '1831119', '363686', '260491', '1611855', '147526', '542544', '581106', '1766627', '899656', '236785', '1408409', '300748', '742732', '986023', '1662861', '1083296', '152722', '1458233', '1203328', '1810235', '996231', '1226680', '427277', '517560', '1230947', '185677', '1524891', '492603', '1023515', '334223', '1219069', '1021319', '152336', '1227959', '1501876', '765819', '395940', '524179', '1494335', '66871', '105130', '1660760', '744794', '1616161', '876120', '714837', '35529', '42617', '198139', '1811671', '147293', '1041065', '841417', '1346509', '200467', '850536', '1235945', '184078', '1269259', '1314141', '1368414', '387436', '896464', '84650', '375608', '423014', '1201696', '883245', '137547', '1376881', '1207160', '280170', '968570', '1438840', '626732', '1085071', '632127', 
'1206647', '399973', '1316303', '1187122', '805546', '1727291', '570037', '1178896', '555992', '977573', '1340396', '632958', '63542', '1280664', '977205', '1567169', '783676', '814977', '1668678', '1735184', '1074278', '1652858', '1108702', '955404', '1784962', '1185130', '250831', '818408', '623624', '134405', '104342', '965709', '956076', '1260229', '27255', '1500603', '1127679', '1722973', '1734641', '309555', '1681934', '695555', '48767', '433808', '995051', '180797', '123367', '378006', '1216681', '324683', '1711346', '211935', '1801492', '103678', '446767', '594334', '860460', '660793', '1393998', '266826', '876460', '994066', '1282229', '1587147', '815344', '1103826', '343997', '1200405', '179480', '742314', '1780439', '1066709', '1330760', '1368900', '1549318', '1110897', '619788', '188464', '173770', '34154', '578909', '645650', '1157537', '62836', '700552', '1388063', '408649', '848686', '1694615', '1617883', '1765655', '1466678', '155464', '1445513', '1303273', '231804', '581627', '742052', '1212886', '1405769', '481040', '1855639', '54259', '111905', '1313586', '387001', '1185491', '1670617', '906527', '69825', '499522', '1819890', '164762', '970999', '1179216', '993221', '372699', '296270', '1185999', '792835', '1037962', '1740374', '1624046', '954664', '368818', '1087747', '1026355', '812422', '1544110', '1226870', '155570', '1190376', '869921', '296349', '595907', '614301', '1241703', '442373', '995807', '1369864', '1709789', '114305', '184927', '1120202', '584073', '828184', '1473187', '1521230', '440704', '1013610', '1830313', '721770', '1658974', '313921', '692325', '368461', '985252', '290240', '1251117', '1538562', '422046', '1630032', '1181653', '125066', '1837263', '1656997', '441', '490006', '1643057', '165954', '69049', '1199388', '1507218', '1329673', '509136', '1466695', '16687', '508419', '268880', '969961', '340902', '253378', '256155', '863620', '1683671', '1560798', '675553', '1748098', '458865', '1665924', '1055150', '66385', 
'215071', '13148', '986080', '236365', '517825', '873311', '441741', '720189', '572737', '1225926', '624119', '997868', '515426', '691257', '419206', '1130476', '100471', '6461', '1807548', '1544601', '407787', '380030', '1152266', '1065150', '694778', '811554', '1854529', '444117', '1099590', '922315', '1217477', '1779802', '369061', '775743', '72992', '144419', '552889', '1181556', '1292830', '1778514', '1489202', '914269', '1706337', '1196929', '184181', '314027', '1227737', '559948', '784834', '1704396', '1256508', '1508836', '317087', '96486', '747998', '1632274', '950708', '1649807', '446890', '593993', '814566', '1292672', '560408', '1077779', '978883', '393982', '844217', '398230', '183055', '53060', '1210135', '916178', '1532407', '1139738', '1518821', '728959', '1304148', '491724', '1568275', '712403', '1728481', '660217', '821176', '1222683', '1778005', '1195123', '1817074', '974513', '426701', '1111638', '1240027', '1664639', '1464379', '521007', '1199739', '578456', '1439699', '284928', '494919', '491912', '232568', '923474', '99386', '1643092', '1790124', '1061993', '621986', '1122877', '100662', '1473138', '1030173', '71586', '1096287', '1138157', '262640', '602945', '1300130', '1338721', '1270177', '39801', '1692635', '56624', '211659', '1646283', '324374', '255385', '1255526', '1786203', '1406143', '1788514', '289251', '672936', '452286', '137862', '185683', '1430', '1380422', '845912', '775802', '647375', '145796', '355527', '146542', '1410218', '345442', '190717', '371036', '1797336', '120994', '1718571', '1054043', '4558', '428059', '1396897', '1201117', '1158485', '1089656', '519981', '43015', '520964', '1494349', '1094063', '1392684', '978574', '1052143', '1118795', '1687088', '1314160', '162771', '911024', '1820168', '1192318', '91766', '143489', '1004985', '518421', '166275', '370104', '974150', '546915', '1323563', '1798085', '938123', '182313', '1364401', '9506', '557187', '112370', '611777', '1159485', '1403348', '683930', '797900', 
'1383582', '114608', '350383', '1604331', '568871', '1047323', '394651', '165898', '283949', '810556', '105425', '1013875', '1464119', '1312394', '1695169', '58536', '1169598', '1125874', '1665958', '769476', '594319', '683707', '882361', '1302321', '450679', '254550', '1033539', '1301128', '1320428', '41154', '1657029', '1227578', '171871', '1792745', '288902', '453868', '271254', '409591', '143722', '535764', '1830350', '578047', '230266', '111402', '773754', '1245031', '1350576', '1624207', '1807992', '1015799', '1794740', '511024', '789525', '319777', '1132669', '1327710', '1272568', '1390168', '1533260', '617767', '638910', '496086', '1205039', '1626665', '191596', '1810513', '1556267', '1100153', '207238', '1501543', '834402', '279588', '568816', '1632682', '822260', '343317', '430137', '1768788', '545282', '279954', '165473', '828347', '1470816', '1327112', '1529515', '1016007', '270386', '1702078', '286404', '1088273', '1322387', '1643857', '489043', '380855', '1083556', '1619528', '583350', '132853', '546862', '1253587', '535138', '264437', '943235', '1620828', '1006607', '553760', '828792', '1624460', '1434951', '833541', '212690', '200229', '1064862', '220330', '1579543', '363926', '1258350', '1184051', '720391', '1459592', '457690', '38548', '81369', '1679222', '390074', '286007', '378270', '816642', '283001', '372084', '411601', '910971', '1590440', '135775', '1112005', '75424', '213834', '689492', '1005355', '1139329', '808335', '720425', '1267233', '263546', '1222854', '258056', '837513', '940506', '1103175', '1378900', '1385626', '237112', '730612', '301649', '273771', '497029', '736059', '1193481', '797044', '1144902', '1030001', '719277', '1119289', '1337197', '942773', '982474', '584235', '1707268', '1754255', '1104478', '1534921', '128481', '470969', '347013', '509587', '408644', '772685', '1733430', '1317735', '848134', '404829', '267884', '953680', '1303696', '884333', '968388', '1201708', '1112434', '303328', '1304264', '1133757', '1724836', 
'1334405', '1829066', '925761', '946016', '552534', '943383', '1100246', '1846843', '1088146', '544438', '1753939', '74810', '1807078', '100915', '1236323', '803592', '429972', '393687', '1378937', '456043', '1613185', '613184', '417913', '1563559', '1339387', '1502489', '656071', '365604', '1151482', '1259752', '277596', '673808', '161493', '873580', '832327', '260612', '924572', '1064547', '1125330', '1641045', '1151695', '256879', '394244', '556588', '1305678', '1263185', '136826', '1399892', '557148', '1358190', '1776190', '249236', '1492533', '1303288', '521017', '1066272', '541133', '1623539', '137859', '687241', '237814', '1369332', '371264', '24081', '1552898', '1502059', '1047404', '1023221', '177279', '1267817', '1411135', '191656', '980600', '951516', '499404', '1695509', '811244', '238763', '1284303', '585143', '1033260', '942257', '1349353', '1429932', '140492', '1044892', '418808', '698145', '1796223', '59227', '194957', '269275', '730734', '1145222', '253742', '581098', '45351', '66070', '426605', '1050966', '529688', '1801056', '1718077', '1266182', '129555', '1531233', '74473', '302447', '215843', '792070', '1104761', '1573381', '202553', '60314', '1503921', '280964', '711987', '136821', '832921', '1419515', '1662966', '1819530', '716942', '219736', '436016', '1735969', '713752', '60858', '121707', '689812', '193395', '1624062', '1330056', '563645', '1492653', '1449544', '376209', '1750188', '1478352', '410699', '777880', '1029514', '108914', '720269', '1448513', '74549', '972109', '215002', '404357', '1647764', '550693', '1255375', '1293865', '1264570', '896848', '789563', '826347', '903589', '1018558', '277290', '1683375', '1496790', '1112399', '860557', '127350', '1015623', '312660', '233953', '1565217', '1639977', '1607902', '397905', '490534', '1513419', '174443', '1215224', '66269', '275494', '209655', '516500', '1675849', '836893', '947869', '789401', '1553981', '155710', '496679', '821652', '1139493', '286234', '128146', '1207153', 
'1199733', '1778364', '1704065', '326315', '317132', '1824346', '319345', '1219375', '99297', '1850878', '755324', '1737932', '1556261', '1389561', '128767', '24850', '1105008', '1046487', '390245', '899371', '623036', '1190883', '1218126', '334762', '1496567', '1228970', '540795', '689403', '1465965', '1585171', '734591', '1257610', '685476', '784313', '1178416', '1468942', '883627', '1000719', '952670', '51709', '933442'}
class NytDoc(NamedTuple):
    """A New York Times article, including its raw source XML."""
    doc_id: str
    headline: str
    body: str
    source_xml: str

    def default_text(self):
        """
        headline and body
        """
        return ' '.join((self.headline, self.body))
class NytDocs(BaseDocs):
    """Documents parsed from the NYT source distribution: an outer archive of
    per-member .tar/.tgz archives, each holding one XML file per article."""
    def __init__(self, dlc):
        self._dlc = dlc
    def docs_path(self, force=True):
        return self._dlc.path(force)
    def docs_cls(self):
        return NytDoc
    def docs_iter(self):
        # Iterate via the docstore so parsing happens at most once.
        return iter(self.docs_store())
    def _docs_iter(self):
        # Stream the nested archives and parse each article's XML; used by the
        # docstore to build its index on first access.
        BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
        with self._dlc.stream() as stream:
            with tarfile.open(fileobj=stream, mode='r|gz') as tgz_outer:
                for member_o in tgz_outer:
                    # Skip non-file members and anything that isn't an inner archive.
                    if not member_o.isfile() or not (member_o.name.endswith('.tar') or member_o.name.endswith('.tgz')):
                        continue
                    file = tgz_outer.extractfile(member_o)
                    # Inner members may be gzip'd (.tgz) or plain (.tar).
                    with tarfile.open(fileobj=file, mode='r|gz' if member_o.name.endswith('.tgz') else 'r|') as tgz_inner:
                        for member_i in tgz_inner:
                            if not member_i.isfile():
                                continue
                            full_xml = tgz_inner.extractfile(member_i).read()
                            soup = BeautifulSoup(full_xml, 'lxml-xml')
                            # Missing elements yield empty-string fields rather than errors.
                            did = soup.find('doc-id')
                            did = did['id-string'] if did else ''
                            headline = soup.find('hl1') # 'headline' element can contain multiple (e.g. hl2 for online)
                            headline = headline.get_text() if headline else ''
                            full_text = soup.find('block', {'class': 'full_text'})
                            full_text = full_text.get_text().strip() if full_text else ''
                            yield NytDoc(did, headline, full_text, full_xml)
    def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
        return PickleLz4FullStore(
            path=f'{self.docs_path()}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=ir_datasets.util.count_hint(NAME),
            options=options
        )
    def docs_count(self):
        # Only known once the docstore has been built; implicitly None otherwise.
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return 'en'
class NytQueries(BaseQueries):
    """Weak-supervision queries: each article's headline, keyed by its doc_id."""
    def __init__(self, collection):
        self._collection = collection

    def queries_iter(self):
        docs = self._collection.docs_iter()
        yield from (GenericQuery(doc.doc_id, doc.headline) for doc in docs)

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return 'en'
class NytQrels(BaseQrels):
    """Weak-supervision qrels: each headline-query is relevant (grade 1) to
    its own article body only."""
    def __init__(self, collection):
        self._collection = collection

    def qrels_iter(self):
        docs = self._collection.docs_iter()
        yield from (GenericQrel(doc.doc_id, doc.doc_id, 1) for doc in docs)

    def qrels_defs(self):
        return QREL_DEFS
def _init():
    """Build and register the nyt datasets: the base collection, the TREC
    Core 2017 benchmark, and the weak-supervision (headline -> body) splits.
    Returns (base, subsets)."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # v2 added body-text extraction; invalidate doc stores built by older versions.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')
    collection = migrator(NytDocs(dlc['source']))
    base = Dataset(collection, documentation('_'))
    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))
    # wksup: every headline is a query, its own article the sole relevant doc;
    # VALID_IDS holds the IDs reserved for the validation split.
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        # NOTE(review): the 'wksup/train' documentation key is re-used for the
        # unsplit 'wksup' dataset here — confirm that is intentional.
        documentation('wksup/train'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))
    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])
    return base, subsets

# Build and register all nyt datasets at import time.
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/pmc.py
================================================
import codecs
import tarfile
import itertools
from typing import NamedTuple, Tuple
from zipfile import ZipFile
import xml.etree.ElementTree as ET
import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
from ir_datasets.formats import BaseDocs, GenericQuery, TrecQrels, TrecXmlQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
_logger = ir_datasets.log.easy()

# Graded relevance levels used by the TREC CDS qrels.
QREL_DEFS = {
    0: 'not relevant',
    1: 'possibly relevant',
    2: 'definitely relevant'
}

# Topic-file XML tag -> query field mapping (presumably consumed by
# TrecXmlQueries, imported above; usage is outside this view — confirm).
QUERY_FILE_MAP = {
    'number': 'query_id',
    'type': 'type',
    'description': 'description',
    'summary': 'summary',
    'note': 'note',
}

NAME = 'pmc'
class PmcDoc(NamedTuple):
    """An article from the PubMed Central open-access collection."""
    doc_id: str
    journal: str
    title: str
    abstract: str
    body: str

    def default_text(self):
        """
        title, abstract, and body
        """
        return ' '.join((self.title, self.abstract, self.body))
class TrecCdsQuery(NamedTuple):
    """A TREC Clinical Decision Support topic: a patient-case description
    with a clinical question type and a shorter summary."""
    query_id: str
    type: str
    description: str
    summary: str

    def default_text(self):
        """
        description
        """
        return self.description
class TrecCds2016Query(NamedTuple):
    """A TREC CDS 2016 topic; adds the admission `note` field to the
    2014/2015 topic shape."""
    query_id: str
    type: str
    note: str
    description: str
    summary: str

    def default_text(self):
        """
        description
        """
        return self.description
class PmcDocs(BaseDocs):
def __init__(self, dlcs, path, duplicate_dlcs=[], count_hint=None):
self._dlcs = dlcs
self._path = path
self._duplicate_dlcs = duplicate_dlcs
self._count_hint = count_hint
def docs_iter(self):
return iter(self.docs_store())
def _docs_iter(self):
# There's a set of known "duplicate" files, which are not considered
# for scoring. Skip them.
duplicate_file_names = set()
for dlc in self._duplicate_dlcs:
with dlc.stream() as f:
for line in codecs.getreader('utf8')(f):
for fn in line.split():
duplicate_file_names.add(fn)
for dlc in self._dlcs:
with dlc.stream() as f, tarfile.open(fileobj=f, mode=f'r|gz') as tarf:
for file in tarf:
if not file.isfile() or file.name in duplicate_file_names:
continue
xml = tarf.extractfile(file).read()
# Some files have a problem where spaces are missing between tag and attributes.
# Fix those here.
xml = xml.replace(b' Cache:
return Cache(
download_config[name],
base_path / f"{name}.{extension}"
)
def cached_zip_download(name: str, zip_path: str, extension: str) -> Cache:
    """Download archive `name`, extract `zip_path` from it, and cache the
    result under base_path as `name.extension`.

    Closure over `download_config` and `base_path` from the enclosing
    _init (whose definition falls outside this view).
    """
    return Cache(
        ZipExtract(
            download_config[name],
            zip_path
        ),
        base_path / f"{name}.{extension}"
    )
def cached_gzip_download(name: str, extension: str) -> Cache:
    """Download the gzip'd file `name`, decompress it, and cache the result
    under base_path as `name.extension` (closure over the enclosing _init's
    `download_config` and `base_path`)."""
    return Cache(
        GzipExtract(download_config[name]),
        base_path / f"{name}.{extension}"
    )
# Define and create task datasets.
task_base_datasets = {
f"argsme/2020-04-01/{NAME}-2020-task-1": Dataset(
registry["argsme/2020-04-01"].docs_handler(),
ToucheQueries(
cached_zip_download("2020/task-1/queries", "topics-task-1.xml", "xml"),
namespace=f"argsme/2020-04-01/{NAME}-2020-task-1",
language="en",
),
ToucheQrels(
cached_download("2020/task-1/qrels", "qrels"),
QRELS_DEFS_2020_TASK_1,
),
documentation("2020/task-1"),
),
f"clueweb12/{NAME}-2020-task-2": Dataset(
registry["clueweb12"].docs_handler(),
ToucheQueries(
cached_zip_download("2020/task-2/queries", "topics-task-2.xml", "xml"),
namespace=f"clueweb12/{NAME}-2020-task-2",
language="en",
),
ToucheQrels(
cached_download("2020/task-2/qrels", "qrels"),
QRELS_DEFS_2020_TASK_2,
),
documentation("2020/task-2"),
),
f"argsme/2020-04-01/{NAME}-2021-task-1": Dataset(
registry["argsme/2020-04-01"].docs_handler(),
ToucheTitleQueries(
cached_zip_download("2021/task-1/queries", "topics-task-1-only-titles.xml", "xml"),
namespace=f"argsme/2020-04-01/{NAME}-2021-task-1",
language="en",
),
ToucheQualityQrels(
cached_download("2021/task-1/qrels-relevance", "qrels"),
cached_download("2021/task-1/qrels-quality", "qrels"),
QRELS_DEFS_2021_TASK_1,
),
documentation("2021/task-1"),
),
f"clueweb12/{NAME}-2021-task-2": Dataset(
registry["clueweb12"].docs_handler(),
ToucheQueries(
cached_zip_download("2021/task-2/queries", "topics-task2-51-100.xml", "xml"),
namespace=f"clueweb12/{NAME}-2021-task-2",
language="en",
),
ToucheQualityQrels(
cached_download("2021/task-2/qrels-relevance", "qrels"),
cached_download("2021/task-2/qrels-quality", "qrels"),
QRELS_DEFS_2021_TASK_2,
),
documentation("2021/task-2"),
),
f"argsme/2020-04-01/processed/{NAME}-2022-task-1": Dataset(
registry["argsme/2020-04-01/processed"].docs_handler(),
ToucheQueries(
cached_download("2022/task-1/queries", "xml"),
namespace=f"argsme/2020-04-01-processed/{NAME}-2022-task-1",
language="en",
),
ToucheQualityCoherenceQrels(
cached_download("2022/task-1/qrels-relevance", "qrels"),
cached_download("2022/task-1/qrels-quality", "qrels"),
cached_download("2022/task-1/qrels-coherence", "qrels"),
QRELS_DEFS_2022_TASK_1,
),
documentation("2022/task-1"),
),
f"clueweb12/{NAME}-2022-task-2": Dataset(
TouchePassageDocs(
cached_gzip_download("2022/task-2/passages", "jsonl"),
namespace=f"clueweb12/{NAME}-2022-task-2",
language="en",
count_hint=868655,
),
ToucheComparativeQueries(
cached_zip_download("2022/task-2/queries", "topics-task2.xml", "xml"),
namespace=f"clueweb12/{NAME}-2022-task-2",
language="en",
),
ToucheQualityComparativeStanceQrels(
cached_download("2022/task-2/qrels-relevance", "qrels"),
cached_download("2022/task-2/qrels-quality", "qrels"),
cached_download("2022/task-2/qrels-stance", "qrels"),
QRELS_DEFS_2022_TASK_2,
),
documentation("2022/task-2"),
),
f"touche-image/2022-06-13/{NAME}-2022-task-3": Dataset(
registry["touche-image/2022-06-13"].docs_handler(),
ToucheQueries(
cached_download("2022/task-3/queries", "xml"),
namespace=f"{NAME}/{NAME}-2022-task-3",
language="en",
),
ToucheControversialStanceQrels(
cached_download("2022/task-3/qrels", "qrels"),
QRELS_DEFS_2022_TASK_3,
),
documentation("2022/task-3"),
),
}
for name, dataset in task_base_datasets.items():
registry.register(name, dataset)
# Define and create task sub-datasets.
task_sub_datasets = {
f"argsme/1.0/{NAME}-2020-task-1/uncorrected": Dataset(
registry["argsme/1.0"].docs_handler(),
registry[f"argsme/2020-04-01/{NAME}-2020-task-1"].queries_handler(),
ToucheQrels(
cached_download("2020/task-1/qrels-argsme-1.0-uncorrected", "qrels"),
QRELS_DEFS_2020_TASK_1,
allow_float_score=True,
),
documentation("2020/task-1/argsme-1.0/uncorrected"),
),
f"argsme/2020-04-01/{NAME}-2020-task-1/uncorrected": Dataset(
registry["argsme/2020-04-01"].docs_handler(),
registry[f"argsme/2020-04-01/{NAME}-2020-task-1"].queries_handler(),
ToucheQrels(
cached_download("2020/task-1/qrels-argsme-2020-04-01-uncorrected", "qrels"),
QRELS_DEFS_2020_TASK_1,
allow_float_score=True,
),
documentation("2020/task-1/argsme-2020-04-01/uncorrected"),
),
f"clueweb12/{NAME}-2022-task-2/expanded-doc-t5-query": Dataset(
TouchePassageDocs(
cached_gzip_download("2022/task-2/passages-expanded-doc-t5-query", "jsonl"),
namespace=f"clueweb12/{NAME}-2022-task-2",
language="en",
count_hint=868655
),
registry[f"clueweb12/{NAME}-2022-task-2"].queries_handler(),
registry[f"clueweb12/{NAME}-2022-task-2"].qrels_handler(),
documentation("2022/task-2/expanded-doc-t5-query"),
),
}
for name, dataset in task_sub_datasets.items():
registry.register(name, dataset)
return task_base_datasets, task_sub_datasets
_init()
================================================
FILE: ir_datasets/datasets/touche_image.py
================================================
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ToucheImageDocs
from ir_datasets.util import DownloadConfig, home_path, Cache
NAME = "touche-image"
def _init():
    """Build and register the touche-image datasets.

    Returns a ``(base, datasets)`` tuple: the documentation-only base
    dataset and a mapping of subset name -> Dataset.
    """
    root = home_path() / NAME
    docs = YamlDocumentation(f"docs/{NAME}.yaml")
    dl_config = DownloadConfig.context(NAME, root)
    base = Dataset(docs('_'))

    def _cache(key: str, ext: str) -> Cache:
        # Download `key` once and keep the result at <root>/<key>.<ext>.
        return Cache(dl_config[key], root / f"{key}.{ext}")

    image_docs = ToucheImageDocs(
        _cache("2022-06-13/images-main", "zip"),
        _cache("2022-06-13/images-nodes", "zip"),
        _cache("2022-06-13/images-png", "zip"),
        namespace=f"{NAME}/2022-06-13",
        language="en",
        count_hint=23841,
    )
    datasets = {
        "2022-06-13": Dataset(image_docs, docs("2022-06-13")),
    }
    # NOTE: the following datasets are defined in touche.py:
    #  - touche-image/2022-06-13/touche-2022-task-3
    registry.register(NAME, base)
    for subset_name, subset in datasets.items():
        registry.register(f'{NAME}/{subset_name}', subset)
    return base, datasets
dataset = _init()
================================================
FILE: ir_datasets/datasets/trec_arabic.py
================================================
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
NAME = 'trec-arabic'

# Binary relevance levels used by the TREC Arabic qrels files.
QREL_DEFS = {
    1: 'relevant',
    0: 'not relevant',
}

# Maps TREC topic tag patterns (regexes) to query field names for TrecQueries.
QTYPE_MAP = {
    ' *(Number:)? *AR': 'query_id', # Remove AR prefix from QIDs
    ' *(Topic:)?': 'title',
    ' *(Description:)?': 'description',
    ' *(Narrative:)?': 'narrative'
}
def _init():
    """Build and register the trec-arabic collection and its per-year subsets."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME,
        lang='ar',
        count_hint=ir_datasets.util.count_hint(NAME),
    )
    base = Dataset(collection, documentation('_'))
    subsets = {}
    # Both TREC CLIR years share the same corpus; only topics/qrels differ.
    for year in ('ar2001', 'ar2002'):
        subsets[year] = Dataset(
            TrecQueries(dlc[f'{year}/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME, lang='ar'),
            TrecQrels(dlc[f'{year}/qrels'], QREL_DEFS),
            collection,
            documentation(year))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/trec_cast.py
================================================
import gzip
from hashlib import md5
import os
from functools import cached_property, lru_cache, partial
from collections import defaultdict
import re
import json
import itertools
from typing import List, NamedTuple, Optional, Tuple
import ir_datasets
from ir_datasets.util import BaseDownload, DownloadConfig, Lazy
from ir_datasets.formats import (
TrecQrels,
TrecScoredDocs,
BaseDocs,
BaseQueries,
GenericDoc,
)
from ir_datasets.datasets.base import (
Dataset,
YamlDocumentation,
FilteredQueries,
FilteredScoredDocs,
)
from ir_datasets.util.docs.lazy import (
IRDSDocuments,
BaseTransformedDocs,
IterDocs,
LazyDocs,
TransformedDocs,
)
from ir_datasets.util.docs.multiple import PrefixedDocs, PrefixedDocsSpec
import numpy as np
from ir_datasets.util.docs.subset import ColonCommaDupes, DocsSubset, Dupes
from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS
# Module-level logger (used for the passage-offset progress bar below).
_logger = ir_datasets.log.easy()

NAME = "trec-cast"

# Graded relevance definitions used by the official evaluation qrels (2019+).
QRELS_DEFS = {
    4: "Fully meets. The passage is a perfect answer for the turn. It includes all of the information needed to fully answer the turn in the conversation context. It focuses only on the subject and contains little extra information.",
    3: "Highly meets. The passage answers the question and is focused on the turn. It would be a satisfactory answer if Google Assistant or Alexa returned this passage in response to the query. It may contain limited extraneous information.",
    2: "Moderately meets. The passage answers the turn, but is focused on other information that is unrelated to the question. The passage may contain the answer, but users will need extra effort to pick the correct portion. The passage may be relevant, but it may only partially answer the turn, missing a small aspect of the context.",
    1: "Slightly meets. The passage includes some information about the turn, but does not directly answer it. Users will find some useful information in the passage that may lead to the correct answer, perhaps after additional rounds of conversation (better than nothing).",
    0: "Fails to meet. The passage is not relevant to the question. The passage is unrelated to the target query.",
}

# Coarser relevance grades used by the 2019 training qrels.
QRELS_DEFS_TRAIN = {
    2: "very relevant",
    1: "relevant",
    0: "not relevant",
}
class CastPassage(NamedTuple):
    # A single passage of a segmented CAsT document.
    passage_id: str
    text: str
    marked_up_text: str
class CastDoc(NamedTuple):
    # A full document composed of one or more passages.
    doc_id: str
    title: str
    url: str
    # NOTE(review): annotated as CastPassage tuples, but the producers in this
    # module (SegmentedDocs, WapoV4Docs, KiltCastDocs, transform_msmarco_*)
    # store plain strings here, in which case `p.text` below would fail --
    # TODO confirm the intended element type.
    passages: Tuple[CastPassage, ...]
    def default_text(self):
        """
        Combines the title and text of constituent passages.
        """
        return "\n".join([self.title] + [p.text for p in self.passages])
class CastPassageDoc(NamedTuple):
    # A passage addressed as "<doc_id>-<passage_no>" (passage_no is 1-based).
    doc_id: str
    title: str
    url: str
    text: str
    def default_text(self):
        """
        Combines the title from the source document with the text of this passage.
        """
        return f"{self.title}\n{self.text}"
class Cast2019Query(NamedTuple):
    # 2019 topics: raw utterances only, with topic-level title/description.
    query_id: str
    raw_utterance: str
    topic_number: int
    turn_number: int
    topic_title: str
    topic_description: str
    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance
class Cast2020Query(NamedTuple):
    # 2020 topics add automatic/manual rewrites and a manual canonical result.
    query_id: str
    raw_utterance: str
    automatic_rewritten_utterance: str
    manual_rewritten_utterance: str
    manual_canonical_result_id: str
    topic_number: int
    turn_number: int
    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance
class Cast2021Query(NamedTuple):
    # 2021 topics: same shape as 2020 but the canonical result is not "manual".
    query_id: str
    raw_utterance: str
    automatic_rewritten_utterance: str
    manual_rewritten_utterance: str
    canonical_result_id: str
    topic_number: int
    turn_number: int
    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance
class Cast2022Query(NamedTuple):
    # 2022 topics: turns form a tree (parent_id) and carry a system response
    # with provenance passage ids.
    query_id: str
    parent_id: str
    participant: str
    raw_utterance: str
    manual_rewritten_utterance: str
    response: str
    provenance: List[str]
    topic_number: int
    turn_number: int
    def default_text(self):
        """
        raw_utterance
        """
        return self.raw_utterance
class CastPassageIter:
    """Sliceable iterator over passages of a segmented document collection.

    Wraps an iterator over documents (each carrying a ``passages`` sequence)
    and yields one CastPassageDoc per passage. ``doc_psg_offsets`` is a
    zero-arg callable returning the cumulative passage counts per document,
    used to seek to an arbitrary passage index without scanning.
    """
    def __init__(self, docstore, doc_psg_offsets, slice):
        self.next_psg_index = 0
        self.docstore = docstore
        self.doc_iter = iter(docstore)
        self.doc = None
        self.slice = slice
        if self.slice.start != 0:
            # Binary-search the cumulative offsets for the document containing
            # passage `slice.start`, then fast-forward the doc iterator there.
            start_doc_idx = (
                int(np.searchsorted(doc_psg_offsets(), self.slice.start, side="right"))
                - 1
            )
            self.doc_iter = self.doc_iter[start_doc_idx:]
            self.next_psg_index = self.slice.start - doc_psg_offsets()[start_doc_idx]
        self.doc_psg_offsets = doc_psg_offsets

    def __next__(self):
        """Return the next CastPassageDoc, honoring the current slice."""
        if self.slice.start >= self.slice.stop or self.doc is StopIteration:
            raise StopIteration
        if self.doc is None:
            self.doc = next(self.doc_iter, StopIteration)
            if self.doc is StopIteration:
                # Fix: an empty docstore previously fell through to
                # len(self.doc.passages) below, raising AttributeError
                # instead of ending iteration cleanly.
                raise StopIteration
        # Advance across documents until next_psg_index falls within the
        # current document's passage list.
        while self.next_psg_index >= len(self.doc.passages):
            self.next_psg_index -= len(self.doc.passages)
            self.doc = next(self.doc_iter, StopIteration)
            if self.doc is StopIteration:
                raise StopIteration
        # (Removed a dead assignment that fetched the raw passage and
        # immediately overwrote it.)
        result = CastPassageDoc(
            f"{self.doc.doc_id}-{self.next_psg_index+1}",
            self.doc.title,
            self.doc.url,
            self.doc.passages[self.next_psg_index],
        )
        # Consume `step` passages from both the local index and the slice.
        self.next_psg_index += self.slice.step or 1
        self.slice = slice(
            self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step
        )
        return result

    def __iter__(self):
        return self

    def __getitem__(self, key):
        if isinstance(key, slice):
            # it[start:stop:step]
            new_slice = ir_datasets.util.apply_sub_slice(self.slice, key)
            return CastPassageIter(self.docstore, self.doc_psg_offsets, new_slice)
        elif isinstance(key, int):
            # it[index]
            new_slice = ir_datasets.util.slice_idx(self.slice, key)
            new_it = CastPassageIter(self.docstore, self.doc_psg_offsets, new_slice)
            try:
                return next(new_it)
            except StopIteration as e:
                raise IndexError(e)
        raise TypeError("key must be int or slice")
class CastPassageDocstore(ir_datasets.indices.Docstore):
    """Serves passage-level lookups ("<doc_id>-<passage_no>", 1-based) on top
    of a document-level docstore."""
    def __init__(self, docs_docstore, options=DEFAULT_DOCSTORE_OPTIONS):
        super().__init__(GenericDoc, "doc_id", options=options)
        self._docs_docstore = docs_docstore

    def get_many_iter(self, doc_ids):
        # Group the requested passage indices (converted to 0-based) by
        # their parent document id so each document is fetched only once.
        by_doc = defaultdict(set)
        for passage_id in doc_ids:
            if "-" not in passage_id:
                continue
            parent_id, _, index_str = passage_id.rpartition("-")
            if index_str.isnumeric():
                by_doc[parent_id].add(int(index_str) - 1)
        for doc in self._docs_docstore.get_many_iter(by_doc.keys()):
            for index in by_doc[doc.doc_id]:
                # Silently skip out-of-range passage indices.
                if index < len(doc.passages):
                    yield CastPassageDoc(
                        f"{doc.doc_id}-{index+1}",
                        doc.title,
                        doc.url,
                        doc.passages[index],
                    )
class LazyCastPassageIter:
    # Iterates passages lazily over a CastPassageDocs collection. Sequential
    # iteration never touches the docstore; building the (expensive)
    # passage-offset index is deferred until random access via __getitem__.
    def __init__(self, docs: "CastPassageDocs"):
        self._docs = docs
        self._doc_iter = docs._docs.docs_iter()
        self._doc = None  # current source document (None before first next())
        self._passage_ix = None  # 1-based index of the passage last yielded
    def __iter__(self):
        return self
    def __next__(self):
        # Move to the next source document whenever the current one is
        # exhausted (or before the first call).
        while (self._doc is None) or (len(self._doc.passages) <= self._passage_ix):
            self._doc = next(self._doc_iter)
            self._passage_ix = 0
        self._passage_ix += 1
        return CastPassageDoc(
            f"{self._doc.doc_id}-{self._passage_ix}",
            self._doc.title,
            self._doc.url,
            self._doc.passages[self._passage_ix - 1],
        )
    def __getitem__(self, key):
        docstore = self._docs._docs.docs_store()
        @lru_cache()
        def offsets_fn():
            """Stores the number of passages for each document of the initial
            collection"""
            offsets_path = f"{str(docstore.path)}.psg_offsets.np"
            if not os.path.exists(offsets_path):
                # One pass over the docstore: offsets[i] = number of passages
                # in documents 0..i-1; offsets[-1] = total passage count.
                offsets = np.empty(docstore.count() + 1, dtype=np.uint32)
                count = 0
                for i, doc in enumerate(
                    _logger.pbar(
                        iter(docstore),
                        total=docstore.count(),
                        desc="building passage offset file",
                    )
                ):
                    offsets[i] = count
                    count += len(doc.passages)
                offsets[-1] = count
                # Persist so subsequent random accesses can memmap it.
                with ir_datasets.util.finialized_file(offsets_path, "wb") as fout:
                    fout.write(offsets.tobytes())
                return offsets
            else:
                return np.memmap(offsets_path, dtype=np.uint32, mode="r")
        passage_iter = CastPassageIter(
            docstore, offsets_fn, slice(0, self._docs._count, 1)
        )
        return passage_iter[key]
class CastPassageDocs(BaseDocs):
    """Passage-level view of a segmented CastDoc collection."""
    def __init__(self, docs, count):
        super().__init__()
        self._docs = docs  # underlying document-level collection
        self._count = count  # precomputed total number of passages
    def docs_iter(self):
        return LazyCastPassageIter(self)
    def docs_cls(self):
        return CastPassageDoc
    def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
        # Lookups delegate to the document-level docstore and slice out
        # the requested passage.
        return CastPassageDocstore(self._docs.docs_store(field, options=options), options=options)
    def docs_count(self):
        return self._count
    def docs_namespace(self):
        return NAME
    def docs_lang(self):
        return "en"
class SegmentedDocs(BaseTransformedDocs):
    """Segmented document collection based on pre-computed offsets
    segments_dl points to a compressed JSONL file where the ranges refer to the
    original document text, e.g.:
    {"id":"MARCO_00_1454834","ranges":[[[0,917]],[[918,2082]],[[2083,3220]],[[3221,3763]]],"md5":"f0577db28de265852932224525710486"}
    """
    def __init__(self, docs, segments_dl: BaseDownload, store_name: str):
        super().__init__(docs, CastDoc, store_name)
        self._segments_dl = segments_dl
    def docs_iter(self):
        # Process files
        # The offsets file is aligned line-by-line with the docs iterator:
        # JSONL line i holds the passage ranges for document i.
        with self._segments_dl.stream() as fin, gzip.open(fin) as offsets_stream:
            for doc, data_json in zip(self._docs, offsets_stream):
                data = json.loads(data_json)
                assert (
                    doc.doc_id == data["id"]
                ), f"Error in processing offsets, docids differ: expected {data['id']} (offset), got {doc.doc_id} (document)"
                body: str = doc.passages[0]
                # The md5 in the offsets file covers the extracted text with
                # a 0x00 marker per passage and 0x01 per range; it guards
                # against segmenting a mismatched corpus version.
                computer = md5()
                passages = []
                for ranges in data["ranges"]:
                    texts = []
                    computer.update(b"\x00")
                    for start, end in ranges:
                        computer.update(b"\x01")
                        text = body[start:end]
                        texts.append(text)
                        computer.update(text.encode("utf-8"))
                    passages.append(" ".join(texts))
                assert computer.digest().hex() == data["md5"]
                yield doc._replace(passages=passages)
class CastQueries(BaseQueries):
    """Reads a TREC CAsT topics JSON file and yields one query per turn.

    ``query_type`` (one of Cast2019Query..Cast2022Query) selects which fields
    are extracted from each turn; an unrecognized type yields nothing.
    """
    def __init__(self, dlc, query_type):
        super().__init__()
        self._dlc = dlc
        self._query_type = query_type

    def queries_iter(self):
        builders = {
            Cast2019Query: self._build_2019,
            Cast2020Query: self._build_2020,
            Cast2021Query: self._build_2021,
            Cast2022Query: self._build_2022,
        }
        builder = builders.get(self._query_type)
        with self._dlc.stream() as stream:
            for topic in json.load(stream):
                topic_number = topic["number"]
                for turn in topic["turn"]:
                    turn_number = turn["number"]
                    if builder is not None:
                        yield builder(topic, topic_number, turn, turn_number)

    def _build_2019(self, topic, topic_number, turn, turn_number):
        # 2019 topics carry a title/description at the topic level.
        return Cast2019Query(
            f"{topic_number}_{turn_number}",
            turn["raw_utterance"],
            topic_number,
            turn_number,
            topic["title"],
            topic.get("description", ""),
        )

    def _build_2020(self, topic, topic_number, turn, turn_number):
        return Cast2020Query(
            f"{topic_number}_{turn_number}",
            turn["raw_utterance"],
            turn["automatic_rewritten_utterance"],
            turn["manual_rewritten_utterance"],
            turn["manual_canonical_result_id"],
            topic_number,
            turn_number,
        )

    def _build_2021(self, topic, topic_number, turn, turn_number):
        return Cast2021Query(
            f"{topic_number}_{turn_number}",
            turn["raw_utterance"],
            turn["automatic_rewritten_utterance"],
            turn["manual_rewritten_utterance"],
            turn["canonical_result_id"],
            topic_number,
            turn_number,
        )

    def _build_2022(self, topic, topic_number, turn, turn_number):
        # 2022 turns may reference a parent turn within the same topic.
        parent_id = turn.get("parent")
        if parent_id:
            parent_id = f"{topic_number}_{parent_id}"
        return Cast2022Query(
            f"{topic_number}_{turn_number}",
            parent_id,
            turn["participant"],
            turn.get("utterance"),
            turn.get("manual_rewritten_utterance"),
            turn.get("response"),
            turn.get("provenance", []),
            topic_number,
            turn_number,
        )

    def queries_cls(self):
        return self._query_type

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return "en"
class WapoV4Docs(IRDSDocuments):
    """Converts raw Washington Post (v4) records into single-passage CastDocs."""
    def __init__(self, dsid: str):
        super().__init__(dsid)

    def docs_cls(self):
        return CastDoc

    def docs_iter(self):
        tag_re = re.compile("<.*?>")
        seen_ids = set()
        for record in self.docs.docs_handler().docs_wapo_raw_iter():
            raw_id = record["id"]
            if raw_id in seen_ids:
                continue  # the raw dump contains duplicate records
            seen_ids.add(raw_id)
            title = record.get("title", "No Title")
            url = record["article_url"] or ""
            # Relative URLs in the dump are rooted at washingtonpost.com.
            if url and "www.washingtonpost.com" not in url:
                url = "https://www.washingtonpost.com" + url
            pieces = []
            for item in (record.get("contents") or []):
                # Only "paragraph" content contributes to the body text.
                if item is not None and item.get("subtype") == "paragraph":
                    pieces.append(" " + item["content"])
            body = tag_re.sub("", "".join(pieces))
            body = body.replace("\n", " ").strip()
            # Documents with no paragraph text are dropped entirely.
            if body:
                yield CastDoc(str(raw_id), title, url, [body])
class KiltCastDocs(TransformedDocs):
    """Exposes the KILT Wikipedia dump as single-passage CastDocs."""
    def __init__(self, dsid: str):
        super().__init__(LazyDocs(dsid), CastDoc)

    def docs_iter(self):
        raw_iter = self._docs.docs.docs_handler().docs_kilt_raw_iter()
        for converted in map(self.transform, raw_iter):
            if converted is not None:
                yield converted

    def transform(self, doc):
        # KILT stores article text as a list of paragraph strings.
        flattened = " ".join(doc["text"]).replace("\n", " ").strip()
        return CastDoc(
            doc["wikipedia_id"],
            doc["wikipedia_title"],
            doc["history"]["url"],
            [flattened],
        )
class WapoDupes(Dupes):
    """Duplicate list in the WAPO near-duplicates format: each line holds a
    canonical id followed by a duplicate id (space separated, bytes)."""
    @cached_property
    def doc_ids(self):
        dupes = set()
        with self._base.stream() as fp:
            for raw_line in fp:
                canonical, duplicate, *_rest = raw_line.strip().split(b" ", 3)
                # A line mapping an id to itself is not a duplicate.
                if canonical == duplicate:
                    continue
                stripped = self._remove_prefix(duplicate.decode("utf-8"))
                if stripped:
                    dupes.add(stripped)
        return dupes
def transform_msmarco_v1(doc):
    """Convert an MS MARCO (v1) document into a single-passage CastDoc."""
    body = doc.body.replace("\n", " ").strip()
    return CastDoc(doc.doc_id, doc.title, doc.url, [body])
def transform_msmarco_v2(doc):
    """Convert an MS MARCO (v2) document into a single-passage CastDoc,
    dropping the "msmarco_doc_" id prefix."""
    prefix = "msmarco_doc_"
    short_id = doc.doc_id[len(prefix):]
    body = doc.body.replace("\n", " ").strip()
    return CastDoc(short_id, doc.title, doc.url, [body])
def _init():
    """Build and register every TREC CAsT dataset: the v0/v1 passage corpora
    and subsets, plus the machinery for the v2/v3 document corpora (whose
    registration is currently disabled below)."""
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    def wapo_converter(dsid, dupes: Dupes):
        # Yields one GenericDoc per WAPO paragraph, with ids "WAPO_<doc>-<n>".
        BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
        # NOTE: These rules are very specific in order to replicate the behaviour present in the official script
        # here:
        # (NOTE(review): the script URL appears to be missing from this comment.)
        # Specifically, things like skipping empty documents, filtering by "paragraph" subtype, and starting the
        # paragraph index at 1 are all needed to perfectly match the above script.
        # Note that the script does NOT strip HTML markup, which is meant to be removed out in a later stage (e.g., indexing).
        # We do that here for user simplicity, as it will allow the text to be consumed directly by various models
        # without the need for further pre-processing. (Though a bit of information is lost.)
        for wapo_doc in ir_datasets.load(dsid).docs_handler().docs_wapo_raw_iter():
            doc_id = wapo_doc["id"]
            # Ignore this one
            if dupes.has(doc_id):
                continue
            pid = itertools.count(1)  # paragraph index starts at 1
            for paragraph in wapo_doc["contents"]:
                if (
                    paragraph is not None
                    and paragraph.get("subtype") == "paragraph"
                    and paragraph["content"] != ""
                ):
                    text = paragraph["content"]
                    if paragraph.get("mime") == "text/html":
                        text = BeautifulSoup(
                            f"{text}", "lxml-xml"
                        ).get_text()
                    yield GenericDoc(f"WAPO_{doc_id}-{next(pid)}", text)
    # --- Version 0 and 1 (2019 and 2020)
    # https://github.com/daltonj/treccastweb#year-2-trec-2020
    # documents = MARCO Ranking passages (v1) and Wikipedia (TREC CAR)
    # Version 0 contains WAPO (but this is not used)
    docs_v0 = PrefixedDocs(
        f"{NAME}/docs_v0",
        PrefixedDocsSpec(
            "WAPO_",
            IterDocs(
                f"{NAME}/v1/wapo-v2",
                partial(wapo_converter,
                    "wapo/v2", ColonCommaDupes(dlc["wapo_dupes"], prefix="WAPO_")
                ),
            ),
        ),
        PrefixedDocsSpec(
            "MARCO_",
            DocsSubset(
                f"{NAME}/v1/msmarco-passages",
                LazyDocs("msmarco-passage"),
                ColonCommaDupes(dlc["marco_dupes"], prefix="MARCO_"),
            ),
        ),
        PrefixedDocsSpec("CAR_", LazyDocs("car/v2.0")),
    )
    # v1 is v0 without the WAPO sub-collection.
    docs_v1 = PrefixedDocs(
        f"{NAME}/docs_v1",
        PrefixedDocsSpec(
            "MARCO_",
            DocsSubset(
                f"{NAME}/v1/msmarco-passages",
                LazyDocs("msmarco-passage"),
                ColonCommaDupes(dlc["marco_dupes"], prefix="MARCO_"),
            ),
        ),
        PrefixedDocsSpec("CAR_", LazyDocs("car/v2.0")),
    )
    base = Dataset(documentation("_"))
    subsets["v0"] = Dataset(docs_v0)
    subsets["v0/train"] = Dataset(
        docs_v0,
        CastQueries(dlc["2019/train/queries"], Cast2019Query),
        TrecQrels(dlc["2019/train/qrels"], QRELS_DEFS_TRAIN),
        TrecScoredDocs(dlc["2019/train/scoreddocs"]),
    )
    # Lazily computed set of judged query ids (forces a qrels pass on first use).
    qids_train_v0 = Lazy(lambda: {q.query_id for q in subsets["v0/train"].qrels_iter()})
    subsets["v0/train/judged"] = Dataset(
        docs_v0,
        FilteredQueries(subsets["v0/train"].queries_handler(), qids_train_v0),
        subsets["v0/train"].qrels_handler(),
        FilteredScoredDocs(subsets["v0/train"].scoreddocs_handler(), qids_train_v0),
    )
    subsets["v1"] = Dataset(docs_v1)
    subsets["v1/2019"] = Dataset(
        docs_v1,
        CastQueries(dlc["2019/eval/queries"], Cast2019Query),
        TrecQrels(dlc["2019/eval/qrels"], QRELS_DEFS),
        TrecScoredDocs(dlc["2019/eval/scoreddocs"]),
    )
    qids_2019 = Lazy(lambda: {q.query_id for q in subsets["v1/2019"].qrels_iter()})
    subsets["v1/2019/judged"] = Dataset(
        docs_v1,
        FilteredQueries(subsets["v1/2019"].queries_handler(), qids_2019),
        subsets["v1/2019"].qrels_handler(),
        FilteredScoredDocs(subsets["v1/2019"].scoreddocs_handler(), qids_2019),
    )
    subsets["v1/2020"] = Dataset(
        docs_v1,
        CastQueries(dlc["2020/queries"], Cast2020Query),
        TrecQrels(dlc["2020/qrels"], QRELS_DEFS),
    )
    qids_2020 = Lazy(lambda: {q.query_id for q in subsets["v1/2020"].qrels_iter()})
    subsets["v1/2020/judged"] = Dataset(
        docs_v1,
        FilteredQueries(subsets["v1/2020"].queries_handler(), qids_2020),
        subsets["v1/2020"].qrels_handler(),
    )
    # --- Version 2 (2021)
    # https://github.com/daltonj/treccastweb#year-3-trec-2021
    # Documents = WAPO 2020, KILT and MS Marco v1 (documents)
    # We provide passage offsets for the three document collections
    # Duplicates are in two files:
    # wapo-near-duplicates for WAPO
    # marco_duplicates.txt for MS-MARCO
    def register_docs(namespace: str, use_docs: bool, *tuples):
        """Register all documents (sub)collections
        Tuples: (name prefix, document ID prefix, raw documents, passage count)
        """
        all_docs_spec = []
        all_passages_spec = []
        passages = []
        for dsid, prefix, raw, count in tuples:
            prefixed = PrefixedDocs(None, PrefixedDocsSpec(prefix, raw))
            subsets[f"{namespace}/{dsid}"] = Dataset(prefixed)
            # Split each document into passages using pre-computed offsets.
            segmented = SegmentedDocs(
                prefixed,
                dlc[f"{namespace}/offsets/{dsid}"],
                f"{NAME}/docs_{namespace}_{dsid}",
            )
            subsets[f"{namespace}/{dsid}/segmented"] = Dataset(segmented)
            passage = CastPassageDocs(segmented, count)
            passages.append(passage)
            subsets[f"{namespace}/{dsid}/passages"] = Dataset(passage)
            # Add this
            all_docs_spec.append(
                PrefixedDocsSpec(prefix, (raw if use_docs else passage), not use_docs)
            )
            all_passages_spec.append(
                PrefixedDocsSpec(prefix, passage, True)
            )
        # All documents together
        all_docs = PrefixedDocs(f"{NAME}/docs_{namespace}", *all_docs_spec)
        subsets[f"{namespace}"] = Dataset(all_docs)
        if use_docs:
            # Add a passage dataset
            subsets[f"{namespace}/passages"] = PrefixedDocs(f"{NAME}/passages_{namespace}", *all_passages_spec)
        return all_docs
    # NOTE(review): the v2 and v3 corpus registrations below are disabled
    # (kept inside inert string literals and comments) -- presumably pending
    # data availability; confirm before re-enabling.
    """
    docs_v2 = register_docs(
        "v2",
        True,
        (
            "msmarco",
            "MARCO_",
            TransformedDocs(
                DocsSubset(
                    f"{NAME}/v2/msmarco-documents",
                    LazyDocs("msmarco-document"),
                    ColonCommaDupes(dlc["v2/dupes/marco_v1"]),
                ),
                CastDoc,
                transform_msmarco_v1,
            ),
            19_092_817,
        ),
        (
            "wapo",
            "WAPO_",
            DocsSubset(
                f"{NAME}/v2/wapo-v4",
                WapoV4Docs("wapo/v4"),
                WapoDupes(dlc["v2/dupes/wapo"]),
            ),
            3_728_553,
        ),
        ("kilt", "KILT_", KiltCastDocs("kilt"), 17_124_025),
    )
    """
    #subsets["v2/2021"] = Dataset(
    #    docs_v2,
    #    CastQueries(dlc["2021/queries"], Cast2021Query),
    #    TrecQrels(dlc["2021/qrels"], QRELS_DEFS),
    #)
    # --- Version 3 (2022)
    # https://github.com/daltonj/treccastweb#year-4-trec-2022
    # Official documents = processed (split) WAPO 2020, KILT, MS Marco V2
    v3_dupes = dlc["v3/dupes"]
    """
    docs_v3 = register_docs(
        "v3",
        False,
        (
            "msmarco",
            "MARCO_",
            DocsSubset(
                f"{NAME}/v3/msmarco-documents-v2",
                TransformedDocs(
                    LazyDocs("msmarco-document-v2"), CastDoc, transform_msmarco_v2
                ),
                Dupes(v3_dupes, prefix="MARCO_"),
            ),
            86_326_322,
        ),
        (
            "wapo",
            "WAPO_",
            DocsSubset(
                f"{NAME}/v3/wapo-v4",
                WapoV4Docs("wapo/v4"),
                Dupes(v3_dupes, prefix="WAPO_"),
            ),
            2_963_130,
        ),
        (
            "kilt",
            "KILT_",
            DocsSubset(
                f"{NAME}/v3/kilt-v4",
                KiltCastDocs("kilt"),
                Dupes(v3_dupes, prefix="KILT_"),
            ),
            17_111_488,
        ),
    )
    """
    #subsets["v3/2022"] = Dataset(
    #    docs_v3,
    #    CastQueries(dlc["2022/queries"], Cast2022Query),
    #    TrecQrels(dlc["2022/qrels"], QRELS_DEFS),
    #)
    # --- Register all datasets
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(
            f"{NAME}/{s}", Dataset(subsets[s], documentation(s))
        )
    return base, subsets
base, subsets = _init()
================================================
FILE: ir_datasets/datasets/trec_fair.py
================================================
import json
import codecs
from typing import NamedTuple, Dict, List, Optional
import ir_datasets
from ir_datasets.util import GzipExtract, Cache, Lazy
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, Deprecated
from ir_datasets.formats import BaseQueries, BaseDocs, BaseQrels, TrecQrel
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
from itertools import chain
# Module-level logger for this dataset's processing steps.
_logger = ir_datasets.log.easy()

NAME = 'trec-fair'

# Binary relevance definition used by the TREC Fair Ranking qrels.
QREL_DEFS = {
    1: "relevant"
}
class FairTrecDoc(NamedTuple):
    # A Wikipedia article (2021 corpus) with plaintext and raw markup, plus
    # optional fairness-related metadata.
    doc_id: str
    title: str
    text: str
    marked_up_text: str
    url: str
    quality_score: Optional[float]
    geographic_locations: Optional[List[str]]
    quality_score_disk: Optional[str]
    def default_text(self):
        """
        title and text
        """
        return f"{self.title} {self.text}"
class FairTrec2022Doc(NamedTuple):
    # A Wikipedia article (2022 corpus) with the expanded fairness metadata
    # fields introduced for that year's track.
    doc_id: str
    title: str
    text: str
    url: str
    pred_qual: Optional[float]
    qual_cat: Optional[str]
    page_countries: Optional[List[str]]
    page_subcont_regions: Optional[List[str]]
    source_countries: Optional[Dict[str, int]]
    source_subcont_regions: Optional[Dict[str, int]]
    gender: Optional[List[str]]
    occupations: Optional[List[str]]
    years: Optional[List[int]]
    num_sitelinks: Optional[int]
    relative_pageviews: Optional[float]
    first_letter: Optional[str]
    creation_date: Optional[str]
    first_letter_category: Optional[str]
    gender_category: Optional[str]
    creation_date_category: Optional[str]
    years_category: Optional[str]
    relative_pageviews_category: Optional[str]
    num_sitelinks_category: Optional[str]
    def default_text(self):
        """
        title and text
        """
        return f'{self.title} {self.text}'
class FairTrecQuery(NamedTuple):
    # A 2021 training topic, including its keywords and scoping info.
    query_id: str
    text: str
    keywords: List[str]
    scope: str
    homepage: str
    def default_text(self):
        """
        text
        """
        return self.text
class FairTrec2022TrainQuery(NamedTuple):
    # A 2022 training topic: just the text and its source URL.
    query_id: str
    text: str
    url: str
    def default_text(self):
        """
        text
        """
        return self.text
class FairTrecEvalQuery(NamedTuple):
    # An evaluation topic: like FairTrecQuery but without a homepage field.
    query_id: str
    text: str
    keywords: List[str]
    scope: str
    def default_text(self):
        """
        text
        """
        return self.text
class FairTrecDocs(BaseDocs):
def __init__(self, dlc, mlc):
super().__init__()
self._dlc = dlc
self._mlc = mlc
def docs_iter(self):
return iter(self.docs_store())
def _docs_iter(self):
def _metadata_iter():
with self._mlc.stream() as stream2:
for metadata_line in stream2:
yield json.loads(metadata_line)
textifier = ir_datasets.lazy_libs.pyautocorpus().Textifier()
metadata_iter = _metadata_iter()
next_metadata = None
with self._dlc.stream() as stream1:
for line in stream1:
data1 = json.loads(line)
if next_metadata is None:
next_metadata = next(metadata_iter, None)
if next_metadata is not None:
if data1['id'] == next_metadata['page_id']:
match = next_metadata
next_metadata = None
try:
plaintext = textifier.textify(data1['text'])
except ValueError as err:
message, position = err.args
if message == "Expected markup type 'comment'":
# unmatched
'
v0:
desc: '
Version 0 of the TREC CAsT corpus. This version uses documents from the Washington Post (version 2), TREC CAR (version 2),
and MS MARCO passage (version 1).
This corpus was originally meant to be used for evaluation of the 2019 task, but the Washington Post
corpus was not included for scoring in the final version due to "an error in the process led to ambiguous
document ids," and Washington Post documents were removed from participating systems. As such,
trec-cast/v1 (which doesn''t include the Washington Post) should be used for
the 2019 version of the task. However, this version still can be used for the training set
(trec-cast/v0/train) or for replicating the original submissions
to the track (prior to the removal of Washington Post documents).
'
docs_instructions: &inst "WaPo docs available from NIST"
bibtex_ids: ['Dalton2019Cast']
v0/train:
desc: '
Training set provided by TREC CAsT 2019.
'
docs_instructions: *inst
bibtex_ids: ['Dalton2019Cast']
v0/train/judged:
desc: '
trec-cast/2019/train, but with queries that do not appear in the qrels removed.
'
docs_instructions: *inst
bibtex_ids: ['Dalton2019Cast']
v1:
desc: '
Version 1 of the TREC CAsT corpus. This version uses documents from the TREC CAR (version 2) and MS MARCO passage
(version 1). This version of the corpus was used for TREC CAsT 2019 and 2020.
'
bibtex_ids: ['Dalton2019Cast']
v1/2019:
desc: '
Official evaluation set for TREC CAsT 2019.
'
bibtex_ids: ['Dalton2019Cast']
v1/2019/judged:
desc: '
trec-cast/v1/2019, but with queries that do not appear in the qrels removed.
'
bibtex_ids: ['Dalton2019Cast']
v1/2020:
desc: '
Official evaluation set for TREC CAsT 2020.
'
bibtex_ids: ['Dalton2020Cast']
v1/2020/judged:
desc: '
trec-cast/v1/2020, but with queries that do not appear in the qrels removed.
'
bibtex_ids: ['Dalton2020Cast']
v2:
desc: '
Version 2 of the TREC CAsT corpus. This version uses documents from
the Washington Post (2017-20), KILT and MS Marco V1 (documents).
This version of the corpus was used for TREC CAsT 2021. Segmented passages
versions are also provided (using the 2021 script).
'
v3:
desc: '
Version 3 of the TREC CAsT corpus. This version uses segmented documents
from the Washington Post (2017-20), KILT and MS Marco V2 (documents). This
version of the corpus was used for TREC CAsT 2022.
'
================================================
FILE: ir_datasets/docs/trec-fair.yaml
================================================
_:
pretty_name: "TREC Fair Ranking"
desc: '
The TREC Fair Ranking track evaluates systems according to how well they fairly rank documents.
'
2021:
desc: '
The TREC Fair Ranking 2021 track focuses on fairly prioritising Wikimedia articles for editing to provide a fair exposure to articles from different groups.
'
2021/train:
desc: '
Official TREC Fair Ranking 2021 train set.
'
2021/eval:
desc: '
Official TREC Fair Ranking 2021 evaluation set.
'
2022:
desc: '
The TREC Fair Ranking 2022 track focuses on fairly prioritising Wikimedia articles for editing to provide a fair exposure to articles from different groups.
'
2022/train:
desc: '
Official TREC Fair Ranking 2022 train set.
'
================================================
FILE: ir_datasets/docs/trec-mandarin.yaml
================================================
_:
pretty_name: 'TREC Mandarin'
desc: '
A collection of news articles in Mandarin in Simplified Chinese, used for multi-lingual evaluation in TREC 5 and
TREC 6.
Document collection from LDC2000T52.
'
docs_instructions: &inst "docs available from LDC"
bibtex_ids: ['Rogers2000Mandarin']
data_access: '
To use this dataset, you need a copy of the source corpus, provided by the
Linguistic Data Consortium. The specific resource needed
is LDC2000T52.
Many organizations already have a subscription to the LDC, so access to the collection
can be as easy as confirming the data usage agreement and downloading the corpus. Check
with your library for access details.
The source file is: LDC2000T52.tgz.
ir_datasets expects this file to be copied/linked as ~/.ir_datasets/trec-mandarin/corpus.tgz.
'
trec5:
desc: '
Mandarin Chinese benchmark from TREC 5.
'
docs_instructions: *inst
bibtex_ids: ['Harman1997Chinese', 'Rogers2000Mandarin']
trec6:
desc: '
Mandarin Chinese benchmark from TREC 6.
'
docs_instructions: *inst
bibtex_ids: ['Wilkinson1998Chinese', 'Rogers2000Mandarin']
================================================
FILE: ir_datasets/docs/trec-robust04.yaml
================================================
_:
pretty_name: 'TREC Robust 2004'
desc: '
The TREC Robust retrieval task focuses on "improving the consistency of retrieval technology by
focusing on poorly performing topics."
The TREC Robust document collection is from TREC disks 4 and 5. Due to the
copyrighted nature of the documents, this collection is for research use only, which requires
agreements to be filed with NIST. See details here.
'
docs_instructions: &inst "docs available from NIST"
bibtex_ids: ['Voorhees2004Robust']
data_access: '
To use this dataset, you need a copy of TREC
disks 4 and 5, provided by NIST.
Your organization may already have a copy. If this is the case, you may only need to complete a new
"Individual Argeement". Otherwise, your organization will need to file the "Organizational agreement"
with NIST. It can take some time to process, but you will end up with a password-protected download link.
ir_datasets needs the following directories from the source:
ir_datasets expects the above directories to be copied/linked under ~/.ir_datasets/trec-robust04/trec45.
The source document files themselves can either be compressed or uncompressed (it seems they have been distributed
both ways in the past.) If ir_datasets does not find the files it is expecting, it will raise an error.
'
fold1:
desc: '
Robust04 Fold 1 (Title) proposed by Huston & Croft (2014) and used in numerous works
'
docs_instructions: *inst
bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO']
fold2:
desc: '
Robust04 Fold 2 (Title) proposed by Huston & Croft (2014) and used in numerous works
'
docs_instructions: *inst
bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO']
fold3:
desc: '
Robust04 Fold 3 (Title) proposed by Huston & Croft (2014) and used in numerous works
'
docs_instructions: *inst
bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO']
fold4:
desc: '
Robust04 Fold 4 (Title) proposed by Huston & Croft (2014) and used in numerous works
'
docs_instructions: *inst
bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO']
fold5:
desc: '
Robust04 Fold 5 (Title) proposed by Huston & Croft (2014) and used in numerous works
'
docs_instructions: *inst
bibtex_ids: ['Voorhees2004Robust', 'Huston2014ACO']
================================================
FILE: ir_datasets/docs/trec-spanish.yaml
================================================
_:
pretty_name: 'TREC Spanish'
desc: '
A collection of news articles in Spanish, used for multi-lingual evaluation in TREC 3 and TREC 4.
Document collection from LDC2000T51.
'
docs_instructions: &inst "docs available from LDC"
bibtex_ids: ['Rogers2000Spanish']
data_access: '
To use this dataset, you need a copy of the source corpus, provided by the
Linguistic Data Consortium. The specific resource needed
is LDC2000T51.
Many organizations already have a subscription to the LDC, so access to the collection
can be as easy as confirming the data usage agreement and downloading the corpus. Check
with your library for access details.
The source file is: LDC2000T51.tgz.
ir_datasets expects this file to be copied/linked as ~/.ir_datasets/trec-spanish/corpus.tgz.
'
trec3:
desc: '
Spanish benchmark from TREC 3.
'
docs_instructions: *inst
bibtex_ids: ['Harman1994Trec3', 'Rogers2000Spanish']
trec4:
desc: '
Spanish benchmark from TREC 4.
'
docs_instructions: *inst
bibtex_ids: ['Harman1995Trec4', 'Rogers2000Spanish']
================================================
FILE: ir_datasets/docs/trec-tot-2025.yaml
================================================
_:
pretty_name: 'TREC Tip-of-the-Tongue'
desc: '
Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track.
'
2025:
desc: '
Corpus for the TREC 2025 tip-of-the-tongue search track.
'
2025/train:
desc: '
Train query set for TREC 2025 tip-of-the-tongue search track.
'
2025/dev1:
desc: '
Dev-1 query set for TREC 2025 tip-of-the-tongue search track (the original 2023 dev set).
'
2025/dev2:
desc: '
Dev-2 query set for TREC 2025 tip-of-the-tongue search track (the original 2023 test set).
'
2025/dev3:
desc: '
Dev-3 query set for TREC 2025 tip-of-the-tongue search track (the original 2024 test set).
'
================================================
FILE: ir_datasets/docs/trec-tot.yaml
================================================
_:
pretty_name: 'TREC Tip-of-the-Tongue'
desc: '
Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track.
'
2023:
desc: '
Corpus for the TREC 2023 tip-of-the-tongue search track.
'
2023/train:
desc: '
Train query set for TREC 2023 tip-of-the-tongue search track.
'
2023/dev:
desc: '
Dev query set for TREC 2023 tip-of-the-tongue search track.
'
2024:
desc: '
Corpus for the TREC 2024 tip-of-the-tongue search track.
'
2024/test:
desc: '
Test query set for TREC 2024 tip-of-the-tongue search track.
'
================================================
FILE: ir_datasets/docs/tripclick.yaml
================================================
_:
pretty_name: 'TripClick'
desc: '
TripClick is a large collection from the Trip Database.
Relevance is inferred from click signals.
A copy of this dataset can be obtained from the Trip Database through the process described
here. Documents, queries,
and qrels require the "TripClick IR Benchmark"; for scoreddocs and docpairs, you will also need to
request the "TripClick Training Package for Deep Learning Models".
'
docs_instructions: &docs_inst "docs available from the Trip Database"
queries_instructions: &queries_inst "queries available from the Trip Database"
qrels_instructions: &qrels_inst "qrels available from the Trip Database"
scoreddocs_instructions: &scoreddocs_inst "scoreddocs available from the Trip Database"
docpairs_instructions: &docpairs_inst "docpairs available from the Trip Database"
bibtex_ids: ['Rekabsaz2021TripClick']
data_access: '
To use this dataset, you need a copy of the source files, provided by the Trip Database.
A copy of the source files can be requested through the procedure detailed
here. Documents, queries,
and qrels require the "TripClick IR Benchmark"; for scoreddocs and docpairs, you will also need to
request the "TripClick Training Package for Deep Learning Models". If you want the raw query logs,
you will need to request the "Logs Dataset".
The source files you will need are:
- benchmark.tar.gz (for docs, queries, and qrels)
- dlfiles.tar.gz (for docpairs and scoreddocs)
- dlfiles_runs_test.tar.gz (for scoreddocs on the test set)
- logs.tar.gz (for raw qlogs)
ir_datasets expects these files to be copied/linked in ~/.ir_datasets/tripclick/.
'
logs:
desc: '
Raw query logs from TripClick.
Note that this subset includes a broader set of documents than the main collection, but they
only provide the title and URL.
'
docs_instructions: *docs_inst
qlogs_instructions: &qlogs_inst "qlogs available from the Trip Database"
bibtex_ids: ['Rekabsaz2021TripClick']
train:
desc: '
Training subset of tripclick, including all queries from
tripclick/train/head, tripclick/train/torso,
and tripclick/train/tail.
The dataset provides docpairs in a full text format; we map this text back to the query and doc IDs.
A small number of docpairs could not be mapped back, so they are skipped.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
train/hofstaetter-triples:
desc: '
A version of tripclick/train that replaces the original (noisy) training
triples (docpairs) with those sampled from BM25 instead, as suggested by Hofstätter et al (2022).
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
bibtex_ids: ['Rekabsaz2021TripClick', 'Hofstaetter2022TripClick']
train/head:
desc: '
The most frequent queries in the train set. This represents 20% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
train/head/dctr:
desc: '
The same as tripclick/train/head, but using qrels scaled by the Document
Click Through Rate (DCTR).
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
train/torso:
desc: '
The moderately frequent queries in the train set. This represents 30% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
train/tail:
desc: '
The least frequent queries in the train set. This represents 50% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
val:
desc: '
Validation subset of tripclick, including all queries from
tripclick/val/head, tripclick/val/torso,
and tripclick/val/tail.
The scoreddocs are the official BM25 results from Anserini.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
val/head:
desc: '
The most frequent queries in the validation set. This represents 20% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
val/head/dctr:
desc: '
The same as tripclick/val/head, but using qrels scaled by the Document
Click Through Rate (DCTR).
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
val/torso:
desc: '
The moderately frequent queries in the validation set. This represents 30% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
val/tail:
desc: '
The least frequent queries in the validation set. This represents 50% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
test:
desc: '
Test subset of tripclick, including all queries from
tripclick/test/head, tripclick/test/torso,
and tripclick/test/tail.
The scoreddocs are the official BM25 results from Anserini.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
test/head:
desc: '
The most frequent queries in the test set. This represents 20% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
test/torso:
desc: '
The moderately frequent queries in the test set. This represents 30% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
test/tail:
desc: '
The least frequent queries in the test set. This represents 50% of the search engine traffic.
'
docs_instructions: *docs_inst
queries_instructions: *queries_inst
qrels_instructions: *qrels_inst
scoreddocs_instructions: *scoreddocs_inst
docpairs_instructions: *docpairs_inst
bibtex_ids: ['Rekabsaz2021TripClick']
================================================
FILE: ir_datasets/docs/tweets2013-ia.yaml
================================================
_:
pretty_name: 'Tweets 2013 (Internet Archive)'
desc: '
A collection of tweets from a 2-month window archived by the Internet Archive. This collection
can be a stand-in document collection for the TREC Microblog 2013-14 tasks. (Even though it is
not exactly the same collection, Sequiera and Lin show that it is close enough.)
This collection is automatically downloaded from the Internet Archive, though download speeds are often
slow so it takes some time. ir_datasets constructs a new directory hierarchy during the download process
to facilitate fast lookups and slices.
'
bibtex_ids: ['Sequiera2017TweetsIA']
trec-mb-2013:
desc: '
TREC Microblog 2013 test collection.
'
bibtex_ids: ['Lin2013Microblog', 'Sequiera2017TweetsIA']
trec-mb-2014:
desc: '
TREC Microblog 2014 test collection.
'
bibtex_ids: ['Lin2014Microblog', 'Sequiera2017TweetsIA']
================================================
FILE: ir_datasets/docs/vaswani.yaml
================================================
_:
pretty_name: 'Vaswani'
desc: '
A small corpus of roughly 11,000 scientific abstracts.
'
================================================
FILE: ir_datasets/docs/wapo.yaml
================================================
_:
pretty_name: 'Washington Post'
desc: '
The Washington Post collection.
'
docs_instructions: &inst "docs available from NIST"
data_access: '
To use this dataset, you need a copy of Washington Post Collection,
provided by NIST.
Your organization may already have a copy. If this is the case, you may only need to complete a new
"Individual Argeement". Otherwise, your organization will need to file the "Organizational agreement"
with NIST. It can take some time to process, but you will end up with a password-protected download link.
The source file required is WashingtonPost.v2.tar.gz.
ir_datasets expects the above file to be copied/linked under ~/.ir_datasets/wapo/WashingtonPost.v2.tar.gz.
'
v2:
desc: '
Version 2 of the Washington Post collection, consisting of articles published between 2012-2017.
The collection is obtained from NIST by requesting it from NIST here.
body contains all body text in plain text format, including paragraphs and multi-media captions.
body_paras_html contains only source paragraphs and contains HTML markup.
body_media contains images, videos, tweets, and galleries, along with a link to the content
and a textual caption.
'
docs_instructions: *inst
v2/trec-core-2018:
desc: '
The TREC Common Core 2018 benchmark.
\n\n\n$', flags=48)),
})
def test_queries(self):
self._test_queries('aquaint/trec-robust-2005', count=50, items={
0: TrecQuery('303', 'Hubble Telescope Achievements', 'Identify positive accomplishments of the Hubble telescope since it\nwas launched in 1991.', 'Documents are relevant that show the Hubble telescope has produced\nnew data, better quality data than previously available, data that\nhas increased human knowledge of the universe, or data that has led\nto disproving previously existing theories or hypotheses. Documents\nlimited to the shortcomings of the telescope would be irrelevant.\nDetails of repairs or modifications to the telescope without\nreference to positive achievements would not be relevant.'),
9: TrecQuery('344', 'Abuses of E-Mail', 'The availability of E-mail to many people through their\njob or school affiliation has allowed for many efficiencies\nin communications but also has provided the opportunity for\nabuses. What steps have been taken world-wide by those\nbearing the cost of E-mail to prevent excesses?', "To be relevant, a document will concern dissatisfaction by\nan entity paying for the cost of electronic mail. Particularly\nsought are items which relate to system users (such as employees)\nwho abuse the system by engaging in communications of the type\nnot related to the payer's desired use of the system."),
49: TrecQuery('689', 'family-planning aid', 'To which countries does the U.S. provide aid to support family planning,\nand for which countries has the U.S. refused or limited support?', 'Relevant documents indicate where U.S. aid supports\nfamily planning or where such aid has been denied.\nDiscussions of why aid for family planning has been refused are\nalso relevant. Documents that mention U.S. aid to countries,\nbut not specifically for family planning are not relevant.\nDescriptions of funds for family planning in the U.S. itself are not relevant.'),
})
def test_qrels(self):
    """Spot-check the aquaint/trec-robust-2005 qrels.

    Verifies the total qrel count and exact records at the first, a middle,
    and the last position of the qrels_iter sequence.
    """
    # Expected records keyed by their position in iteration order.
    expected = {
        0: TrecQrel('303', 'APW19980609.1531', 2, '0'),
        9: TrecQrel('303', 'APW19981117.0914', 0, '0'),
        37797: TrecQrel('689', 'XIE20000925.0055', 0, '0'),
    }
    self._test_qrels('aquaint/trec-robust-2005', count=37798, items=expected)
# Allow running this integration-test module directly as a script.
if __name__ == '__main__':
    unittest.main()
================================================
FILE: test/integration/argsme.py
================================================
from datetime import datetime
from re import compile
from unittest import main
from ir_datasets.formats import ArgsMeDoc, ArgsMeStance, ArgsMePremise, \
ArgsMeSourceDomain, ArgsMeMode, ArgsMeAspect, ArgsMeProcessedDoc
from ir_datasets.formats.argsme import ArgsMeSentence
from test.integration.base import DatasetIntegrationTest
class TestArgsMe(DatasetIntegrationTest):
# noinspection PyTypeChecker
def test_docs(self):
self._test_docs("argsme/1.0", count=387692, items={
0: ArgsMeDoc(
doc_id="c67482ba-2019-04-18T13:32:05Z-00000-000",
conclusion="Contraceptive Forms for High School Students",
premises=[
ArgsMePremise(
text=compile("My opponent forfeited every round\. None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."),
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts=compile("My opponent forfeited every round\. None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."),
aspects=[],
aspects_names="",
source_id="c67482ba-2019-04-18T13:32:05Z",
source_title="Debate Argument: Contraceptive Forms for High School Students | Debate.org",
source_url="https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/",
source_previous_argument_id=None,
source_next_argument_id="c67482ba-2019-04-18T13:32:05Z-00001-000",
source_domain=None,
source_text=None,
source_text_conclusion_start=None,
source_text_conclusion_end=None,
source_text_premise_start=None,
source_text_premise_end=None,
topic="Contraceptive Forms for High School Students",
acquisition=datetime.fromisoformat(
"2019-04-18T13:32:05+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=None,
),
387691: ArgsMeDoc(
doc_id="671509c8-2019-04-17T11:47:34Z-00007-000",
conclusion="Charter schools",
premises=[
ArgsMePremise(
text="Charter schools are exploited most by affable students",
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts="Charter schools are exploited most by affable students",
aspects=[],
aspects_names="",
source_id="671509c8-2019-04-17T11:47:34Z",
source_title="Debate: Charter schools - Debatepedia",
source_url="http://www.debatepedia.org/en/index.php/Debate:_Charter_schools",
source_previous_argument_id="671509c8-2019-04-17T11:47:34Z-00022-000",
source_next_argument_id="671509c8-2019-04-17T11:47:34Z-00057-000",
source_domain=None,
source_text=None,
source_text_conclusion_start=None,
source_text_conclusion_end=None,
source_text_premise_start=None,
source_text_premise_end=None,
topic="Charter schools",
acquisition=datetime.fromisoformat("2019-04-17T11:47:34+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=None,
),
})
self._test_docs("argsme/1.0-cleaned", count=382545, items={
0: ArgsMeDoc(
doc_id="c67482ba-2019-04-18T13:32:05Z-00000-000",
conclusion="Contraceptive Forms for High School Students",
premises=[
ArgsMePremise(
text=compile("My opponent forfeited every round\. None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."),
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts=compile("My opponent forfeited every round\. None of my argu.{393}it is illegal to sell you them is, frankly, wrong\."),
aspects=[],
aspects_names="",
source_id="c67482ba-2019-04-18T13:32:05Z",
source_title="Debate Argument: Contraceptive Forms for High School Students | Debate.org",
source_url="https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/",
source_previous_argument_id=None,
source_next_argument_id="c67482ba-2019-04-18T13:32:05Z-00001-000",
source_domain=None,
source_text=None,
source_text_conclusion_start=None,
source_text_conclusion_end=None,
source_text_premise_start=None,
source_text_premise_end=None,
topic="Contraceptive Forms for High School Students",
acquisition=datetime.fromisoformat("2019-04-18T13:32:05+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=None,
),
382544: ArgsMeDoc(
doc_id="671509c8-2019-04-17T11:47:34Z-00007-000",
conclusion="Charter schools",
premises=[
ArgsMePremise(
text="Charter schools are exploited most by affable students",
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts="Charter schools are exploited most by affable students",
aspects=[],
aspects_names="",
source_id="671509c8-2019-04-17T11:47:34Z",
source_title="Debate: Charter schools - Debatepedia",
source_url="http://www.debatepedia.org/en/index.php/Debate:_Charter_schools",
source_previous_argument_id="671509c8-2019-04-17T11:47:34Z-00022-000",
source_next_argument_id="671509c8-2019-04-17T11:47:34Z-00057-000",
source_domain=None,
source_text=None,
source_text_conclusion_start=None,
source_text_conclusion_end=None,
source_text_premise_start=None,
source_text_premise_end=None,
topic="Charter schools",
acquisition=datetime.fromisoformat("2019-04-17T11:47:34+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=None,
),
})
self._test_docs("argsme/2020-04-01", count=387740, items={
0: ArgsMeDoc(
doc_id="Sb38112c8-A443a9828",
conclusion="school",
premises=[
ArgsMePremise(
text="Done.",
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts="Done.",
aspects=[],
aspects_names="",
source_id="Sb38112c8",
source_title="Debate: school | Debate.org",
source_url="https://www.debate.org/debates/school/3/",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debateorg,
source_text=compile("DEBATES OPINIONS FORUMS POLLS Google Search My Deb.{3149}p Version © 2019 Debate\.org\. All rights reserved\. "),
source_text_conclusion_start=1630,
source_text_conclusion_end=1636,
source_text_premise_start=2664,
source_text_premise_end=2670,
topic="school",
acquisition=datetime.fromisoformat("2019-04-18T17:49:41+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
387739: ArgsMeDoc(
doc_id="S153cd52f-A118dded8",
conclusion=compile("The recent throne speech said that the government .{27}o ensure that our communities continue to be safe\."),
premises=[
ArgsMePremise(
text=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."),
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."),
aspects=[
ArgsMeAspect(
name="Pornography",
weight=3,
normalized_weight=1,
rank=1,
),
],
aspects_names="Pornography",
source_id="S153cd52f",
source_title="4129762",
source_url=None,
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.canadian_parliament,
source_text=compile("How can Canadians trust the Liberals when they say.{1050}eplaced by the system changers in the opposition\. "),
source_text_conclusion_start=149,
source_text_conclusion_end=276,
source_text_premise_start=0,
source_text_premise_end=1149,
topic="Child Pornography",
acquisition=datetime.fromisoformat("2019-07-25T09:33:44.811404+00:00"),
date=datetime.fromisoformat("1999-10-17T22:00:00+00:00"),
author="Paul Forseth",
author_image_url="https://www.ourcommons.ca/Parliamentarians/Images/OfficialMPPhotos/38/Forsep.JPG",
author_organization="Reform",
author_role="Opposition",
mode=ArgsMeMode.person,
),
})
self._test_docs("argsme/2020-04-01/debateorg", count=338620, items={
0: ArgsMeDoc(
doc_id="Sb38112c8-A443a9828",
conclusion="school",
premises=[
ArgsMePremise(
text="Done.",
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts="Done.",
aspects=[],
aspects_names="",
source_id="Sb38112c8",
source_title="Debate: school | Debate.org",
source_url="https://www.debate.org/debates/school/3/",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debateorg,
source_text=compile("DEBATES OPINIONS FORUMS POLLS Google Search My Deb.{3149}p Version © 2019 Debate\.org\. All rights reserved\. "),
source_text_conclusion_start=1630,
source_text_conclusion_end=1636,
source_text_premise_start=2664,
source_text_premise_end=2670,
topic="school",
acquisition=datetime.fromisoformat("2019-04-18T17:49:41+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
338619: ArgsMeDoc(
doc_id="Sca72da7d-Adbd84fd2",
conclusion="It Should be Legal in the U.S. to Occasionally Hit Someone",
premises=[
ArgsMePremise(
text=compile("In this debate, I will argue that occasionally hit.{2302}fe\. My opponent may begin his argument\. Good luck\."),
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts=compile("In this debate, I will argue that occasionally hit.{2302}fe\. My opponent may begin his argument\. Good luck\."),
aspects=[
ArgsMeAspect(
name="Obesity",
weight=1,
normalized_weight=1,
rank=1,
),
],
aspects_names="Obesity",
source_id="Sca72da7d",
source_title="Debate Topic: It Should be Legal in the U.S. to Occasionally Hit Someone | Debate.org",
source_url="https://www.debate.org/debates/It-Should-be-Legal-in-the-U.S.-to-Occasionally-Hit-Someone/1/",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debateorg,
source_text=compile("DEBATES OPINIONS FORUMS POLLS Google Search My Deb.{26529}p Version © 2019 Debate\.org\. All rights reserved\. "),
source_text_conclusion_start=1677,
source_text_conclusion_end=1735,
source_text_premise_start=2391,
source_text_premise_end=4794,
topic="It Should be Legal in the U.S. to Occasionally Hit Someone",
acquisition=datetime.fromisoformat("2019-04-18T19:21:03+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
})
self._test_docs("argsme/2020-04-01/debatepedia", count=21197, items={
0: ArgsMeDoc(
doc_id="S96f2396e-Aaf079b43",
conclusion="Mine Ban Treaty (Ottawa Treaty)",
premises=[
ArgsMePremise(
text="Casualties in repelling N. Korean invasion would be higher w/o mines",
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts="Casualties in repelling N. Korean invasion would be higher w/o mines",
aspects=[],
aspects_names="",
source_id="S96f2396e",
source_title="Debate: Mine Ban Treaty (Ottawa Treaty) - Debatepedia",
source_url="http://www.debatepedia.org/en/index.php/Debate:_Mine_Ban_Treaty_%28Ottawa_Treaty%29",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debatepedia,
source_text=compile("Welcome to Debatepedia! \| About \| Help \| FAQ \| Med.{33182} Disclaimers Problem with the site\? Edit Close \. "),
source_text_conclusion_start=1063,
source_text_conclusion_end=1094,
source_text_premise_start=18501,
source_text_premise_end=18569,
topic="Mine Ban Treaty (Ottawa Treaty)",
acquisition=datetime.fromisoformat("2019-04-17T11:47:26+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
21196: ArgsMeDoc(
doc_id="S148bb110-A119d66b0",
conclusion="Environmental impact of barages is ugly.",
premises=[
ArgsMePremise(
text=compile("Barages are fairly massive objects, like Dams, tha.{160} and possibly reduced property values and tourism\."),
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts=compile("Barages are fairly massive objects, like Dams, tha.{160} and possibly reduced property values and tourism\."),
aspects=[],
aspects_names="",
source_id="S148bb110",
source_title="Debate: Tidal energy - Debatepedia",
source_url="http://www.debatepedia.org/en/index.php/Debate:_Tidal_energy",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debatepedia,
source_text=compile("Welcome to Debatepedia! \| About \| Help \| FAQ \| Med.{28127} Disclaimers Problem with the site\? Edit Close \. "),
source_text_conclusion_start=22030,
source_text_conclusion_end=22070,
source_text_premise_start=22070,
source_text_premise_end=22331,
topic="Tidal energy",
acquisition=datetime.fromisoformat("2019-04-17T11:47:38+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
})
self._test_docs("argsme/2020-04-01/debatewise", count=14353, items={
0: ArgsMeDoc(
doc_id="S5920cdef-A982becb7",
conclusion="placebo effect and phenylthiamine",
premises=[
ArgsMePremise(
text=compile("But are chocolate eaters happy\? \[\[http://news\.bbc\..{623} health !!! as it increases our colestrol in body\."),
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts=compile("But are chocolate eaters happy\? \[\[http://news\.bbc\..{623} health !!! as it increases our colestrol in body\."),
aspects=[],
aspects_names="",
source_id="S5920cdef",
source_title="Is chocolate good for you? - DebateWise",
source_url="https://debatewise.org/debates/1904-is-chocolate-good-for-you/",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debatewise,
source_text=compile("Browse Our Categories Is chocolate good for you\? L.{10743}me \| About Us \| Privacy & Contact Us Top wpDiscuz "),
source_text_conclusion_start=751,
source_text_conclusion_end=784,
source_text_premise_start=1667,
source_text_premise_end=2390,
topic="Is chocolate good for you?",
acquisition=datetime.fromisoformat("2019-04-19T12:46:26+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
14352: ArgsMeDoc(
doc_id="Sc47177f2-A730d8f9e",
conclusion="NO",
premises=[
ArgsMePremise(
text=compile(":/ Adolf Hitler was not evil he wasn't a murderer.{244}wer Told You To Jump Off A Bridge Would You Do It\?"),
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts=compile(":/ Adolf Hitler was not evil he wasn't a murderer.{244}wer Told You To Jump Off A Bridge Would You Do It\?"),
aspects=[
ArgsMeAspect(
name="Jews",
weight=2,
normalized_weight=0.6666666666666666,
rank=1,
),
ArgsMeAspect(
name="Adolf Hitler",
weight=1,
normalized_weight=0.3333333333333333,
rank=2,
),
],
aspects_names="Jews Adolf Hitler",
source_id="Sc47177f2",
source_title="Adolf Hitler Does Not Deserve His Reputation as Evil - DebateWise",
source_url="https://debatewise.org/debates/357-adolf-hitler-does-not-deserve-his-reputation-as-evil/",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debatewise,
source_text=compile("Browse Our Categories Adolf Hitler Does Not Deserv.{54510} Us \| Privacy & Contact Us Top wpDiscuz 113shares "),
source_text_conclusion_start=43380,
source_text_conclusion_end=43382,
source_text_premise_start=43383,
source_text_premise_end=43727,
topic="Adolf Hitler Does Not Deserve His Reputation as Evil",
acquisition=datetime.fromisoformat("2019-04-19T12:44:52+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
})
self._test_docs("argsme/2020-04-01/idebate", count=13522, items={
0: ArgsMeDoc(
doc_id="Sf9294c83-Af186e851",
conclusion="the War in Iraq was Worth the Cost",
premises=[
ArgsMePremise(
text="His removal provides stability and security not only for Iraq but for the Middle East as a region",
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts="His removal provides stability and security not only for Iraq but for the Middle East as a region",
aspects=[],
aspects_names="",
source_id="Sf9294c83",
source_title="This House Believes that the War in Iraq was Worth the Cost | idebate.org",
source_url="https://idebate.org/debatabase/international-middle-east-politics-terrorism-warpeace/house-believes-war-iraq-was-worth",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.idebate,
source_text=compile("idebate\.org Educational and informative news and r.{17520}in with\.\.\. Login with Facebook Login with Twitter "),
source_text_conclusion_start=196,
source_text_conclusion_end=230,
source_text_premise_start=8137,
source_text_premise_end=8234,
topic="the War in Iraq was Worth the Cost",
acquisition=datetime.fromisoformat("2019-04-19T12:40:25+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
13521: ArgsMeDoc(
doc_id="Sd6cf79d9-Af8d9e187",
conclusion="Socialism is a more secure system than the free market in Capitalism",
premises=[
ArgsMePremise(
text=compile("In order to avoid economic crisis there is a need .{1343}agall\. Financial Times\. Retrieved June 14, 2011 1\."),
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts=compile("In order to avoid economic crisis there is a need .{1343}agall\. Financial Times\. Retrieved June 14, 2011 1\."),
aspects=[
ArgsMeAspect(
name="Capitalism",
weight=2,
normalized_weight=0.6666666666666666,
rank=1,
),
ArgsMeAspect(
name="Socialism",
weight=1,
normalized_weight=0.3333333333333333,
rank=2,
),
],
aspects_names="Capitalism Socialism",
source_id="Sd6cf79d9",
source_title="This House believes that capitalism is better than socialism | idebate.org",
source_url="https://idebate.org/debatabase/economy-economy-general-philosophy-political-philosophy/house-believes-capitalism-better",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.idebate,
source_text=compile("idebate\.org Educational and informative news and r.{29663}in with\.\.\. Login with Facebook Login with Twitter "),
source_text_conclusion_start=19722,
source_text_conclusion_end=19790,
source_text_premise_start=21487,
source_text_premise_end=22930,
topic="capitalism is better than socialism",
acquisition=datetime.fromisoformat("2019-04-19T12:39:56+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
),
})
self._test_docs("argsme/2020-04-01/parliamentary", count=48, items={
0: ArgsMeDoc(
doc_id="S1f6b58eb-A5c530110",
conclusion=compile("I want them to know that their braids, their dread.{165} boardroom, and yes, even here on Parliament Hill\."),
premises=[
ArgsMePremise(
text=compile("I want them to know that their braids, their dread.{165} boardroom, and yes, even here on Parliament Hill\."),
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts=compile("I want them to know that their braids, their dread.{165} boardroom, and yes, even here on Parliament Hill\."),
aspects=[
ArgsMeAspect(
name="Women",
weight=2,
normalized_weight=0.6666666666666666,
rank=1,
),
ArgsMeAspect(
name="Woman",
weight=1,
normalized_weight=0.3333333333333333,
rank=2,
),
],
aspects_names="Women Woman",
source_id="S1f6b58eb",
source_title="4718632",
source_url=None,
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.canadian_parliament,
source_text=compile("This week I have my hair in braids, much like I ha.{1066}boardroom, and yes, even here on Parliament Hill\. "),
source_text_conclusion_start=900,
source_text_conclusion_end=1165,
source_text_premise_start=0,
source_text_premise_end=1165,
topic="Body Shaming",
acquisition=datetime.fromisoformat("2019-07-25T09:33:44.814585+00:00"),
date=datetime.fromisoformat("2017-09-19T22:00:00+00:00"),
author="Celina Caesar-Chavannes",
author_image_url="https://www.ourcommons.ca/Parliamentarians/Images/OfficialMPPhotos/42/CaesarChavannesCelina_Lib.jpg",
author_organization="Liberal",
author_role="Government",
mode=ArgsMeMode.person,
),
47: ArgsMeDoc(
doc_id="S153cd52f-A118dded8",
conclusion=compile("The recent throne speech said that the government .{27}o ensure that our communities continue to be safe\."),
premises=[
ArgsMePremise(
text=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."),
stance=ArgsMeStance.CON,
annotations=[],
),
],
premises_texts=compile("How can Canadians trust the Liberals when they say.{1049}replaced by the system changers in the opposition\."),
aspects=[
ArgsMeAspect(
name="Pornography",
weight=3,
normalized_weight=1,
rank=1,
),
],
aspects_names="Pornography",
source_id="S153cd52f",
source_title="4129762",
source_url=None,
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.canadian_parliament,
source_text=compile("How can Canadians trust the Liberals when they say.{1050}eplaced by the system changers in the opposition\. "),
source_text_conclusion_start=149,
source_text_conclusion_end=276,
source_text_premise_start=0,
source_text_premise_end=1149,
topic="Child Pornography",
acquisition=datetime.fromisoformat("2019-07-25T09:33:44.811404+00:00"),
date=datetime.fromisoformat("1999-10-17T22:00:00+00:00"),
author="Paul Forseth",
author_image_url="https://www.ourcommons.ca/Parliamentarians/Images/OfficialMPPhotos/38/Forsep.JPG",
author_organization="Reform",
author_role="Opposition",
mode=ArgsMeMode.person,
),
})
self._test_docs("argsme/2020-04-01/processed", count=365408, items={
0: ArgsMeProcessedDoc(
doc_id="Sf9294c83-Af186e851",
conclusion="the War in Iraq was Worth the Cost",
premises=[
ArgsMePremise(
text=compile("His removal provides stability and .{21} Iraq but for the Middle East as a region"),
stance=ArgsMeStance.PRO,
annotations=[],
),
],
premises_texts=compile("His removal provides stability and .{21} Iraq but for the Middle East as a region"),
aspects=[],
aspects_names="",
source_id="Sf9294c83",
source_title="This House Believes that the War in Iraq was Worth the Cost | idebate.org",
source_url="https://idebate.org/debatabase/international-middle-east-politics-terrorism-warpeace/house-believes-war-iraq-was-worth",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.idebate,
source_text=compile("idebate.org Educational and informative news and .{17542} Facebook Login with Twitter "),
source_text_conclusion_start=196,
source_text_conclusion_end=230,
source_text_premise_start=8137,
source_text_premise_end=8234,
topic="the War in Iraq was Worth the Cost",
acquisition=datetime.fromisoformat("2019-04-19T12:40:25+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
sentences=[
ArgsMeSentence(
id="Sf9294c83-Af186e851__PREMISE__1",
text="His removal provides stability and security not only for Iraq but for the Middle East as a region"
),
ArgsMeSentence(
id="Sf9294c83-Af186e851__CONC__1",
text="the War in Iraq was Worth the Cost"
)
],
),
365407: ArgsMeProcessedDoc(
doc_id="S148bb110-A119d66b0",
conclusion="Environmental impact of barages is ugly.",
premises=[
ArgsMePremise(
text=compile("Barages are fairly massive objects, like .{190} property values and tourism."),
stance=ArgsMeStance.PRO,
annotations=[]
)
],
premises_texts=compile("Barages are fairly massive objects, like .{190} property values and tourism."),
aspects=[],
aspects_names="",
source_id="S148bb110",
source_title="Debate: Tidal energy - Debatepedia",
source_url="http://www.debatepedia.org/en/index.php/Debate:_Tidal_energy",
source_previous_argument_id=None,
source_next_argument_id=None,
source_domain=ArgsMeSourceDomain.debatepedia,
source_text=compile("Welcome to Debatepedia! \| About \| Help \| FAQ \| Media Kit Personal .{28131} with the site\?\xa0 Edit Close . "),
source_text_conclusion_start=22030,
source_text_conclusion_end=22070,
source_text_premise_start=22070,
source_text_premise_end=22331,
topic="Tidal energy",
acquisition=datetime.fromisoformat("2019-04-17T11:47:38+00:00"),
date=None,
author=None,
author_image_url=None,
author_organization=None,
author_role=None,
mode=ArgsMeMode.discussion,
sentences=[
ArgsMeSentence(
id="S148bb110-A119d66b0__PREMISE__1",
text="Barages are fairly massive objects, like Dams, that obstruct the natural flow of water and can, subsequently, have harmful environmental impacts."
),
ArgsMeSentence(
id="S148bb110-A119d66b0__PREMISE__2",
text="These effects can be very ugly, causing frustration among locals and possibly reduced property values and tourism."
), ArgsMeSentence(
id="S148bb110-A119d66b0__CONC__1",
text="Environmental impact of barages is ugly."
)
]
,
),
})
if __name__ == "__main__":
main()
================================================
FILE: test/integration/base.py
================================================
import re
import unittest
import ir_datasets
# Module-level logger; `easy()` provides the pbar/duration helpers used by the tests below.
_logger = ir_datasets.log.easy()
class DatasetIntegrationTest(unittest.TestCase):
    """Base class for ir_datasets integration tests.

    The ``_test_*`` helpers iterate one record type of a dataset and verify
    both the total record count and the records found at selected indices.
    Expected string fields may be given as compiled regular expressions,
    which is useful for very long documents.  The ``_build_test_*`` helpers
    do the reverse: they iterate a live dataset and log the source code of a
    matching ``_test_*`` invocation, to help bootstrap new tests.
    """

    @staticmethod
    def _load(dataset):
        """Return the dataset for an ID string; pass dataset objects through."""
        if isinstance(dataset, str):
            return ir_datasets.load(dataset)
        return dataset

    def _test_records(self, record_type, unit, iter_name, dataset_name, count=None, items=None):
        """Shared driver behind the ``_test_*`` helpers.

        Args:
            record_type: plural record name (subtest label and progress-bar text).
            unit: singular record name (progress-bar unit).
            iter_name: name of the dataset's iterator method (e.g. 'docs_iter').
            dataset_name: dataset ID string or an already-loaded dataset object.
            count: expected total number of records; when None, iteration stops
                early once every entry of ``items`` has been verified.
            items: mapping of record index -> expected record (namedtuple).
        """
        with self.subTest(record_type, dataset=dataset_name):
            dataset = self._load(dataset_name)
            remaining = dict(items) if items else {}
            total = 0
            record_iter = getattr(dataset, iter_name)()
            for i, record in enumerate(_logger.pbar(record_iter, f'{dataset_name} {record_type}', unit=unit)):
                total += 1
                if i in remaining:
                    self._assert_namedtuple(record, remaining[i])
                    del remaining[i]
                    if count is None and not remaining:
                        break  # no point in going further
            if count is not None:
                self.assertEqual(count, total)
            self.assertEqual({}, remaining)  # every expected item must have been seen

    def _test_docs(self, dataset_name, count=None, items=None, test_docstore=True, test_iter_split=True):
        """Check a dataset's docs: full iteration, fancy slicing, and docstore lookups."""
        # Keep an unmodified copy up front; also tolerates items=None (the
        # original unconditional dict(items) raised TypeError in that case).
        orig_items = dict(items or {})
        self._test_records('docs', 'doc', 'docs_iter', dataset_name, count=count, items=items)
        dataset = self._load(dataset_name)
        if test_iter_split:
            with self.subTest('docs_iter split', dataset=dataset_name):
                it = dataset.docs_iter()
                with _logger.duration('doc lookups by index'):
                    for idx, doc in orig_items.items():
                        # Both slice-then-next and direct indexing must return the doc.
                        self._assert_namedtuple(next(it[idx:idx+1]), doc)
                        self._assert_namedtuple(it[idx], doc)
        if test_docstore:
            with self.subTest('docs_store', dataset=dataset_name):
                doc_store = dataset.docs_store()
                with _logger.duration('doc lookups by doc_id'):
                    for doc in orig_items.values():
                        self._assert_namedtuple(doc, doc_store.get(doc.doc_id))

    def _test_queries(self, dataset_name, count=None, items=None):
        """Check a dataset's queries against the expected count/items."""
        self._test_records('queries', 'query', 'queries_iter', dataset_name, count=count, items=items)

    def _test_qrels(self, dataset_name, count=None, items=None):
        """Check a dataset's qrels against the expected count/items."""
        self._test_records('qrels', 'qrel', 'qrels_iter', dataset_name, count=count, items=items)

    def _test_qlogs(self, dataset_name, count=None, items=None):
        """Check a dataset's query logs against the expected count/items."""
        self._test_records('qlogs', 'qlog', 'qlogs_iter', dataset_name, count=count, items=items)

    def _test_docpairs(self, dataset_name, count=None, items=None):
        """Check a dataset's docpairs against the expected count/items."""
        self._test_records('docpairs', 'docpair', 'docpairs_iter', dataset_name, count=count, items=items)

    def _test_scoreddocs(self, dataset_name, count=None, items=None):
        """Check a dataset's scoreddocs against the expected count/items."""
        self._test_records('scoreddocs', 'scoreddoc', 'scoreddocs_iter', dataset_name, count=count, items=items)

    def _build_test_docs(self, dataset_name, include_count=True, include_idxs=(0, 9)):
        """Iterate a dataset's docs and log the source of a matching _test_docs call.

        Long string fields are replaced by regexes (see _replace_regex_namedtuple)
        so the generated test stays readable.
        """
        items = {}
        count = 0
        dataset = self._load(dataset_name)
        doc = None
        for i, doc in enumerate(_logger.pbar(dataset.docs_iter(), f'{dataset_name} docs', unit='doc')):
            count += 1
            if i in include_idxs:
                items[i] = doc
            # Without a count we only need to reach the last requested index
            # (or a small buffer past it), not the full collection.
            if not include_count and ((include_idxs[-1] < 1000 and i == 1000) or (include_idxs[-1] >= 1000 and i == include_idxs[-1])):
                break
        if doc is not None:  # guard: an empty dataset previously raised NameError here
            items[count-1] = doc
        items = {k: self._replace_regex_namedtuple(v) for k, v in items.items()}
        count = f', count={count}' if include_count else ''
        _logger.info(f'''
self._test_docs({repr(dataset_name)}{count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_records(self, test_name, record_type, unit, dataset_name, record_iter):
        """Shared driver behind the simple ``_build_test_*`` helpers.

        Collects the records at indices 0, 9, and the last index, then logs
        the source code of the corresponding ``self.<test_name>(...)`` call.
        """
        items = {}
        count = 0
        record = None
        for i, record in enumerate(_logger.pbar(record_iter, f'{dataset_name} {record_type}', unit=unit)):
            count += 1
            if i in (0, 9):
                items[i] = record
        if record is not None:  # guard: an empty dataset previously raised NameError here
            items[count-1] = record
        _logger.info(f'''
self.{test_name}({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
''')

    def _build_test_queries(self, dataset_name):
        """Log the source of a _test_queries call for this dataset."""
        dataset = self._load(dataset_name)
        self._build_test_records('_test_queries', 'queries', 'query', dataset_name, dataset.queries_iter())

    def _build_test_qrels(self, dataset_name):
        """Log the source of a _test_qrels call for this dataset."""
        dataset = self._load(dataset_name)
        self._build_test_records('_test_qrels', 'qrels', 'qrel', dataset_name, dataset.qrels_iter())

    def _build_test_scoreddocs(self, dataset_name):
        """Log the source of a _test_scoreddocs call for this dataset."""
        dataset = self._load(dataset_name)
        self._build_test_records('_test_scoreddocs', 'scoreddocs', 'scoreddoc', dataset_name, dataset.scoreddocs_iter())

    def _build_test_docpairs(self, dataset_name):
        """Log the source of a _test_docpairs call for this dataset."""
        dataset = self._load(dataset_name)
        self._build_test_records('_test_docpairs', 'docpairs', 'docpair', dataset_name, dataset.docpairs_iter())

    def _build_test_qlogs(self, dataset_name):
        """Log the source of a _test_qlogs call for this dataset."""
        dataset = self._load(dataset_name)
        self._build_test_records('_test_qlogs', 'qlogs', 'qlog', dataset_name, dataset.qlogs_iter())

    def _assert_namedtuple(self, a, b):
        """Recursively assert that record ``a`` matches expected record ``b``.

        Compiled regex values on either side are matched against the other
        side's string (used for long documents); nested tuples/lists are
        compared element-wise.
        """
        # needed because python <= 3.6 doesn't expose re.Pattern class
        Pattern = re.Pattern if hasattr(re, 'Pattern') else type(re.compile(''))
        self.assertEqual(type(a).__name__, type(b).__name__)
        if hasattr(type(a), '_fields') or hasattr(type(b), '_fields'):
            self.assertEqual(type(a)._fields, type(b)._fields)
        else:
            # Plain tuples/lists: zip() below would silently ignore trailing
            # elements if the lengths differed, so check the lengths explicitly.
            self.assertEqual(len(a), len(b))
        for v_a, v_b in zip(a, b):
            # support compiled regex for matching (e.g., for long documents)
            if isinstance(v_b, Pattern):
                self.assertRegex(v_a, v_b)
            elif isinstance(v_a, Pattern):
                self.assertRegex(v_b, v_a)
            elif isinstance(v_a, tuple) and isinstance(v_b, tuple):
                self._assert_namedtuple(v_a, v_b)
            elif isinstance(v_a, list) and isinstance(v_b, list):
                self._assert_namedtuple(v_a, v_b)
            else:
                self.assertEqual(v_a, v_b)

    def _replace_regex_namedtuple(self, tup, maxlen=200):
        """Return a copy of namedtuple ``tup`` with long str/bytes fields
        replaced by anchored regexes that keep the first and last maxlen/2
        characters and match the elided middle by length."""
        result = []
        for value in tup:
            if isinstance(value, str) and len(value) > maxlen:
                count = len(value) - maxlen
                pattern = '^' + re.escape(value[:maxlen//2]) + (r'.{%i}' % count) + re.escape(value[-(maxlen//2):]) + '$'
                result.append(re.compile(pattern, re.DOTALL))
            elif isinstance(value, bytes) and len(value) > maxlen:
                count = len(value) - maxlen
                pattern = b'^' + re.escape(value[:maxlen//2]) + (b'.{%i}' % count) + re.escape(value[-(maxlen//2):]) + b'$'
                result.append(re.compile(pattern, re.DOTALL))
            elif isinstance(value, tuple) and len(value) > 0 and isinstance(value[0], tuple):
                result.append(tuple(self._replace_regex_namedtuple(t) for t in value))
            elif isinstance(value, list) and len(value) > 0 and isinstance(value[0], tuple):
                result.append(list(self._replace_regex_namedtuple(t) for t in value))
            else:
                result.append(value)
        return type(tup)(*result)

    def _repr_namedtuples(self, items):
        """Render an index -> namedtuple mapping as python source text."""
        result = '{\n'
        for key, value in items.items():
            result += f' {repr(key)}: {self._repr_namedtuple(value)},\n'
        result += '}'
        return result

    def _repr_namedtuple(self, value):
        """Render a single (possibly nested) namedtuple as python source text."""
        parts = []
        for item in value:
            if isinstance(item, re.Pattern):
                if isinstance(item.pattern, str):
                    pattern = item.pattern.replace('\\ ', ' ').replace('\\\n', '\n') # don't want these escaped
                else:
                    pattern = item.pattern.replace(b'\\ ', b' ').replace(b'\\\n', b'\n') # don't want these escaped
                parts.append(f're.compile({repr(pattern)}, flags={item.flags})')
            elif isinstance(item, list) and len(item) > 0 and isinstance(item[0], tuple) and hasattr(item[0], '_fields'):
                parts.append('[' + ', '.join(self._repr_namedtuple(i) for i in item) + ']')
            elif isinstance(item, tuple) and len(item) > 0 and isinstance(item[0], tuple) and hasattr(item[0], '_fields'):
                parts.append('(' + ', '.join(self._repr_namedtuple(i) for i in item) + ',)')
            else:
                parts.append(repr(item))
        # join() also handles a zero-field tuple correctly, where the previous
        # result[:-2] slicing chopped characters off the type name.
        return f'{type(value).__name__}(' + ', '.join(parts) + ')'
================================================
FILE: test/integration/beir.py
================================================
import re
import unittest
from ir_datasets.datasets.beir import BeirTitleDoc, BeirTitleUrlDoc, BeirSciDoc, BeirToucheDoc, BeirCordDoc, BeirCqaDoc, BeirCqaQuery, BeirToucheQuery, BeirCovidQuery, BeirUrlQuery, BeirSciQuery
from ir_datasets.formats import TrecQrel, GenericDoc, GenericQuery
from .base import DatasetIntegrationTest
class TestBeir(DatasetIntegrationTest):
def test_docs(self):
self._test_docs('beir/arguana', count=8674, items={
0: BeirTitleDoc('test-environment-aeghhgwpe-pro02b', re.compile('^You don’t have to be vegetarian to be green\\. Many special environments have been created by livestoc.{1667}, 12 October 2010 \\[2\\] Lucy Siegle, ‘It is time to become a vegetarian\\?’ The Observer, 18th May 2008$', flags=48), 'animals environment general health health general weight philosophy ethics'),
9: BeirTitleDoc('test-environment-aeghhgwpe-con01b', re.compile('^Human evolved as omnivores over thousands of years\\. Yet since the invention of farming there is no l.{283} over to farming we have get our food from the most efficient sources, which means being vegetarian\\.$', flags=48), 'animals environment general health health general weight philosophy ethics'),
8673: BeirTitleDoc('validation-society-fyhwscdcj-con02b', re.compile('^Many of the organisations that run child sponsorship schemes are dedicated to improving all of these.{594} encourage the sponsoring of children to build for a better future alongside other charity projects\\.$', flags=48), ''),
})
self._test_docs('beir/climate-fever', count=5416593, items={
0: BeirTitleDoc('1928_in_association_football', 'The following are the football ( soccer ) events of the year 1928 throughout the world .', '1928 in association football'),
9: BeirTitleDoc('1998_All-Ireland_Senior_Hurling_Championship', re.compile('^The All\\-Ireland Senior Hurling Championship of 1998 \\( known for sponsorship reasons as the Guinness .{91} \\. Offaly won the championship , beating Kilkenny 2\\-16 to 1\\-13 in the final at Croke Park , Dublin \\.$', flags=48), '1998 All-Ireland Senior Hurling Championship'),
5416592: BeirTitleDoc('NW_Rota-1', re.compile('^NW Rota\\-1 is a seamount in the Mariana Islands, northwest of Rota, which was discovered through its .{1135}many animals, although the unstable environment from the frequent eruptions limits animal diversity\\.$', flags=48), 'NW Rota-1'),
})
self._test_docs('beir/dbpedia-entity', count=4635922, items={
0: BeirTitleUrlDoc('', re.compile("^Animalia is an illustrated children's book by Graeme Base\\. It was originally published in 1986, foll.{136}al numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket\\.$", flags=48), 'Animalia (book)', ''),
9: BeirTitleUrlDoc('', re.compile('^In organic chemistry, an alkane, or paraffin \\(a historical name that also has other meanings\\), is a .{191}cal formula CnH2n\\+2\\. For example, Methane is CH4, in which n=1 \\(n being the number of Carbon atoms\\)\\.$', flags=48), 'Alkane', ''),
4635921: BeirTitleUrlDoc('', re.compile('^Frankfurt am Main \\(German pronunciation: \\[ˈfʁaŋkfʊɐ̯t am ˈmaɪ̯n\\] \\) is the largest city in the German.{400}t of the European Union in 2013, the geographic centre of the EU is about 40 km \\(25 mi\\) to the east\\.$', flags=48), 'Frankfurt', ''),
})
self._test_docs('beir/fever', count=5416568, items={
0: BeirTitleDoc('1928_in_association_football', 'The following are the football ( soccer ) events of the year 1928 throughout the world .', '1928 in association football'),
9: BeirTitleDoc('1998_All-Ireland_Senior_Hurling_Championship', re.compile('^The All\\-Ireland Senior Hurling Championship of 1998 \\( known for sponsorship reasons as the Guinness .{91} \\. Offaly won the championship , beating Kilkenny 2\\-16 to 1\\-13 in the final at Croke Park , Dublin \\.$', flags=48), '1998 All-Ireland Senior Hurling Championship'),
5416567: BeirTitleDoc('Raúl_Castro', re.compile('^Raúl Modesto Castro Ruz \\(; American Spanish: \\[raˈul moˈðesto ˈkastɾo ˈrus\\]; born 3 June 1931\\) is a C.{1534}ighth Congress of the Communist Party of Cuba, which is scheduled to take place 16 to 19 April 2021\\.$', flags=48), 'Raúl Castro'),
})
self._test_docs('beir/fiqa', count=57638, items={
0: GenericDoc('3', re.compile("^I'm not saying I don't like the idea of on\\-the\\-job training too, but you can't expect the company to.{260}g out with thousands in student debt and then complaining that they aren't qualified to do anything\\.$", flags=48)),
9: GenericDoc('138', re.compile('^So you asked him in 2010 how he was gong to compete with DVD rental distributors like Netflix \\(which.{103}y were going to continue to compete as a DVD rental distributor just like the mentioned competitors\\?$', flags=48)),
57637: GenericDoc('599987', re.compile("^Giving the government more control over the distribution of goods and services, even more than it ha.{165}ply aren't competitive\\. https://www\\.thelocal\\.dk/20170829/denmarks\\-government\\-announces\\-new\\-tax\\-plan$", flags=48)),
})
self._test_docs('beir/hotpotqa', count=5233329, items={
0: BeirTitleUrlDoc('12', re.compile('^Anarchism is a political philosophy that advocates self\\-governed societies based on voluntary instit.{149}ierarchical free associations\\. Anarchism holds the state to be undesirable, unnecessary and harmful\\.$', flags=48), 'Anarchism', 'https://en.wikipedia.org/wiki?curid=12'),
9: BeirTitleUrlDoc('316', re.compile('^The Academy Award for Best Production Design recognizes achievement for art direction in film\\. The c.{280} the award is shared with the set decorator\\(s\\)\\. It is awarded to the best interior design in a film\\.$', flags=48), 'Academy Award for Best Production Design', 'https://en.wikipedia.org/wiki?curid=316'),
5233328: BeirTitleUrlDoc('55408517', "Wilfrid Tatham (12 December 1898 – 26 July 1978) was a British hurdler. He competed in the men's 400 metres hurdles at the 1924 Summer Olympics.", 'Wilfrid Tatham', 'https://en.wikipedia.org/wiki?curid=55408517'),
})
# NOTE: Beir doesn't handle the encoding properly, so it differs from msmarco-passage. However, we do not correct here so that these benchmarks are identical with the Beir suite
self._test_docs('beir/msmarco', count=8841823, items={
0: GenericDoc('0', re.compile('^The presence of communication amid scientific minds was equally important to the success of the Manh.{125}nd engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated\\.$', flags=48)),
9: GenericDoc('9', re.compile("^One of the main reasons Hanford was selected as a site for the Manhattan Project's B Reactor was its.{13} the Columbia River, the largest river flowing into the Pacific Ocean from the North American coast\\.$", flags=48)),
99: GenericDoc('99', re.compile("^\\(1841 \\- 1904\\) Contrary to legend, AntonÃ\xadn DvoÅ\x99ák \\(September 8, 1841 \\- May 1, 1904\\) was not born i.{120} in the way of his son's pursuit of a musical career, he and his wife positively encouraged the boy\\.$", flags=48)),
# Antonín Dvořák
243: GenericDoc('243', re.compile('^John Maynard Keynes, 1st Baron Keynes, CB, FBA \\(/Ë\x88keɪnz/ KAYNZ; 5 June 1883 â\x80\x93 21 April 1946\\), wa.{46}y changed the theory and practice of modern macroeconomics and the economic policies of governments\\.$', flags=48)),
# /ˈkeɪnz/
1004772: GenericDoc('1004772', re.compile('^Jordan B Peterson added, Jason Belich ð\x9f\x87ºð\x9f\x87¸ @JasonBelich\\. Replying to @JasonBelich @jordanbpeters.{24}for anybody with the authority to deploy code to slip a bit of code to enforce a grey list of sorts\\.$', flags=48)),
# 🇺🇸
1032614: GenericDoc('1032614', re.compile('^The CLP Group \\(Chinese: ä¸\xadé\x9b»é\x9b\x86å\x9c\x98\\) and its holding company, CLP Holdings Ltd \\(SEHK: 0002\\) \\(Chines.{290}any Syndicate, its core business remains the generation, transmission, and retailing of electricity\\.$', flags=48)),
# 中電集團
1038932: GenericDoc('1038932', re.compile('^Insulin\\-naïve with type 1 diabetes: Initially â\x85\x93â\x80\x93½ of total daily insulin dose\\. Give remainder .{115}tially 0\\.2 Units/kg once daily\\. May need to adjust dose of other co\\-administered antidiabetic drugs\\.$', flags=48)),
# naïve ⅓–½
8841822: GenericDoc('8841822', re.compile('^View full size image\\. Behind the scenes of the dazzling light shows that spectators ooh and ahh at o.{266}h special chemicals, mainly metal salts and metal oxides, which react to produce an array of colors\\.$', flags=48)),
})
self._test_docs('beir/nfcorpus', count=3633, items={
0: BeirTitleUrlDoc('MED-10', re.compile('^Recent studies have suggested that statins, an established drug group in the prevention of cardiovas.{1524}evaluated further in a clinical trial testing statins’ effect on survival in breast cancer patients\\.$', flags=48), 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland', 'http://www.ncbi.nlm.nih.gov/pubmed/25329299'),
9: BeirTitleUrlDoc('MED-335', re.compile('^OBJECTIVE: Meat and milk products are important sources of dietary phosphorus \\(P\\) and protein\\. The u.{1495}s\\. Copyright © 2012 National Kidney Foundation, Inc\\. Published by Elsevier Inc\\. All rights reserved\\.$', flags=48), 'Differences among total and in vitro digestible phosphorus content of meat and milk products.', 'http://www.ncbi.nlm.nih.gov/pubmed/21978846'),
3632: BeirTitleUrlDoc('MED-961', re.compile('^BACKGROUND: Current unitage for the calciferols suggests that equimolar quantities of vitamins D\\(2\\) .{1382}cy and lower cost, D3 should be the preferred treatment option when correcting vitamin D deficiency\\.$', flags=48), 'Vitamin D(3) is more potent than vitamin D(2) in humans.', 'http://www.ncbi.nlm.nih.gov/pubmed/21177785'),
})
self._test_docs('beir/nq', count=2681468, items={
0: BeirTitleDoc('doc0', re.compile('^In accounting, minority interest \\(or non\\-controlling interest\\) is the portion of a subsidiary corpor.{151}of outstanding shares, or the corporation would generally cease to be a subsidiary of the parent\\.\\[1\\]$', flags=48), 'Minority interest'),
9: BeirTitleDoc('doc9', re.compile("^Hermann is rushed to Chicago Med after being stabbed at Molly's\\. After losing a lot a blood, it is d.{172}lli grows more concerned about Chili's erratic behavior\\. Mouch considers finally proposing to Platt\\.$", flags=48), 'Chicago Fire (season 4)'),
2681467: BeirTitleDoc('doc2681467', 'Rookies in italics', '1990 New England Patriots season'),
})
self._test_docs('beir/quora', count=522931, items={
0: GenericDoc('1', 'What is the step by step guide to invest in share market in india?'),
9: GenericDoc('10', 'Which fish would survive in salt water?'),
522930: GenericDoc('537933', 'What is it like to have sex with your cousin?'),
})
self._test_docs('beir/scidocs', count=25657, items={
0: BeirSciDoc('632589828c8b9fca2c3a59e97451fde8fa7d188d', re.compile('^An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks usin.{1388}pared to both GA and PSO in these recurrent networks design problems, demonstrating its superiority\\.$', flags=48), 'A hybrid of genetic algorithm and particle swarm optimization for recurrent network design', ['1725986'], 2004, ['93e1026dd5244e45f6f9ec9e35e9de327b48e4b0', '870cb11115c8679c7e34f4f2ed5f469badedee37', '7ee0b2517cbda449d73bacf83c9bb2c96e816da7', '97ca96b2a60b097bc8e331e526a62c6ce3bb001c', 'f7d4fcd561eda6ce19df70e02b506e3201aa4aa7', '772f83c311649ad3ca2baf1c7c4de4610315a077', '0719495764d98886d2436c5f5a6f992104887160', 'a1aa248db86001ea5b68fcf22fa4dc01016442f8', 'a1877adad3b8ca7ca1d4d2344578235754b365b8', '8aedb834e973a3b69d9dae951cb47227f9296503', '1e5048d87fd4c34f121433e1183d3715217f4ab4', 'b1c411363aded4f1098572f8d15941337310ca15', '05bd67f3c33d711f5e8e1f95b0b82bab45a34095', 'f59f50a53d81f418359205c814f098be5fa7655a', '8cc9fa42cb88f0307da562bb7a8104cb2ed4474c', 'c26229b43496b2fe0fa6a81da69928b378092d4d', 'fe49526fef68e26217022fc56e043b278aee8446', 'c471da1875ad3e038469880b5f8321fb15364502', 'a2f65aae36fee93adf4e32589816b386bd0121cf', '97d58db3c8d08ba6b28fcb7b87031222b077669a', '3bb96f380b213d3b597722bf6ce184ff01299e14', '2450a56cfa19bb75fdca9bb80326502cf999f503', 'aacb4c8cbb3ebeba8045169333d9915954bfc9e0', '21bf7734d99d9967a92f23ded5c97a8638defabb', '6c80c53474a48d3a9bfdab25c6771cdc32fc754e', '1e4aebb032a75b186f6bc80d3ec72ce415d2c509', '278a2bcb2bfdf735f33bcd3423f75160fa349816', '5a6cf5c1cf29b080ed49707961c760bf4f68031f', 'cd22b27e2f094ac899b3f4795db0fd59d90ec4ef', 'ee05187997dcb548b86ab25e25a19a2eaeae46f8', 'd9a2c54ec3aaaea66cef9a664b704a056498d958', '41a5d7d783e7776715543a80f1dea31c2a6a416d', 'c91d076423d20939df90447c17f7995ad48af5c2', '115ab3aa4915185549dcb488a432934bc6e9602a', 'd3c27966e7ff87ea64f8e7644964d5d210bb4bd0', '239eb7a57f4dbf67da36d0c0ab2bc9ed7b2da740', 
'b738cd6aeb90fcc4acae1811adb7bb569b198f26', '6cbc15829a4c16189f1871b7fdb5ca850555ec5f', '9c244015d82b2911bcfa74ca68555db4660bda49', 'd2a2add50f11f8c5be0db504509e1acfad435817', '0f7606f0f386e860db2b6ef97f4c71f4ae205646', '4e5da9e9bc3695609fea69ce04f147c7096ade8d', '5c01686a41c31a6b7a9077edb323ed88cf158a98', 'bafa852ed1764321494cdbe4cad97d022cbf24de', '2c7434dc50df0b4adb11e52fb6c3c1dd816dee88', 'a2d7c5237e7e0f3ed63d04a10ecd33a2e289c0c6', '644bcc8870ca92db212ab96640c98b26cf4708b0', 'bc7355ebb81756a284aa3489edca2da2f67e8be2', 'e7508004e13b0f2d3d0c3b07f4f967b38561096e', '1143e42cd4fcd8f2564834138c99555cfbff20fc', '2838302385c5f2338212e81962485c7bfb52bb15', '302e4b35e9c55c367de957d99c53567cd4f9af40', 'a468e406d170f802dafba994dbe9950c244c7320', 'd600ce2e7676b6d5d97be126014faceca3650408', 'ca10ca753aff094b91f51785dbe7b387e1c50275', '5d05a1ec8ae2cf34ce2ffd9efa07c6e5d39136ca', '1ce9cb252c3c3c4083cdd8c51f24ee1eb3a7cb17', '8f75355add4f9520fc4ffcf525419c5a299814db', '23a400d9b5a70223bf15cf0438d3408d8923ef1e', '7cbb016c73eb05f5a2b3996af687a0a2681fed97', 'c36efedbc8c0aaafaa32e42d93dfe6352c1f99ab', 'b9190fa1b349435a0532c7cb6a29bb26a7f7c78c', '3e4e7eab7bf967c2fce4c4af41e212f9aa26af87', 'fdf8da6cad2e443280845663f2fd5211fa2d5316', '9fcfc1ea5a4171cc7d1e6de3931999242f8a0ecf', '4339003eae685c293426398e801ee7e79f5416e2', '7c55fe6aa32bab5f61cd16df4d8156a0aad47742', 'bbd0c02b60a5737f49b019606f1d8adfc8eb4706', 'bad5e3c5fc9da04968790f1aa4166aa570511454', '7e720d2daa6a72b02f04091b14ffa74b5f9d6755', 'b406feee35d476c4aa516bff4ecc5c4c6ff6c353', 'fdece23a929504045b87fac8ff2c490110d1d624', 'cd887424b25d9036cbfb817fa57a3c509297a82b', 'f0b9dc64f6df004d3f776031050317f0a7fb1bdc', 'd71e30919f03df92d76bd3bb8d113f1df6b03710', '93336ce843d4f83e8c22e02494880398c210908c', '3e4be8785a1ad34c60356385a5a7417f7f2d6699', 'bcb7894325606c9765810563e89ad4fc275ae010', 'da10f1c9b5a6253ffbb9c8e933993b773e49a188', '106b8c7a4053d77ccca319a7dd4b054f60cb4026', 'e233b5c4b8ea9e6daebee83b956bffbdca2d08c2', 
'59066ff5d305249658150618dbebe7ab21ae82ea', '3e769b6dd0b0fb3393dc894752b0afffd8d2d064', 'd127160840debb1a7edf38ac5cf02914fb2f8a59', 'b8485fe3b70588697cad5f46726ce18ca8afb77e', 'f0ebee9b612d517fc3831cd45abd503539e25085', '381323297bd2017330fb53c2d81b2802cd7caf88', 'd6e99b3ba3d2d8c00c50a154a3e5171f99ae2c85', '03576d44433ecb04e7d87b526fe8238a4ae6d15f', '09b48c8502fa63183b39cd725ecebb634f83c037', 'b2409618be98dca139d2b1a326c9f47e279bd600', '8d0d940113d281ae522a72662fef3d6c40f9d6cb', 'eeca94cc6cffe49537533ee37fd7ba5d18e70386', 'ef8ff6394a9e463acfd4ce2784c68e2f92a55e17', '14d05578d0b6b71c2f023b71e9fe71a6b44430da', 'ae8440cdf5ecbb8dfd00df723f92c34d21fcabb3', 'cf6a72d1eb2011d5a2b57d58c4ad9bd3751c3443', '4c3c3bf48d48b3ceee50a9b463b80ab9834aea68', '7720e805627d1f5d6c992fcdb0f2bbc31e133284', '2e440ff1540094c0608c200eacfd5573b424391e', '919955f6b198260e4d889a8f6ac55feac3f20ac7', '88f0d71a63552a7cee72f7e8e18588d0776f6d8f', 'b4f7477fb596e4933323f153fb1e287bef8b1335', 'b52c1f8077090c660ab4a22f47e7f8483b4bb7cf', '7486b9ceadd64e496470e75b315eac543aec7f2e', '5d068f5e3dd9813c3b108f6eb08dc436a134a218', 'f36bb6f93f3045d3bae924411ec7a07be0a49c6f', '3dbb1997727ad478b1e2a6c8c27386cc92fccf9d', 'eec9b96d796a8a4b00af1b3e2ced301dd4607312', '5ee347218ba58940df02c35fd7fcf1795ff477a7', 'd5a8509199984393f728f4268acfee97a8ba4ff6', 'c1220fd757136150ee5f55e83b12cdcb4302048b', '4c422a7012858a200af4967b11da8f9a457ecad8', '44e6a642e05f9c11f2df02133d9b57d2a4b30d50', '02a4293cb083dd54a0d685dd4fa25ce165850557', '6222bfa3e282948e250a8688027079380012b2cb', 'ee666f494724f4ef3949a5532d59b207bb42de6b', '14e21725a5e25978b25ec4e27dc190d2e2ec542e', 'e37b6d3840d037f5536404fe65c81373c2574d66', 'bdce2c4a25826d3242e04f993af16c10845fc78e', '92ec3f0e7d0cd30a68b901878b3d27a752a6705f', '85c45753caf576297cc43dd285ca4d78f230dde1', '1c8bc4fe215841be7290956ea96502a7e494e76e', 'b0574087d5b0b4d4ed4819bdd590a1b7c2024802', 'aca085cef4b6c042886cb608bba803036b704000', '4da844e414c5ddc177ecd4bcd73ed975ea8cfd23', 
'924c6135ca4faa043fb8e0edce850630863a0302', 'b72c6b29224871d4ced87289a225b1ed47cbc6fe', 'f42b9bb8e50573c66e9b9166102989dabc76fe7a', 'e4a2496ebd30c6f2b882be322eb2aba9de3cd15b', '294ca24667d7c88231492ab3bc9167a4ad958456', 'ec658577c3612814cf1a2a0f7d55457f42e788e2', '368eea6972bf8140324e5c22684cb60b52a7c35c', '14e9aa7473b18aa729315939cb5b1e427275cd00', '9316a26093d195273702effdd9502a077f1b4dcd', '5bc936d39907e99068ba1d07f3fa8883ffa0ca74', 'a1f4ea8b9f875567e71f71f00bb3b0168642c91d', '21229ea3dc420e19b3b3b57345dcc2a5d6bea98b', '04257c2d45c025d1bac07119af163dad7b1aebd1', '27d6cff7c45020b673790d94f92a58ae1880027b', '634cbffc7aac82a1a7843db2d9c1bc5a351f6849', '866696dd758d404291195c651e254e428b6a4be8', 'd24ecc520e2d4bd7980dc5bce547791688a71f72', 'd8869d381387be11924a04b4525f9a408ef37cdb', '77db5dbfb33c0919f60c49bb5ddb99861ffef474', '0b5da93fe58dfc004d3569998e00f979766af658', 'a90b2fba001ea50302b8ac1023a06ffcbfd8f7bc', 'c5ab88898f33f2a37e4a6ab7563f562264c47854', 'd5708326c5e682d933ae1539b424aee93d6b7188', 'b2d146286843822549fa46ef0e5263ff5b8ef436', 'd66c46ae198aec982bb9e98762b39c9bb11bf6ca', '1f7c71361c065d2a0586be3868260a81122648d3', 'd76e965a8a5fb9139e306ea5052896f5816358a9', '9bdac06d5b9a9804f3f5ba029aaf3a974ce831c7', '284f78d2b7fb96868e1ce7cd4ed02321c450ef68', 'fe7fdc4dbd9d45487ab6c69caeb6182a69ca2019', 'eb21adaf5017ddc593cf9a3f252adf31ee240645', '1aa9501d7ec7084f8500864b3ff808100c8045be', '250019746203612925abbe02d83589c3738d3982', '60127d1a428037c1835292ddaa3dbca95fd12ab7', '518158093bc0234a8a7c9657cc03b79483c21a76', '41251b17e995217b2417585e2a44cdc07789a0f4', '014951e19c98ce58a2607fc12df411bacb982d3e', 'aeb6458d44cd2fef802fa5a9f59f87d62e0c02c0', '1c7b5df940825c71c97efa97e309bedd89562635', '4fb91b679f5e987ca6c3a9948f0985a52e9014e4', 'faff869e53b2f379ab23afdc01292a300132edba', '3fa12b2e36163600f62bc8fcb4946ad734cbdc00', 'ee12be5cc21a34d28f5a98b68d0ccc5c416caa12', 'ba91b468f06275fb4b7882421efcd8070aaeec07', 'e8309b9b44361c03231a62bf59d2587185a7e81c', 
'eb7f5ffb624b9e82a825b27e1a3c7d23e2527a35', '468a2bb7a12fdd1dcef61a3194070d7d9a644fdb', '6983da35d34d9456bc6184e36d92426c5a117e97', '969b3eb194dbff25b137f29cb9a015dc38b6a2ac', '321e868c86d2a3c86003ac7aebe374c1eab25b81', 'a301b95dccd5a8fc6362e475af50182bf6a1caa7', '9396fe8096d937bbd482cdafe29e5c4e1751fd06', 'a9ba36a7b0bc90a9b85574c1c597805f05771e6a', '7184e02e6ccfe08ca3ef7c3b16be2023b6be0e24', '25917acd4d1e96faae4398452eeca743dd64b2d8', 'd0fae819552bd425bfc3d780429bbb7b8d7a4d0b', '410316db0cdb9001e76daf3e2a27ccd3c6156042', 'ce49a2c822c9a27efc00cc7aae022e8d1aefa982', '78d2f29d9b5af247250e04ae1e686b0c6886b2b1', '94ea5678154d34b270133ade5265fe21a551e2cf', '38e58fce0b460951ee28a2162fcfa7d2847f4ce6', '7783feae9f5f2abd5bf1584b98a5707c519d4769', 'bb7ecf6bfe1776320f4e7d68c56435573aa5eef4', '3f7eb16d88d60db473f703fb8137972293b6eaee', '55f923c75fb5344d4df8b3fd12e16bcc49db7372', 'ece8d11971adecfb15c81ccf4e0c5b2b48d10649', '0d616bc6963f3241bb2c417b4a584c0dcaee7125', '396fc63019a3dd0e550f6eecb0bfd1c34601aa22', '592623484c2d4a482b4841100eb2aaaf6fc85ead', '15667d08f9a2c4bceb5d6e8d3a368dcbdde75bd5', 'c30bd16c400bee7a51d7ce9aad20560d01e28ba0', 'b5a5534e0d3104a634f61e47773801649bb277d5', 'cca7f73fea0ad9d96622509f5428ed8410421948', '17b3bc528440771b104d3df884dffa417f61000f', 'fcea8027dfc0ca7b8871084ad55c46f09615df22', '410ed30d7eb5fbf18764d15089d15c2f68896727', 'f4ff2c3a64c8e094d46bd4ed89e9b02897b9937f', 'dcf92a2d6e5d96678031f313b8a78b6d8e4fdd3e', '50c079e60bbd843d32e46cd1b9aa7f64daa5b8ac', 'aa9d4403443abe31b7c828ee6df0b21c155f3dfd', '9d7ea82ffc353f5be53e245981bdf6c0e8e93839', '6098353c87dea12e0a5881f66ccf738face30d7a', 'b91b62138e674d8e35edd564d785550775d2c745', '0e31b1eca8f03d8b0cf561fb0f76835f7ee7f91a', '6b1ee3d9df1356725cfc04a6316725415e925fab', '1b1de1c77f7aa95eaeafe3bd0e8fc681c13e5a49', 'cb2db48e636a4d871185d832074f68449f424a59', 'fee8292e18978a34260e4a500ccdef9f1994a536', 'f276c78c60e04ba9b0329be8446faea57366a236', '41a0d41f46f90c017b539acd752674970b54ae09', 
'a6c0d67613e238e73ebfcb1f9b90d7e248a89a45', '91df33c9139df01b8b42b9650d8690b6bcb76bb8', '269ff07c14a212d4f3f7711fcb1ccf5ce1b9450f', '2aa083a360f5a1d8b598b63d098f8c1b19e428af', '4d6c669dbfd103083f96d2e8a893a31738ab235e', '79262d5d32c1c7de6839dee1d848121d92d96c37', '6b8d37fd28d1dbde8cab08e032eb3830e994f8de', 'bc4e5593972e8cf713535b0fdb6acb1f5cdf93b5', 'cc041ff04e40f09cec667065ac30b9ac3ce2f3a7', '1c312bff25f99daf788c9b8a6db902baeb3dd5f7', '5762d52d8230068883364b93dffbca73c809e49d', '8a19a2bcdcb16929da444ae71631e258fe0b4bfb', 'ea5a76e30fab975e2549bc798bfd2c9bada3e33d', '9a872f7533e423a59aacdb54fe6139fdbb4e7cde', '7d88721118e3fcad46bcc943105c2e9f478d5fa4', '05c4d2ebc8a5bdfb32714fdb1950616891074b18', '126df9f24e29feee6e49e135da102fbbd9154a48', '405dfaad697076f3ca61b50d48db45722bc3b503', '23aa5210aa633ec78e3ce1823cb9cacb18ff7124', '6fd38c3d2f0a455c1223a47280485b26c0fb9b65', '9155a25fa50c2df0fc4e155f8f1e3fd8679ae4bd', '29c18531bdba93cbef2c431de4047c182b642694', 'e01d12ea29720c96003f59ec74cd56b70571bc42', 'abea867b5328f52c0c6beb33f60cb4876ea14595', 'a333ac76e4b2041ac684913b23a8cd719fd46445', 'd6ebe01eb11e9211760a220466238c368f712474', 'e5f06f97eda1af0488eaf495db763f4044a52769', '95eecfd28ea0f3441d838a50c5bfbedd3550070a', 'a5c60b655425ec47bd1119b6ed33edc96071f10a', '04f1a046c66f87073660a839930f1cb6200886aa', 'b43b67fafeaf88eda5673504063188a02d4e1b45', 'ca2885d3eba2983b82345e9362adeecac63f3ae5', '8d677c93182b8163e8bf8d6004906c79c1c06b70', '957d4b3ad38c69ee0b231b385421fac362ca5d65', '3560b2618e9c64cfd587b87966d9f19ff659a148', '8aa041e9e43afdc538647a3f68305538cb321003', 'bb6694528752e33fc74a6ad2d28ef9c1a7b8d750', '1c21e5c66cceb27d56f5823934a0fafa0157b231', '8d3777856af3010c935730ff5b3f482f259f0e74', '653818a8f78fa6b45d7d3fd89af84daa96eca38b', 'eeaf75f79f237b450976b6305724c72400c84095', '7b72aa337f1e5019ba47d92c26917062605fb6d3', '5ac98eb478b8c7b1b20bf7dde0be6ecea38f82e3', '78495383450e02c5fe817e408726134b3084905d', 'ecc84e3ee9348fb182b282a848159a24423efdbe', 
'dd4041c26c4d50a7966697819a2bb1ce0e4d1783', 'db38399a4851e72187322ae7f7f9faa3f2eb69d1', '4068609212f024a8cf3c5b7bc755d415f269ce74', 'ae2a4c6f2ad099cd8f5eb66f1935cc1777244bf2', 'ed073c437190660927387d93f41b6db3b2684311', '5bf7cc6483fd054d0ad7f3a37eda94db4cbb6e58', 'c0c00fd1934b61b39ecbbcde007eb919d7a59bd4', '0638b5dd509cd4f6d68d10557967e7b66a741852', 'b88931cf54a7fc51e1b09fa3fb99ecd6cdf41d6b', 'cb9f43961f4b7033d3730e258b14bd5f59f242aa', 'a99f04d7250b5b5c29e3cc28a6c81dd0eebeecd8', '64835ff8ae3412811c182771e30fd33841cc92a1', '15b991b966a9b446705f4484e3380702121e470e', '2b497978211c471e4a5586a2d99dfd087b533b4a', '9313677ca439b3c63a591786d2ec4b3a192aa32f', '7e5110d13ce5a393977c1b4aca7c2a7c06680392', 'b04645579b5be335ad0e107e248b3c1885b9168d', 'b0ead0e018797a68bde2a5cef44926dc5dd8a27c', '1a93897ed610235bbd42debeba79d7cd3d37a28d', '01924bb7ce5da3457c4a20006e2dc2b92af72434', '07bbf1e69528718397e5ebf9a2101b8d9b320743', '5f074195e88c9a6da54602ccb5d7a755668f055c', 'df207c92b2196461da35d5fa69f0968b339709e3', '9bd81c4e318cb217b29f7c381def34f1d7454ecf', '850793dbb35ce40ba591eaae09f59b06ea27f4b7', '8eb24b298856bbe5b13590f185152a3af198bd2e', '353bf82f2ffa870473a814155b4214f93af41bdd', 'b4ded7e3a94e831dd807f0078ca2bad8598c8578', 'd7e7a08814f2d6456690334c04c832a117d35b2b', '6b8f96342eaafafbb4c87e7e6913aea2f9a663f4', '729c81595b1a792d08c407fe5d57826020837a53', '97e1cf63054283e8df52425cacd22eca1eb53499', '392ce361c5b9d0d948baf2a4010676dcf592ee7b', '6c60baf027de044cd8236deff23cdce78b525361', 'a10c03e8084116c112a954da9ed8ae59c426e356', '842ee396e6997b5036fb4cf0bdea527bd37aefd0', '41de39e0cfa898bae4a977f45408ce19dab329de', '21f427b1b38c45e499480f11856b9fe30615eec2', 'f80d7a7f97258d7642491434785db78c195c6f84', '807c519bb57af7329db35dce846849c900b7097a', '87b39a2592f2b96f8366f7467beaa406434dc134', '198e0e490533df571b9e6606c5b8a6e54afe2065', 'e3699b84cce7e3e9bc26f6450dd261d29721f00c', '9ad6f491f498a1c503c5f80a8acdd03156af1429', 'f64a08b8937bb243219905083f6396f11e33654e', 
'7f0264c0e4abb2d6701ea5006bc33a8e5b1b569f', '3ec328171fc900fca1d324ce25f238718cf7893c', '2c13925dce07b0b829ef87a95beb1942fe1f0e1e', 'ad64a762ca7ff3fd65cb065c528740a37b5064c4', 'ec9d2d7868125331d6f34d0cdef53cf27b450820', '1b39e04c57ff07f5614f81c0c4797ee50803122f', 'a7065010db5543c02d53f602ba6931e9f0e69d8c', 'e0171a544d32782a4c3882cb3ff34dd057a7ff49', '57c587eedfd0f25f86fad3991ab2978398c890c8', 'b441ddf343911c1a0f8d6b3a437d79ce00c68f0f', '89c8aa20414672a4591b2d4df82d334137fff44b', '64e1a2b174496b919cbb3dcf50902a9af5f444ca', '53c86b26f02f56e92a38c560e4aa5b046878c763', '4c32870cc6450cb596cb74b01eae3bf7144d03e4', 'f878ae929922759f6a094d6c856a23e939858853', 'a73308d440625a67c80cd2d5c9090008c9be9f80', 'e890bfb449753bc794fab3cc98ecf8b66a2601f6', 'bd1ab7fe6df2a5dbd292f5a89e777972c1e500bc', 'ab2129caff74c53defd559ffcd1f12d793e8f174', '464023e4041a378b4826a1ce406575dc79b473b7', 'e8bf30c2c133c6e3ae6681909efa25d8edb8971e', '5ee43bc72c88f50fcfb6fdd710a06225846ce00b', 'fdb874a5c8e58f8abccf525a727a3413e1f1b759', '643c8c17ece46f4aec5db67199cc40bba3fa2a78', '0d286827a2bc9e7de533277d2c11255bac66496b', 'ea864047e3fd214878beced6a648408f8eaacefa', 'c0f8667b872b62155e66e57e36e4bb4044506747', '61ad1c50d8bb1a67d33e0020673f3347dc85935a', '6a32f50bb7f92445f5e5ace812ba146e9e2d7e63', '2a2bdad21be9bdc6af948462abe2a222f099097c', 'c93db39154a1eeb6085abe6119d1f9e4de4e5c8f', '69430980fe2880b9b06f72e0327ec15d669b1a54', '8e6d070195d4a5a048aa8e8d06cc2798afdc11ca', '0f1ca0e044c0998f0ea294ce5c429a574916d601', '2d9a5fc8c6affb69e1cb38a0181cd8b1b38e83b7', '9e5a8818e1386f42c59a8aeac81d32a12f011d93', '5c250d3e035ef770850d36599e29ebc8c69d7dc8', '4fdf040a82f1f15a066c9a8ea0943ec2a6358395', 'ea1a9eeab82cae242f8ae6a8b0f8a5fe8ba27845', 'fcb3a54d3a6b9b339eb6f8583f84cc10efae8986', 'c88bebb40a245d1f5555bdfb80f1b902be62f936', 'f4aebce1b34d9de6a2a403641977c7870d6f3918', 'fa70d436d25ab51992efac193b29fd398a90f5f0', '8a14cd4d89d3b9431bde792d9d826d7c2c407383', 'b4066a397e303161363ce89862132fdeae0199a2', 
'8bdeedacab0b3ecc5968c76f219e501ca6659176', 'af4b5684fbca9ed7b50c6df3f72999b036ea8e79', '6e7b61999ed8275efd2d4b34e4f53e783e8a9164', '05f7f75c3cab14e941bd27b585c86d3998f412df', '4cb570c770dcdf88b48d235ef280cc8e2c9beda3', '1561f98b6683778e36b0df2f7a7f079069bb2a3b', 'de16ca9859ab10c36e8aea8eab09f5143f68b39d', '8e57c4b3669133e09961df068b7ddd357a2059eb', '34fe25f49a3c7e0723ba4eea466884f7ef33104d', 'd7ea23352a88c086c148cee79890a63470d16b8d', '7a05002e8edb4c28e40c57e987c792f97bef71aa', '35e90d988889a16545583b267a1f931882127e99', '3678067dc8d6ec3e787a18c8e99762cbb4c232fa', '787085f7b73000b94b9dbd59be04fc0b6596f720', 'ac1464a6d98cf0ac9a951eadad9dd065301e99d8', '0df69845757c31a9329dc74cde93e0c2138a6896', 'cc2ecd92bab62b16732b9d20636f6034fa439a91', '49838912325b6d046b6cc14d27b062c5f7bc1449', 'c7037b1179e4d38336b53b5727845f56dedc9a99', '15c174e00313143f52f667a4c0ae3b4a31ba07f7', '8ef4ea7a639b7f7e1ae9ac16177900dc6ca76000', '091bed11c0694e311fa5676a34b03079d62e5472', '81d86206f59ff83c5a46c584c6b461a95fe3dd32', '9c1d843c92c3efb30f0a151657b0a0ad92f2ea62', 'b5ccda12348d74005d9e928a42a7671a337b8ff7', '0b588ebe591e163d8b1e1df7982769e08ae1eb0b', '21d41af61952500538e907673613b1d3b765acdf', '0f79e6324b3ee895cb672a68c7020b8a7fde1b17', '4ed3802e8151f582807363e61422f3beaf17274d', '043f85b09645e788f5b069c7542c348dcdfe173a', '006ae86a02d8cdc2b471621ec1761583577ac484', '5d0885053508c58638af879e0de3376d3b3ac87c', 'd1176e115e3e49775f15874142e41885b1a74ce5', '7f2a764a9f5da7b2e5dbc9ecdf7cb85cecd9bc3f', '3b93daa4adee016d5fd0439e3d52380097c5a099', '8b6b4d9766c062d0e2ec8fde014bf90e98538049', '9fe20cfafebeb2d0176700fe1c9b64ccc583853d', '15f716405fa7fc11d3b59f19c98f2ec28734f8a7', 'dec613220a62e998b20037a447c1782f842def1a', 'e88087cc3fde1c50a9255a24a7822103d5eae9fa', '7b9f983e9c835a325b64bd15449d284f38d9a5ee', '753d9142646e875cf9ab1d1de18884f936e6daa6', '2d6d6226fa47bca22bf875335d45ff4347d9e67e', '77c2a4233cb6499d3739ffc530635f43960b41af', '899a5b6629a067ffab4d9aa570b0c13294c42983', 
'888c1a0c5977d413ee58e0eb84042f6d5080e938', '6a6c706607ff24a4868d5f16497fe0ae3d8e6859', '6fd2ad1aae22d635b382ea4acbeef67fae3af5d2', '975aa6ce3ad36ea2efa34fbca953c87f81657374', '574e7de856c34f980ac80336c913ae968b3c4687', '6f003acbc28d7b323862888d92cf12db16d4c873', 'c05874dae758a7456ed10679f538a00616e94302', 'cec1faaf96cd970a15dfbbee015df73cc6770a0f', '86d2f41be9a37be8e2b36ea57a2d41e82c61837c', '97a3cf067eb6c7849ecd0b47e215097a485170d1', '1b39e43ca0438d348a333c13c73c59943fbb9c28', 'e05fd4a3d18c58d6c84cfc82810a6244328b7689', '55924efe87fd5ca2642c14b5c9e6ffdaccfe9143', '6b3c55ba65d3ceb8f9db1472655ec0ce5a121aa1', '43720fe1b3660cc2071c2f72a05516f7b287c966', 'c33e1b6d5a6fadebf50c9df1ff901faf366413ab', 'a4134433d5d1fe9f83b44c04c59636d1a85120bd', '4268fdd19ed1d233dfe45ded139dedfa06a193c1', '9e164ab067d610a244224d74fa0b21aaf217d54e', '610c1c4ddfb745f44885841ed293c0d64d225f58', '6e0f1085945fe359b46b8a6b583f8bd3ab48d953', 'f300d56a3dc444c8f2ab7a4d0df71f10dfe0467d', 'e4a07483c81e90b2278ef29bd7bdc3ccde6d1ce0', 'a7121630c53b6cd44bcc975c293e9bca065498db', '53e4b45e7e652f55276b47f34e66f29f4e33df0c', 'b011484ac12278ce2fc41990b190d623b328306b', 'dc7df1803ab4bb87dc92679210d7b69ce0e237e9', 'd33a47e79344733264e95a554fa8d5ced7c1061b', '4bcd7aec3bf2493b57363f59e890c0a49a6860a6', '8b3956496fb31ca373991b136e476f883b8a0937', '9837eac799c279e35ff02f80e1651a89b4962e07', '06eab37d7f50713ca38a89744ad35c4e3b596cc4', 'fa4f5d701711149913af347be12de46fd0a192ab', 'd6b587f28be7e334b29881970ebc53d22d4801d9', '4bcb6a68c40d11d97fd33bafe8928c7e8bc784e2', '21e58c2114c2e33d7792881f95dd73ed4532e916'], ['57fdc130c1b1c3dd1fd11845fe86c60e2d3b7193', '51317b6082322a96b4570818b7a5ec8b2e330f2f', '2a047d8c4c2a4825e0f0305294e7da14f8de6fd3', 'e105fa310790f91644d2d9f978582652d2d4de55', '0e5fea0a13594cfc6ab9c8cdf3095ed16b728e70', '2883f7edbe5d4a80bb694a4ee36abce29cab5706', 'f191a434b8ea61ddb6b20cfe99e65dfa710ab5e4', 'a24109c954c160dfd52ea6ff107b9fe6f75da0fe', '506172b0e0dd4269bdcfe96dda9ea9d8602bbfb6', 
'9cf5415eaec3c9cb70ea2dac92eab7652f829fc0']),
9: BeirSciDoc('305c45fb798afdad9e6d34505b4195fa37c2ee4f', re.compile("^Iron, the most ubiquitous of the transition metals and the fourth most plentiful element in the Eart.{813}rk has begun to take advantage of iron's potential, and work in this field appears to be blossoming\\.$", flags=48), 'Synthesis, properties, and applications of iron nanoparticles.', ['5701357'], 2005, ['82b17ab50e8d80c81f28c22e43631fa7ec6cbef2', '649ad261855d2854f25093bf3efa541b2a249af7', '76eeff302dcb0fbef8986a6f317a70bdc1b263be', 'c3fcff9920c799eaf96378b289f79c26bf61a049', '4422e9e1655c09b558b3898f4b30caacd9bd3429', 'c93df6a2f47c04e3b45fcffcf5dbaa4032e65399', 'bc01ee13cd600fd91cd61a2367fee18083d38d2b', '52479e45194554361fb9d98c9ee33e23d252fefd', 'f6c961488bd11b9a4fc210b047f170a55a9eafa7', '1230a8fa38065b5310385d4d654bc425ccbbb6aa', 'e350e0d36f2f378acc8e81e773bd44e9ed22e966', '21cf8ba5f7965aff50269fc0115ad86e90aeccf3', '70fe357999f3860ea6ea10cd59d23148d7c62af3', '81316082c6c3dadc10ba97c1967979c02779ae4c', '0eddbed524231fd153e45a8c395601d754863468', '81c144d26df3d344b79ed36d49fca3aa20c9552c', '3f47268db51e4d283df6f3ca130e9913fda7c580', '849878e5d1140023cfdda8644052b0b2ef6d2aa3', '0b5c42f099d973605a9a6467baf9031fc9ca9a7e', '700bb2eabbe05de449ef18c74eae338aadfde954', 'ed10bf3bd7b9b006a916fa34d910f20fdf83d631', '9b83f2c29a1523fb765aa6317a3221892c3f173c', 'aad2912ff93b99a2f6abbbfe4627a82326a33565', 'd2e5db5f68170f768dc27407d02dbf56be7240a6', '56ba06f04c8e80920ee46b3a1c37de6918e0da47', 'fd072638033a25f2d635e12e17d4856b9611a1cf', '3f5a0e1110855a442e0fe31bd15c3006d90ba683', '0d2a00690d137883805ea160305ebfbdb9a0e9ef', '0b4a03a77b9c066421b94f18d06ed5051586fe5a', '074e2837dba47e756f270061fb723f06868d550d', '219872c6070a2f5400e95930a5e659305ddee09e', 'ece0c28b3b60672a2b2d04a529fc43a2bffa80e8', 'dccedf91d83f215186acbaf0fee8cce96630e69c', '016a58d257f1592fc4bf4f409d46e34448af9328', '4bafd7734b3f54b8d55b9e0755bef768cd96d1c3', '324d172296533539fbed787a3c255f883f0df455', 
'76e522626f8522e89e4e4334545ffc45d28b1c1b', '62226d54b69b7594441707e2af6f15d7c284b82b', '3692ac11ecf46a584e47d3784cdc8589628ffba0', '7982e3b444f78bc83e69e3d805e817fce0836d9b', '3fa55a993ccc2ad96d30416bffd346ba87fb47c0', 'ae9ecf8f5a62cd6145305b987ec796aeb0277d32', 'b9d80a195094d78080414f55988532d53a15d2b6', 'aa3d041859dd502b575638c7f118b722f066dccb', '653277b69172858b6972854291e0e716d8f487e1', '97e1b60b4391cfa956ac682bd9cc422694f542f8', '1e29d7d2fa274aa34e04ff223ca2e14a1e38154a', '3c57f7b2b31363cc370bf6d0ac16d47b8f910d58', 'ba2135a7f332e6cbc15680e57215286baba69964', '7311bee2d68d0bbee8a33070e681c7eb225ebfc5', '73245bba0fe6a8a344a3243b7b471217b7451e9e', '2b14b5d080181e1bba344047ef799ed6ed944298', '27fb0b03765798505a08760c748fffd3a0f477c0', '753670263c3ebd07de930a4e08b2a958efbf5601', 'ac5a49658c3f623e4a5c834edeb3a05c6443e72f', '2eab1eb680cac5787ae2b151f542cf37c6021ae7', '41d5b1ea2e9c040265e75eaba77d47dda24b8b71', '0ea10201b14c37f09cb707eca0ce611511b14fbf', 'beeb798dc5a4b1317d311934e85db7aada84632c', 'c4b70f1dfd91635dfb26adc2043ca50570df2ce5', 'ce8d1f138728bbdbcbba60ce511c1b01d9099ebc', 'c853a66473866e6a84f42a7b08158efa204926de', '1075e3a139b142fe8b020f0b37a96967caab72f8', 'ac6750c5f26e3cc6bddcf94db6f12db0c806e4a3', '11a9b9c13b00ac50efa85b7408a69fbf8eb0914d', '1755b4180e6f160645ca0f2ecd829288ac3558b5', '376f5f4b92a36b799e75efaf91f525aacf41c6c0', 'a2b6fb94bfbb0a9ed912df378d515fe12fc50c21', '79d0dded1bfbd307b1449e9f0515f384f86ce9c3', '82c0878c01a449e35d0d55cdcaf3a5a117dc3dbf', '57c6b7148d049c56a128b1c785a71a060aa5fd83', '314fce43a13565578b55a24dae9a96ed5a666c42', '0b07b032ba0eaa3082909e710c39a7c0baa6a712', '29b8c3b4fd393acc338f1af30b35dc0ab4c9abd4', 'f62ce373a2984a4f393723bccd931f254e476391', 'c8333d1c7faf00497eed14e6c0b8e76e3a3ee0d0', 'f7789ef50c67e3d9d57dc9d3b15a7f6f720f76fe', '1be32177c7a1b9c65dfacad88f0a135164b44246', '9359d828b98809f71d91a913284ca0b3a864b168', 'b42f4ff18a858ff8f02e6b144d295df7fb24a862', 'bcccfa9ea926dc235a1ebfd6447b722744628c80', 
'f38611811b63a177d10273b9b3d2edd3456cf8a7', '2d574bd536a3ffe32bd90339beb7c194b91c7af9', '01302fc606ae1b1d5c7fccf0ea938e9fbcc74fff', '5d610cc0cced232b7ff3c8610b7e9f035a06d758', 'd6f235ff5f6a85c25d5d239cbe9a8256d7543ffe', '30b6c4d696985c2d0ae8cba908fcd09312de4bb9', 'aea3bf8bcdf385288c2a9f8abc594ae894976efc', '8afc9f9315f3549c3ecc5f8db52a55533254af23', 'bf0bfadcd6a7d24cc07deab36c18ef54daaadce0', 'c0a22c3f7cca7befde2ce5d596bac180dd64b531', '77ed0d5b23793683b26e40f894bb11f6978222cd', 'a4d3445492f8b44163704ea90d72072df228fc8a', '0d2707b130fef30620c3b62d48e59716fa22a43c', 'e4685e3bf2583d249fe0163db58f620993601774', '5df1f62d5d94e4902064bb13caade24014824ca8', 'ea6525e27686631438c56ecf6446e8718a9fa39a', '158037eeb0be786d51577d0db76141a166b0006c', '099eaa1b094a3e4228098d7cc85a5ddd133bca71', 'f3cfa4ab011b43468de46a109de6eef21d1f8d7a', 'c56349ace2995868973d5f05f08a073dd1d7d9ec', 'c4f6b2eba635152d382a4b14668c11f9c367c343', 'c34c62fedbda674aa0f16994a0978092f7dfcfc1', '9992dc990ea5d51c9977dae7dd97410aa59a2a3c', '8f23e205e11b4ad316791f200913e35e89516377', 'adaa03397ea52277404dc19b791b25419b31c373', 'a20414c9016670be7d6e11bdfc9ca7f856f749fb', '909d29ff282cbd443502c5c0d96a613c67f33629', 'e29310ff23984b0c20d1df175dfcf48b92fd640a', 'c4ce94a35fdc50bbfacb8224f852d74259605671', '6fa1ab72cdedb42156299615c740717a44158cbd', '867a0973fb128e373f02b35407fde004e8fc4996', 'dac854b3a2e5953da02a181d053528c9eba3d975', 'ff5ed956fabfae3df623645a3e320eb8e2dd5e41', '351b44c60e6a2e02b1915151904c215a25a2cbcf', '0bd95a3a558049ef440458d5790c36f7fff9174b', 'b50647c4c2d0a34a2399e9b682f7d7387bf0e547', 'ac04f4f12cafacab1853bed4eafcbe7e4afb69a9', '0cf7a2b868929aa88fc4561b83f05f7d58fdd021', '64dd6882be209c9efdc496be3f7a399c02bfa9c4', 'de53d4d37a37668b26a54afa7120a5182cfd786a', 'e12c59792ffdeb75a28f3ba81db308583c0c17f9', 'c976caedc14e0400d021ea3d61199b88d827a9ff', '017e2fe5742d457636a86d9821729aa05cd3bdbc', 'c785c0f92a180f09601d9962f3aec18814e649cb', '74d843664fac2f50d591fefd451febb219c672aa', 
'4d4e54fcecf8c2841295a7331382a1047a5fc662', 'c15241dff7c872c86df0990c94f9883b5af1808a', '7d1f3d5a1d7ccfcaea5a24152bc7c1d0fe10d62d', 'accb8c725011e522f87f15b7f2a1951f50726d98', '9604163e2a0e3471736f6417b7867448a869356b', 'd55d770616c3199a53b352ac618633cf10ee4110', 'd3c61afb3a20b887c5528ba87af70a47c557d505', 'ffb56b30db82123488fe837b080b26d69de1647c', 'a8b884d6164039714a5d418dde0a5a8fd6693120', 'e32cb4b6c12c7a26c9e9e3d0321d255b95623856', '6f9af3a5881b9e6c2be0f3e4f56d08ec46f06166', '782c86ae6584aac656a8ce62876188b16afc7eb5', 'ce94afc8f8487ffc21981377c69f8f818caa0ef9', '443da6abcaa3cf6dd145f8dd557e19084bbc9815', 'd04bba486e8c9de2b8806e0b99c48ab8966449c4', '28d536de93f2bd84243ae235b4cd258fb48d88ea', 'a35a221b962e4a665412365fbde6367d9c369d96', 'be2fc72b84bc786939c991f2dcd021fb1dd7dff9', '68f36ad7e630629c520c58ce4fe9ac4d2491aaf1', 'ad5ad62f929c1c4ab202e4ef7c6bb6bfd1ac95c3', '24bc7c579fa0f912b4a3995641ad0bea0fd2dceb', '854b6cdb220968a68cf6ad11d6c3fbe96783d8c8', 'b009814a37c19914a564c071540bdf78c76f56bf', 'a5f4f2ab56ba1ac85a5770188caaf2a4ee16dcef', '7118310a746334aa772f94223c8cd2a240a9f4fa', '41670cba9f5c684c49998b1831abd7c46766bc11', '0a173cc4df2d354ae6913d737569b28cc6b9d992', 'e4bef1c2f5c2ddad1a8d869154fac6d0dd989404', '15aa004a99ea2842d2ab70d840c2ed29c4072c2d', '5bac092a3a48743ed515db42e1745004a0be525c', '092787f5e169a4658646d884ddd93092bc4b301a', '969596feb7162be4202d9b356bcd201ae1b9d70d', 'c25732e3973def6cadef8763684be8ff0235089f', 'e2021aa1ca43bcd99b6ac04e4da3f8d8d4ef39c5', '8209bed97970d4e8c908a49733a2cfe79b5f2315', 'ea1948d8a5eec4f9a0ccadae9beea0de08b46c71', '41999eefce1245fb1cc765a9246bad5f1ae247c8', '8d52d2e3b5b17413da36db38b7e29867888e7af9', '5bd9c844ee92675cb37c2a0c2156aa12dda9f9ce', '8f4b37f0682042416e4fa0f8d5a3acfce5c23d22', '958d8d959eb1eeeebea185d64aac34b45e46c2ac', '03516888707a1e1f87d8fd12345c311a90b961b9', '39c5d5a21947311de08c8178bf903361f257a14f', '35ee5c602cb52437ce7a4c75fe96e53df66288bb', '1b36511b90d6bb197f9308c4fe26ed1667c33023', 
'b5bb5d8e2a5c8f5f7f04bd0965ceb7dce221bf6b', 'df8350e74a4d1eaf4a17236f42b9b77ec2951b5f', 'b55a5ccdf3e173a9f9fe470b8d61980c4843d67e', 'b37c32d0802a93cd4fa976452125344d516fc004', 'f670c311f4f0c41bd4202f29fb4ea4cc0ca8bd9d', 'a24d84fe139209fbda2311c6a9b357e28c10bcf1', 'e00ca9adfa26620699b0475509de93460f37268d', '865caae7eaf3a0e7ee0a12e95ace2a02bfdd8919', 'efc2cfe6a9003a230713003ec5df8eae17528826', '22bbcb8672acd1fdbe2f694058a6244efd435d1f', 'd3de8dcb601d10e9b71a3b726e0ff5111cb08aea', 'd7aeb2d16745ffcaeca1fb8e0a8f87b6738bb44f', 'b62fca281fe3fd29a1e11a636f007c5a55bd9938', 'c81c75bcdd7ea2b40a3712969eb5beeffcb390f7', '53eb63a95ca2b7e6c9b82ddf24ebcafd6cdaef48', '7a704ba047cfa97a2c8057996576db9a531f8afc', '72385f812acf7ebd57cffaf06536a3486383e354', 'b955cfe4a4407d3ad2e6d53c152a36a2b43d56dd', '1621ff5f63dac23bebf332fac295ec6d41e0ac1e', '7b4beb7fa993a24dc88ddeb5faa9c9994aa18230', '9feae90b61adc5bbde268e035897aedf9e913f5e', '74e6d3f71ddcdbdeb55a46c922e8833f103fbd16', '92137a58b6a250a4f39af004cdd05bc496ffe9f0', '39cd8d5b25186bbf99e61022e98b610d20773317', 'ede32d4160582f65d737a4712aee7a2015f5390a', 'fd179de2435327e7ccb95704702e457aac11123b', '3a8d6149eb4c92f1717ebaaeb9e51c4163a77e11', '992557cc89374c5b3c2010ba12a82717912608b4', '3b6262bcc745fffd6aef146157691fd6ac33e6f9', '8129b5b0f3190e29f65c09ec3ef8727c9a2da3b0', 'bcf018c16e4e2c11d6908facd658eed1313ff3b2', '401e73ef446390882f78d7375104e1ae4a344196', '4dec24735e5484fafabca32d9a3209e18a247942', '497163e2a7d3cf4a3908590b6ae68d0e45c0d6ea', '7851c70e64465581cadf41e18619a0a0726aa00b', '4ef426e926d2563d59915dad1ec87ab16b947427', 'a33419689c715a8a260a01b2352a05fa082ecd39', '8a371dd14880a9269d12f70037a45b154416babb', '48f10958236db99c535afae9bc2399442ad899a9', 'bc3e934f0b8c5b398dd5f7b9e9bba331c4d0dd06', 'fba52f021cc9318ef8a3eab67157858bdd6c55f6', '525d7d59cd78ae002c0fbc7407c8f835ec104ce4', 'a6414214a8a14bff445670890468b5d5e0f89fcb', 'fcf6e8b6d040aa6f1ff3c2724be88ec27d18a61c', '5ff79419c1222da84b5e37049d2b8eaee360c474', 
'9bb1864ced59b42d664a968dabe3645e91b057c5', '77f8a71ef25c0e230ae7c34b13b132d73fe37f84', 'd46300092fe5fc457136182b83483b8fd2a02e78', 'cc5d71a21289e532111b57be9d306bb070cf959c', 'e6a5a36705a34393dbda3bc8a9b77d94b75651c0', 'f366350fe99eec61b9e5e5fac070caac670da5e0', 'dc825f53d9d148226a293905a0ab796682050035', '68931e654a5dc1d970b4e2f22cb42856654faa2e', '68f4c1207bb2157fbb9f475100772797cfd098da', '2359ab3ad6039b5493907917e2224e2cb9c57d27', 'ea338d19d5a05d1a58730228f34a6feaadde464c', 'b7f1855af7785a6a5072f444ea9378a6e6ef0415', '79b4960b495e7d41b94f916359c0fab562ea78d8', '12c1b2214448324fd3d01d316c86665b00fba379', '0219b02acf88159b1357139c559f8eefed952091', 'e07392e9f2318d6eeb7f7c41b691534247e11a2c', '3eefbf1328896be2c792c7cbf96a990d77d8feb0', '49f8b520f74d461665d954f70acb3935c73aa298', 'f580f4b6c3366a0a1a945b1327c3b0d4c540507c', '2d0ebbc0e8f54659845fa2153dfe226aad82ff95', 'f00eba6c08f005ba7aa6f18e44cf0015bf69c8e0', '0f184261246ccaf557de764ef80457dfef546403', '73b9c4c0e575c51879c3631b6857661213f5cd47', '5472998c0f2e46a647197bd1c48041618ab522ca', 'efcf2bfcefaaba169ee645709ecf5bcc206a747b', 'bdf9581688b89c21ac1b1b861f3aca5db5b95a2a', 'ca4fda4f7bf014d4dbcf4dc1bd80212d1ae61d71', 'e206bd8357ba07cd3a48cb306dc7c51b0d329e7b', '4baa1d0a041ec74ed0a445d9387c57bace6fda3c', '39ada29c525ed3717e3d7869ad704b2ba7b544c4', 'd17409c5ef135740ed24337a5da29d7e9154f87f', 'e380b3d3b506aec795513c66371006d43e426f6a', 'acfc24f92024c4e2100c6514ea80b81a8635c410', 'ecb7771f0eac083f85e0cab38e314fda2818d3fc', 'f1199d9ae3e967e804a742db3a593e9ce7654a1b', '7d69bb4459ec841dffe85b9a54fd5265de0cc007', 'c1053712f2ca6b1b937a414b2715fe96417c9083', 'ff53c4d87109c09b43b8a76e6e87ec1da64badb2', '372bb9084c6045f281a81a6dd782a06b209f8d35', '6976f42a2540a257f330f6a965dc0695a8cb7092', 'e9147fef923c5eb43f0dc12fddb637ab15d51ca7', 'abc663f70ef0a1158993bd5701c8477fe1e46982', '79f31764916adf5027c972acacbf68084b5b7b37', '3d3be4c0e405517af14e27b8e1fdbde543658391', '1ab25555a9174c4707e388626d0d6540f335341a', 
'5ad26cced507b1b39d5784d35c71e3cd8a080f58', '78c618f755b5822ed7d9565f63a717b41ebdbd47', '230b964bbfcb5ab9476ca159312d56e917fc6dbd', '33aac94569d23bd6a988e94100e47839808ff04e', 'b301638ae79e0d3af5eca862092a1a2c39562edb', 'e78653a4f162aa37ea4acdfeaa23792f2ce13bb5', '575f59ebb94ac4c1a29624affa1930752628a4ab', '7ea6f7ecb8987fca6ce0b561956029da127e0b30', 'f55212398b23a7a441a69c3448741f040c0929be', '4e3c76b7cbef1f11863ad535e6a43557ef9e0ed1', 'b76e1330c3d373d2492cdfffa4665209b5fb50fd', 'b0b2b7600879ba59b1a6b3dc369c0e0aa5464ac4', '2e0fa5c6e38b900e1b3622bf2ce3cc6eaa4d983e', '5ffa3c58579776df72e2b77b9f9bfd51c235de6b', 'f83be882c6b9f84b5adf76d4251bfba4b703a399', '2aac341861cd7269f97ff5418de23836dce6de8b', '742d730969401e56a7de1a35321f1ad8ed50ecbd', '764774f071883e2ff68babfd063d43a4daa2f921', '55ea401f7660c52e877bacd2229c97b2c6955ac1', '0deac7c72edb8cc37db930680238bb2601630e26', '0dc294932dafa11cc89149ccb29e810521654787', '21b180e4e2ff346e45d3f1aa2c48c81db84ef46a', '554386661c697e1e6c341637782f3ae00f126757', '9a08d01c5e48debce2d52e698beb6c50e062d665', '9f8f731cc84ddf8f44249dbabed4b8dd4a3b8d08', 'c3c8547c666701666ce75e92163c241ca5f85e2a', '2f7724bc63966142ff91ec88d31028b419866432', '91043b821acdb9d9c40462462902f46e51fd0a63', 'b089c66326e30ab02fed73fa264fc2e3a5df9603', '7f62a0e29ecd6015dd43770c0518793a9cdf96b6', '7c0287d7e14bd29f349c34021962cf9426f65e92', '060c0a33f2a20a0119dcce7757653ae7dce9a42b', '835752249a453a521656f81ab304255036a08949', 'd738014c3dafd3ae62bbf174f03d54d2e1a5ff80', 'eb90379a029c1770a84046c87771d9dc0d307d09', '1066e0f9b917b48d27e80b7753cfa28993f533a7', '06af1fda3e210c48fc53448adae16968a3b56dc9', '936b73c9112ebd80603657df0a2348ec77bcd777', 'beb69b64de0a5990b4e5fd2b6c8242d11a607939', '9cc8dd3f817acfa7bcdb7396ee634c83186c0965', '2c2e82fa02d82854f076a48f4df4e6cbd19d71cb', '67f2be0495f4e254e06326182a85dc94519c82a5', 'b33fed1870ad04e3a132d27cf8b851bdac500f99', '7427ea64f1ec24863a5908e2b150951b63e3a999', '16ae92614e0d5bc012592eea169351e1552227f5', 
'c1dc82fab3bda6bbcb62780a277c40735d892e25', 'a2bdd8eed73fd484551675c1428a8a80bec59a9c', '52b8208627010d872222f557891e82cea7e46ac9', 'ae103997d2ffee109ddd6ca5edc9488fbe18fa7a', '1608a4f9cd7348eaab2c38083a97aeac9bdd0179', '9fc4fc4680c0889189219d97c99ef21d4ac0c3e7', 'b8c0dfdaa8a10b2154b1009d755ecc4434a59b55', 'a03cd9f82b0dec7211cf69706a584e76e88be92c', '60dc76da9a4ec31b4a82412805e8bb9da97d2b62', '427f96c89f2a843d4178d70c567b1eac0a34c09d', '52f7101ba422d10ce51f91ae17e8a9c3c3e9d803', '319af963f7916906a8b289bc29ed73712e62c924', '926c26d1f4f44a3e378a0afb0dd4902f54f56cfd', 'ddce99ce4ed40bb0d628794676b9aa7afa02e3eb', 'be9f029b77c20f71167a73332ee57012ea0a7c1c', 'dfbd9e2f0b2c09d9dc554bbff38c02b80a936ad1', '8c9aea4b33049b53c90f40bd5ffe914edf6809db', '395b8c11dc7629faafb4b36b7996fae8888c3734', 'b86ccc5d835789db47e67e3fdfc8431ea448b706', '16a59335dc6283b5d6af107c30f7c95d801c6a48', '9b292736c1a20393c5066b251dc7eda48cad837e', '80581200117906d25b4613d37ad2dee5bf40412f', 'e6d99d566c8acc46b88a49a5441e641ac9bba22b', '5485c00be97e3079be0adec7ce43930b8e88f7db', 'dc052e81525a6919e21a58fadc7f32b843d7b513', '566175d708e15390b3a8e2c4f395f271854973a5', '32b623788b524de9c2e63cf6c7e9b2e7fd18fcfd', '855c40f815b90d177109232b88d8b3a7551b5237', '7c4d2db34738222fb564df8b0664157deab68cc4', 'c67a4eaa3f27267a25037ef8b06822f6631e783a', 'e45a59ae928c5f2ae0f56b6ba781455958eb01ce', 'c6f221afbe9610f2e9794f7346aa1e586ece2ff8', 'e145f48e48416a03d428a8df054fda200ca93eea', '09edabbe36b0e8c093d6da0047896587360e9112', '9746b0d7f7405fac73e347a98eccc4c387b216a5', '6c6249c85922c18e422031f1acebdc9339aac0ac', 'f251e5af0d9a2114e6becdaa3f54325cb743ca98', 'edbfe3b8c228ca664723d464208f161b99819514', '32d814c8f0b83c922953b3dcde1842a3bfb12f88', '33707ac4306abd31f160bc2b2fe51f846a7dd8e4', '24abddabaf2bc24b74b39120481f00f3bdf8dee1', '8850cf6c6666b79a37683d6dd23e6a1a42fb90de', 'ad889aa95c9e9fc81bb39a830efd816d40ecbcb1', '3ff1f4f289a5a3b30843977b9e2e0b96337c80e2', 'b49c22c953c3347f3564c09143d2b4af8de650a8', 
'4b98980d38325f4acc7dc97a4e0f28b59104a2ca', 'bc06585dd065b5dbdd8c921842fc9b5fc8b0f824', 'd335fd80c55caa51bb45df048867145741442efa', '26c88ad749aaa5c38bb8df402c688945919a509a', 'a539ba2d6f9681673583ff14434ae6a10abf6007', '0f0742dbb0c5000610eab03a19ec3e143924ae85', '91c009e7c9ad80bfe17afe16549592434afab665', 'e9c287b098a12dc688b5bd15ccf3450b52332afa', 'bb72a1231d67087e7f9c6127aff01bb2cc88d6e3', '3399c46b98dde9ad4ed5f8ac6d845e879e8652c7', 'f813cdfcc4335225363dc2e451c3fffeaf802d48', 'da1478be124100b7b1e7568ab620c9256268a277', '6108b950956c81a70ff3825080f861d05089edb2', '6828e0f244452d65b87d2ee9d951e355dbf211de', 'bc3d402ec8f695b6d2693e2fed826a3df53311e9', 'fc9f9e817d2e516003f748f0a8b327c5490a0231', '864ec1624f5113c76e43c722a950993ac89b179c', '91f41249f40d7bcc23168e9b619e926cac0c0c04', '230f5f69a64e853d9f82a7671d6a3e23cef21833', '53d6201b1b93fe6e3b2960379e04451021e100c0', '18dd8da3ff75481aa3886c45db32a82c73312bf3', '375a5f47f41681a56972fd459e6ee4c45c157075', '84cdf4f541e5df3979c0fea8ffd9cabdd210d9d8', '00c988c15598723a56554afebeb7205ce9e125ce', '50272ffe52aef0a4809ec45fbc7db08b2ac5e061', 'b111f0a20689f728775ceff8eb9428080a38fc3e', 'a25987f004462fd10bfe725b80ce951293d1f023', '4f92e2f8aa903474b7ed42d04ad46d8258bf35e8', '07453e6118007b17085f4c7b2488980a2f286d4e', '7485695064599f3c8bce132a67b62b3bdfe00494', '7e91565735983342e2d574a9041abbd489a4cb30', 'e80c8919ad8c91b1d07f63840f838c030a1f0c9e', '6a365c98a6e471575bdfeaf91bfa6a4e59dd77d1', 'b2bfd9406c86673be294d1f707f0880e9635cb6c', 'd2fe98b10345504770d388882dbb5bf1066048c5', 'b239b72a6889c3886bfa4e8b4e635f68062338ae', '88c5ac54a4833632bd41ba007a18f8dfe383add5', 'e23eedf4af59c9a6b25e5131185945f5bf7f3d40', '442855386a0754f524d8ab75b0696c830b4dbd45', '4e7b1213b71a0c32a7de951140214811d6606601', '8fb23352aaebc55e7be1245b0cf17470cea10f01', '967aa275ef52cfd0c1f9120ca39dbc94d5e23181', '1e3a6dc165d7351f20112a4a6a3ab77a9ae61f2c', '37aa433dae08bda1a45c5e83ca8497c4c2f8d95f', '3b6c72d3eac24cbc966cc5fcbb4443affe67fd15', 
'f890bc9f98fe18f811d8bb2708f3411bba5dff12', 'd978bf57ab07e2d222f6c13fad6251d10ced695a', 'f35865413c0769a1d018bea58e5552abd2f2f141', '647bebe39fbd4ff408600f4d7fd599f4bbd8b4b7', '97f300a878382f910dbe5c950b29ba0c7b6d1638', 'e43f56c393009fd6bddb1628409c37162310591a', 'cbe50ce355fef7f56bd5bbc23633e5e0d9a4e4aa', '2185bc89e605412401f7bcf3744c0eebf6d160f5', '7dea0e9e5685ca477f512ef3c37147555e03e2a2', 'e0871c5b82ac7eb4345c8f7f3d393454eee32d12', 'd27dcea3db37e2135ad62f644dc6d8882581df59', 'af1f8904fe6f6041869790150c64e3804d5ce385', 'aa869232abec3716f6bea6b05003dc8fcc96fa0b', 'c558c38a0ba025342d50a7bcf4fca78ed121322f', 'baca82c40b68348a9788f68c1d917883f4455bfe', '1a0f4fee9f2ce40bfa98963659400ffbd72f533b', 'e95d55ddbc0d651bd543ea3bca650e4fea6ec3f0', 'ae61a81a2986ca66ce262d096c7557c926775ece', 'c87444cef977cce169853815ba3e44e6039a6176', '929d760ff022a61d83267bb31e5a9debd0fad992', '3eacd8a05832fd3741865f0d33e93509003057d3', 'df8843a770a3817aa6128144968f9780e4d8b93c', '6babd89607b1d72b80ee6f7e5a2b600eb7dde83c', '48d80bde5fe88f2c1f02a4cd953e176007171b39', 'd66382542ae64f9f72845a22e584c7ba3814263e', 'd249718d5a0abb9a24bc58085dc052a5d0065118', 'aef9504ab27b78fbfa8c9e03dc676546c0d9ace7', 'ce630a0ca55c28ee5c78701445f99de22e37d908', 'e5b2f34a3cc5b0fe8967fe4e911060fc33727608', '92096cc0bff6454642716f5f8859e37bd5a26d89', '3febafa73e50ba16acd9ebdcb8f97520392acf25', '531d1311849001df8a199a1561a8ccea74d69b31', 'dec866c42e27e6ae9a1d04de405e06abc9c43231', '24974211a78aba94f94e0b1461ff8aecde6a2858', '02060d0511d0b69a2a88ae24d70bb3bff5f55196', '7e0ca91c5a0f2358b5e4123e6ec97edc8b001f6c', 'b47975da306b3a13b51a2ab9d2590e9cb380a358', '199b8894a9d6c45a60784afe53e791663796ff82', '3cda6f14b9c753ff7c7a1fe1a7ef238d06de7d8d', 'b6f87b13112358d65ac4a4868620bb15e916e74b', 'ee10fff76b317675bef74fc788f89f1c59eab4b9', '2063fa14fa570773a6d88e77e777d87811ca0e55', '21c4188daf823693faade26d11b2bf823d68a1a4', '9fc29f559fbe1062abb47a1accf6357c2a23db0d', 'e5edb1fa542980f20a99d5e740e03e866a0ff86d', 
'3d3972fcaec4f025766c9a9fad74675caa5ca991', '023eba60e52495d80efcafcef9abf35adf38baf9', '30117df64ded972ef9ad78eb1cb3086e43424d08', 'edeea9ff910f0a42f3efb722e203bf1a190ded01', '595054842ee43c6ff199afe1e6e95cee3b18fbf6', '599b3cfab390ab4d2757cde4807b2468accf4004', '58d91aa5b6626920cbb924bea2b8933e95b7ab57', '060163b1ca089f648cc95ccf9f31f6bd5645327d', '8b58936a85ec96e6f12e7de5558141955cb3baca', '461c23462aba4e4c34a49e72564da739a8f64d9c', '0e00e2789b9907bc3c5d192bb0f6536a816b2a72', '92824bfb08987bd73c60b4d69abcb30dbe9f328c', 'e14212c50b25171a3bae06acd67088f6189f1f0a', '09ab8816f2dffe2d9fe651e04a4d8868e701afbb', '788d64bc1282b8ff8d0e38c19b60e7679b6c569e', '9a7d6f41fe294aaa7ab3050d2cd695d5a25ed3a1', '32f134d380a8d46fbc178fd7f66cebf6374ef570', 'fd8ef7098cbc17b1b39431598250a10490a172aa', 'dc9364152536239d00204c963d96e97587d691ed', '16bc2fd12f2c4bf216e2b0de71d75477151926bd', '3ef379894ea4780ffb5be0f180008d85fc955c7f', '1487d2e3f8f66df27b1d62f999629b1dc8e7850b', '525f139a9b30ba35649c122a0e0c3024acdf7a2f', '592074660bd09cc687eaf1b5405e695b9a7c7df8', '2446763300c204dcecf16e603a38f3a9572a428f', 'a1025f329fc408b30e46a7ba2a4cabdd7b4b986e', 'b8277c72a333bdc50fc570e263e1a00e77efafe5', 'f287158565caa170d99d47f31c5fcf03c93a293c', '2af88fe47c4f87430126690410366cdca27b7c05', 'c8504bba850d87d6c1ba9ec3d32a42f5bcf38078', '0039975b8f375ca7f4905bacd70cabf2bdd38f84', '511eca6e5b4e68c55bf090ec1802501b76fdecdc', 'f00dbb58d325623c5a6f934fdc49c74995174d20', '6d1bd66c17e47b03f4a2449a934bbfd3a6e5f535', 'ca98a256c3430490eb6d2cf7994b60353fc15200', '431a514a8b439da8612575ccfbfc4f559880b38f'], []),
25656: BeirSciDoc('dec997b20ebe2b867f68cc5c123d9cb9eafad6bb', re.compile('^Training deep neural networks generally requires massive amounts of data and is very computation int.{1320}classification problems, thus significantly boosting our ability to solve such problems efficiently\\.$', flags=48), 'Deriving optimal weights in deep neural networks', ['9716460', '2116548', '1695338'], 2018, [], ['367f2c63a6f6a10b3b64b8729d601e69337ee3cc', '178325c2b267bee56931f22e4f17c6454de7475a', '0d67362a5630ec3b7562327acc278c1c996454b5', '2efc0a99f13ef8875349ff5d47c278392c39e064', '15e0daa3d2e1438159e96f6c6fd6c4dd3756052c', '5d90f06bb70a0a3dced62413346235c02b1aa086', '7346d681807bf0852695caa42dbecae5265b360a', 'c61d139a2382760f560164e25e4be264de5dd59f', '1827de6fa9c9c1b3d647a9d707042e89cf94abf0', '563e821bb5ea825efb56b77484f5287f08cf3753']),
})
# Generated spot-check for beir/scifact: asserts total corpus size and samples
# docs keyed 0, 9, and 5182 (the last). Long abstracts are matched via
# truncated regex patterns (leading/trailing text with a `.{N}` gap) rather
# than full text; flags=48 — presumably re.DOTALL|re.UNICODE (16|32), confirm.
self._test_docs('beir/scifact', count=5183, items={
0: BeirTitleDoc('4983', re.compile('^Alterations of the architecture of cerebral white matter in the developing human brain can affect co.{1609}or MRI provides insight into microstructural development in cerebral white matter in living infants\\.$', flags=48), 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'),
9: BeirTitleDoc('70490', re.compile('^Likelihood ratios are one of the best measures of diagnostic accuracy, although they are seldom used.{286}ples illustrate how the clinician can use this method to refine diagnostic decisions at the bedside\\.$', flags=48), 'Simplifying likelihood ratios'),
5182: BeirTitleDoc('198309074', re.compile('^Introduction: Among the inflammatory mediators involved in the pathogenesis of obesity, the cell adh.{1481}nical inflammation that results from obesity by reducing the cell adhesion molecules and chemokines\\.$', flags=48), 'Adhesion molecules and chemokines: relation to anthropometric, body composition, biochemical and dietary variables'),
})
# Generated spot-check for beir/trec-covid (CORD-19-based): asserts corpus
# size and samples docs 0, 9, and 171331 (last). Each BeirCordDoc carries
# doc_id, text (truncated-regex match or literal), title, url, pubmed_id.
# Note the last sample has an empty-string body — an exact literal, not a regex.
self._test_docs('beir/trec-covid', count=171332, items={
0: BeirCordDoc('ug7v899j', re.compile('^OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 pa.{1647}preschool children and that the mortality rate of pneumonia in patients with comorbidities was high\\.$', flags=48), 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', '11472636'),
9: BeirCordDoc('jg13scgo', re.compile('^This report describes the design and implementation of the Real\\-time Outbreak and Disease Surveillan.{1077} be a resource for implementing, evaluating, and applying new methods of public health surveillance\\.$', flags=48), 'Technical Description of RODS: A Real-time Public Health Surveillance System', 'https://academic.oup.com/jamia/article-pdf/10/5/399/2352016/10-5-399.pdf', '12807803'),
171331: BeirCordDoc('pnl9th2c', '', 'Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', 'https://www.sciencedirect.com/science/article/pii/S1078588420303804?v=s5; https://www.ncbi.nlm.nih.gov/pubmed/32446539/; https://doi.org/10.1016/j.ejvs.2020.04.040; https://api.elsevier.com/content/article/pii/S1078588420303804', '32446539'),
})
# Generated spot-check for beir/webis-touche2020 (v1): corpus size plus docs
# 0, 9, and 382544 (last). BeirToucheDoc fields here: doc_id, text, title,
# stance ('PRO'/'CON'), url.
self._test_docs('beir/webis-touche2020', count=382545, items={
0: BeirToucheDoc('c67482ba-2019-04-18T13:32:05Z-00000-000', re.compile('^My opponent forfeited every round\\. None of my arguments were answered\\. I don’t like the idea of winn.{293} sold to minors in ANY state\\. A retailer who says it is illegal to sell you them is, frankly, wrong\\.$', flags=48), 'Contraceptive Forms for High School Students', 'CON', 'https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/'),
9: BeirToucheDoc('fbe6ad2-2019-04-18T11:12:36Z-00001-000', re.compile('^Why is it that so\\-called christians, Because there is no such a thing as a christian, Have serious t.{315}, All you did was babble on and on and on\\. So in this sense, It was YOU that forfeited\\. Sheesh! Bye\\.$', flags=48), 'The closet dementia of the superior ego god complex, The bible and why you should not believe in god', 'PRO', 'https://www.debate.org/debates/The-closet-dementia-of-the-superior-ego-god-complex-The-bible-and-why-you-should-not-believe-in-god/1/'),
382544: BeirToucheDoc('671509c8-2019-04-17T11:47:34Z-00007-000', 'Charter schools are exploited most by affable students', 'Charter schools', 'CON', 'http://www.debatepedia.org/en/index.php/Debate:_Charter_schools'),
})
# Generated spot-check for beir/webis-touche2020/v2. The sampled records are
# identical to the v1 fixture above at the same keys (0, 9, 382544) and the
# count matches, as generated.
self._test_docs('beir/webis-touche2020/v2', count=382545, items={
0: BeirToucheDoc('c67482ba-2019-04-18T13:32:05Z-00000-000', re.compile('^My opponent forfeited every round\\. None of my arguments were answered\\. I don’t like the idea of winn.{293} sold to minors in ANY state\\. A retailer who says it is illegal to sell you them is, frankly, wrong\\.$', flags=48), 'Contraceptive Forms for High School Students', 'CON', 'https://www.debate.org/debates/Contraceptive-Forms-for-High-School-Students/1/'),
9: BeirToucheDoc('fbe6ad2-2019-04-18T11:12:36Z-00001-000', re.compile('^Why is it that so\\-called christians, Because there is no such a thing as a christian, Have serious t.{315}, All you did was babble on and on and on\\. So in this sense, It was YOU that forfeited\\. Sheesh! Bye\\.$', flags=48), 'The closet dementia of the superior ego god complex, The bible and why you should not believe in god','PRO', 'https://www.debate.org/debates/The-closet-dementia-of-the-superior-ego-god-complex-The-bible-and-why-you-should-not-believe-in-god/1/'),
382544: BeirToucheDoc('671509c8-2019-04-17T11:47:34Z-00007-000', 'Charter schools are exploited most by affable students', 'Charter schools', 'CON', 'http://www.debatepedia.org/en/index.php/Debate:_Charter_schools'),
})
# Generated spot-check for beir/cqadupstack/android: corpus size plus docs
# 0, 9, and 22997 (last). BeirCqaDoc fields here: doc_id, text, title, tags.
self._test_docs('beir/cqadupstack/android', count=22998, items={
0: BeirCqaDoc('51829', re.compile("^I want to send files to android tablet with a application from PC\\. \\- I can send files directly to ta.{188}m \\? \\- How can show my device as a external drive\\? my application that sent files written via Delphi\\.$", flags=48), 'How can show android tablet as a external storage to PC?', ['usb-connection-mode']),
9: BeirCqaDoc('19394', re.compile('^I bought "Cut the Rope" on my Nexus One cellphone from the Android Market\\. When I open this game on .{51}it to be "Purchased"\\. How can I add my Google account to Kindle Fire\'s Amazon appstore account list\\?$', flags=48), 'How can I use an app purchased from the Market on a Kindle Fire?', ['google-play-store', 'amazon-kindle-fire', 'accounts']),
22997: BeirCqaDoc('38348', re.compile('^With the growing number of Android devices in all sorts of different form factors \\(dev boards like R.{163}roid\\. For example, having the standard Linux build tools available would let me easily run a server\\.$', flags=48), 'Is there any easy way to get GNU build tools on Android? If not... why not?',['linux', 'development']),
})
# Generated spot-check for beir/cqadupstack/english: corpus size plus docs
# 0, 9, and 40220 (last).
self._test_docs('beir/cqadupstack/english', count=40221, items={
0: BeirCqaDoc('11547', re.compile('^An eponym is one way to eternal \\(if posthumous\\) fame\\. But is there a word meaning an eponym someone .{65}oycott_ , Mr Justice _Lynch_ , and Patrick _Hooligan_ would not appreciate their undying notoriety\\.\\)$', flags=48), 'Is there a word meaning "an unwanted eponym"?', ['single-word-requests', 'eponyms']),
9: BeirCqaDoc('182056', re.compile("^In the following statement, which one is grammatically correct\\? > XYZ caterers \\*\\*is\\*\\* on to somethin.{76} be 'are' as caterers is plural\\. But it has been suggested that I might be wrong\\. What do you think\\?$", flags=48), '"XYZ caterers is.." or "XYZ caterers are.."?', ['grammar', 'grammatical-number']),
40220: BeirCqaDoc('38346', re.compile('^A colleague and I were having a discussion as to the proper plural form of _abacus_\\. I believe the p.{183}rd that is part of the Arabic language\\. Any opinions or history to this matter would be appreciated\\.$', flags=48), 'Plural of "abacus"', ['meaning', 'etymology', 'grammar', 'latin', 'roots']),
})
# Generated spot-check for beir/cqadupstack/gaming: corpus size plus docs
# 0, 9, and 45300 (last). Doc 0's body is short enough that the generator
# emitted a literal string instead of a truncated regex.
self._test_docs('beir/cqadupstack/gaming', count=45301, items={
0: BeirCqaDoc('11542', 'What\'s your Supreme Commander 2 build order. I don\'t just want "6 mass extractors, 2 power and a factory". List of building and units out to the second or third factory, please.', 'Supreme Commander 2 - Build Orders', ['supreme-commander-2']),
9: BeirCqaDoc('19393', re.compile('^What are the benefits of an assault ship over an interceptor\\? I played some significant time ago, an.{176}So: a\\) What are the main uses of each b\\) Which would most benefit the style of play mentioned above\\?$', flags=48), 'Assault ships v. Interceptors', ['eve-online']),
45300: BeirCqaDoc('38346', re.compile("^_But you can't have more than one companion\\._ \\*\\*Wrong\\.\\*\\* So I was taking that stupid dog, Barbas, to.{156}ne with the dragon they start attacking each other\\. How can I get them to stop and be friends again\\?$", flags=48), 'How do I make my companions friends?', ['skyrim']),
})
# Generated spot-check for beir/cqadupstack/gis: corpus size plus docs
# 0, 9, and 37636 (last).
self._test_docs('beir/cqadupstack/gis', count=37637, items={
0: BeirCqaDoc('73399', re.compile("^There is a satellite image it's size is 10 GB and I need to display this image using GeoServer and O.{211} response time using 32 GB satellite image\\. Please advice me how to achieve this\\? Thanks in advance\\.$", flags=48), 'Satellite image display with the help of GeoServer and OpenLayers', ['openlayers', 'geoserver']),
9: BeirCqaDoc('5983', re.compile('^Has anyone succeeded in programmatically updating metadata in ArcGIS 10\\? Considering using Python/ar.{254} except where they are in conflict in which case the added elements overwrite the existing elements\\.$', flags=48), 'Programmatically edit/update metadata in ArcGIS 10', ['arcobjects', 'arcgis-10.0', 'python', 'c#', 'metadata']),
37636: BeirCqaDoc('103092', re.compile('^Link: http://projects\\.nytimes\\.com/census/2010/explorer How can I also render that specific kind of m.{121} says its from Google at the bottom right, but then why does it look different from maps\\.google\\.com\\?$', flags=48), 'What map library does this census visualization use?', ['gui']),
})
# Generated spot-check for beir/cqadupstack/mathematica: corpus size plus docs
# 0, 9, and 16704 (last).
self._test_docs('beir/cqadupstack/mathematica', count=16705, items={
0: BeirCqaDoc('35237', re.compile("^I'm trying to use `Get` to load some pretty substantial packages from a custom menu in the _Mathemat.{320}` / `MenuItem`\\) that will remove that time constraint so that my command can be executed completely\\.$", flags=48), 'Time constraints on KernelExecute commands or MenuItems?', ['front-end', 'menu']),
9: BeirCqaDoc('28990', re.compile("^I have multiple data sets, each of which is a 2D matrix\\. I want to construct a new 2D Matrix in whic.{139}ix2\\[i\\]\\[j\\] \\+ \\.\\.\\. \\+ MatrixN\\[i\\]\\[j\\]\\) I can't quite figure out how to do it in _Mathematica_\\. Thanks$", flags=48), 'Averaging multiple 2D data sets', ['list-manipulation', 'matrix']),
16704: BeirCqaDoc('34149', re.compile('^I want to add two matrices, the first one containing a 2D vector at each position the other one a li.{675},MB\\},2\\] This works but is rather slow\\. Is there a faster and maybe more elegant way to do this\\?$', flags=48), 'MapThread Alternatives', ['list-manipulation', 'performance-tuning', 'map']),
})
# Generated spot-check for beir/cqadupstack/physics: corpus size plus docs
# 0, 9, and 38315 (last). Doc bodies contain LaTeX, so the generated patterns
# escape $, \, {, } heavily — edit these literals only via regeneration.
self._test_docs('beir/cqadupstack/physics', count=38316, items={
0: BeirCqaDoc('110557', re.compile("^Let's discuss about \\$SU\\(3\\)\\$\\. I understand that the most important representations \\(relevant to physi.{732} indices\\)\\. What is the general procedure to represent the generators in an arbitrary representation\\?$", flags=48), 'Representation of SU(3) generators', ['particle-physics', 'group-representations']),
9: BeirCqaDoc('11546', re.compile('^I have a question about the relation: \\$\\\\exp\\(\\-i \\\\vec\\{\\\\sigma\\} \\\\cdot \\\\hat\\{n\\}\\\\phi/2\\) = \\\\cos\\(\\\\phi/2\\) \\- i .{152}alized for \\$\\\\hat\\{n\\}\\$ being an operator\\? If so how exactly would the expression be different\\? Thanks\\.$', flags=48), 'generalizing spin rotations', ['quantum-mechanics', 'angular-momentum', 'spin']),
38315: BeirCqaDoc('38347', re.compile("^Let's say a box is moved by attaching a rope to it and pulling with an applied force at a certain an.{528}ined ramp, the above would not work\\. What do I need to do differently to solve this type of problem\\?$", flags=48), 'Overcoming Friction', ['homework', 'newtonian-mechanics', 'friction']),
})
# Generated spot-check for beir/cqadupstack/programmers: corpus size plus docs
# 0, 9, and 32175 (last).
self._test_docs('beir/cqadupstack/programmers', count=32176, items={
0: BeirCqaDoc('228054', re.compile('^I am in the midst of writing a web application for work\\. Everything is from scratch\\. I have been a P.{739}s of speed\\. So, my question is as the title asks, is a client\\-side centric app substantially slower\\?$', flags=48), 'Are (mostly) client-side JavaScript web apps slower or less efficient?', ['javascript', 'node.js', 'ajax', 'browser', 'client-side']),
9: BeirCqaDoc('127472', re.compile("^I've been developing web apps for a while now and it is standard practice in our team to use agile d.{317}words, when you develop ML and NLP algorithms as a job, do you use agile development in the process\\?$", flags=48), 'Is Agile Development used in Machine Learning and Natural Language Processing?', ['agile', 'development-process', 'machine-learning', 'nlp']),
32175: BeirCqaDoc('213799', re.compile("^I'm developing a small system with two components: one polls data from an internet resource and tran.{762}he other writes\\? I started writing the code but was wondering if this is a misapplication of SQLite\\.$", flags=48), 'SQLite with two python processes accessing it: one reading, one writing', ['web-development', 'python', 'sql', 'concurrency', 'sqlite']),
})
self._test_docs('beir/cqadupstack/stats', count=42269, items={
0: BeirCqaDoc('110556', re.compile("^I'm a beginner in statistics and R, sorry if this question may seem trivial\\. I've collected data mea.{5246}analysis do you suggest\\? \\* If yes, how can I interpret the result I got \\(please, in simple terms\\)\\?$", flags=48), 'Is this a case for an ordinal logistic regression? Problems interpreting output', ['r', 'regression', 'logistic', 'interpretation']),
9: BeirCqaDoc('89379', re.compile('^!\\[enter image description here\\]\\(http://i\\.stack\\.imgur\\.com/qmNwR\\.png\\) The image above represents a hyp.{574} know of a good way to do that\\? If there is a better place to ask this question, please let me know\\.$', flags=48), 'Need subspace partition algorithm, not necessarily a full classifier', ['machine-learning', 'data-mining']),
42268: BeirCqaDoc('38346', re.compile('^Regression: Wage=b0\\+b1collegegrad, where collegegrad is a dummy variable\\. Suppose you want to estima.{221}nd thus get the true ratio, so the estimator is consistent\\. Am I correct, or am I missing something\\?$', flags=48), 'Consistency of estimator', ['self-study', 'consistency']),
})
self._test_docs('beir/cqadupstack/tex', count=68184, items={
0: BeirCqaDoc('182565', re.compile('^I am using a pgfplots stacked bar to display the aggregated energy demand of a houshold and the asso.{1179} \\\\legend\\{low price, high price\\} \\\\end\\{axis\\} \\\\end\\{tikzpicture\\} \\\\end\\{document\\}$', flags=48), 'Adding horizontal lines to pgfplots bar', ['pgfplots', 'bar-chart']),
9: BeirCqaDoc('61123', re.compile('^> \\*\\*Possible Duplicate:\\*\\* > Left and right subscript > Superscripts before a letter in math I .{128} the subscript O but to be on the left side\\? Is this possible which commands/packages I need to use\\?$', flags=48), 'How to change the side on which the superscript appears?', ['superscripts']),
68183: BeirCqaDoc('103090', re.compile('^I appreciate it if you let me know the most elegant way to draw a crossed hierarchy such as the foll.{3}ngs: X /\\\\ Y Z /\\\\/\\\\ p q t q has two parents Y and Z\\.$', flags=48), 'Crossed hierarchy', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']),
})
self._test_docs('beir/cqadupstack/unix', count=47382, items={
0: BeirCqaDoc('110557', re.compile('^Is there a way to avoid ssh printing warning messages like this\\? "@@@@@@@@@@@@@@@@@@@@.{196}the remote host identity has changed but I know it is fine and just want to get rid of this warning\\.$', flags=48), 'Force ssh to not to print warnings', ['ssh']),
9: BeirCqaDoc('110550', 'What is the difference between the red5 versions RC1 and RC2 ? and what does RC mean?', 'What is the difference between red5 RC1 and RC2?', ['broadcast']),
47381: BeirCqaDoc('38346', re.compile("^I've got my vacation coming in and thought I might use that for something useful\\. Essentially, I've .{2438}stuff used in enterprise security, I'm very ignorant about how things are actually used in practice\\.$", flags=48), 'Getting from proficient to expert', ['shell', 'virtualization', 'storage', 'cluster']),
})
self._test_docs('beir/cqadupstack/webmasters', count=17405, items={
0: BeirCqaDoc('35236', re.compile("^I'm making a website for a small hotel in php\\. The hotel owners want a reservation system that uses .{290}d of buying with paypal\\. Is this possible\\? Does anyone know of an open php system that handles this\\?$", flags=48), 'Hotel Reservation Request Booking Paypal PHP', ['php', 'looking-for-a-script', 'paypal']),
9: BeirCqaDoc('503', re.compile("^My website used to have sitelinks and now it doesn't\\. It's very possible that it's due to changing t.{219}\\.imgur\\.com/sBaDc\\.jpg\\) What are some things that I can do to improve my chances of getting sitelinks\\?$", flags=48), 'What are the most important things I need to do to encourage Google Sitelinks?', ['seo', 'google', 'sitelinks']),
17404: BeirCqaDoc('38346', re.compile("^I'm looking for a keyword racking tracker tool for google\\. I have found a lot of them over the inter.{182}ord as my site has hundreds of pages\\. Any recommendation\\? Or do I have to set each URLs per keyword\\?$", flags=48), 'Keyword ranking tracker that works on a per-domain basis', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']),
})
self._test_docs('beir/cqadupstack/wordpress', count=48605, items={
0: BeirCqaDoc('108998', re.compile("^In a shortcode context, is there any difference here\\? array\\( 'slideshow' =.{32} array\\( 'slideshow' => NULL, \\), Is there a best practice for that\\?$", flags=48), 'What is the difference between Null vs Empty (Zero Length) string?', ['php', 'plugin-development', 'shortcode']),
9: BeirCqaDoc('19393', re.compile("^I'm using WP\\-Cufon for font replacements\\. It's adding extra cufon canvas out side of p tags in my pa.{127} it happening\\? How can I solve it\\? I'm having same kind of problem with all\\-in\\-one cufon plugin too\\.$", flags=48), 'WP-Cufon adding extra space in my paragraphs in Firefox and Chrome', ['plugins', 'javascript', 'plugin-all-in-one-cufon']),
48604: BeirCqaDoc('38344', 'Is there a specific reason why we can find max-width:97.5% instead of 100% in common themes such as Twenty Eleven?', 'Why max-width:97.5% on content images?', ['theme-development', 'css', 'maximized-width']),
})
def test_queries(self):
self._test_queries('beir/arguana', count=1406, items={
0: GenericQuery('test-environment-aeghhgwpe-pro02a', "Being vegetarian helps the environment Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each Canadian” and “one apple tree produces enough fruit to make 320 pies.” Think about it — a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1] The 2006 U.N. Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions — by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2] As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says “We know that the single biggest driver of deforestation in the Amazon is cattle.” This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. 
[3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1. The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China’s Yellow river do not reach the sea. [6] With a rising population becoming vegetarian is the only responsible way to eat. [1] Stephen Leckie, ‘How Meat-centred Eating Patterns Affect Food Security and the Environment’, International development research center [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 . [3] David Adam, Supermarket suppliers ‘helping to destroy Amazon rainforest’, The Guardian, 21st June 2009. [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997. 
[5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003 [6] Rupert Wingfield-Hayes, Yellow river ‘drying up’, BBC News, 29th July 2004"),
9: GenericQuery('test-environment-assgbatj-pro01a', 'Animals shouldn’t be harmed The difference between us and other animals is a matter of degree rather than type [2]. Their bodies resemble ours, as do their ways of conveying meaning. They recoil from pain, appear to express fear of a tormentor, and appear to take pleasure in activities; a point clear to anyone who has observed a pet dog on hearing the word “walk”. We believe other people experience feelings like us because they are like us in appearance and behaviour. An animal sharing our anatomical, physiological, and behavioural characteristics is surely likely to have feelings like us. If people have a right to not be harmed, we must ask ourselves what makes animals different? If animals feel what we feel, and suffer like us, to condemn one to testing because of them being of a different species is similar to racism or sexism.[3]'),
1405: GenericQuery('test-society-epsihbdns-con01a', 'Freedom of movement is an intrinsic human right Every human being is born with certain rights. These are protected by various charters and are considered inseparable from the human being. The reason for this is a belief that these rights create the fundamental and necessary conditions to lead a human life. Freedom of movement is one of these and has been recognised as such in Article 13 of the Universal Declaration of Human Rights. [1] If a family finds themselves faced with starvation, the only chance they have of survival might be to move to another place where they might live another day. It is inhuman to condemn individuals to death and suffering for the benefit of some nebulous collective theory. While we might pass some of our freedoms to the state, we have a moral right to the freedoms that help us stay alive – in this context freedom of movement is one of those. [1] General Assembly, “The Universal Declaration of Human Rights”, 10 December 1948,'),
})
self._test_queries('beir/climate-fever', count=1535, items={
0: GenericQuery('0', 'Global warming is driving polar bears toward extinction'),
9: GenericQuery('21', 'Sea level rise has been slow and a constant, pre-dating industrialization'),
1534: GenericQuery('3134', 'Over the last decade, heatwaves are five times more likely than if there had been no global warming.'),
})
self._test_queries('beir/dbpedia-entity', count=467, items={
0: GenericQuery('INEX_LD-20120112', 'vietnam war facts'),
9: GenericQuery('INEX_LD-2012336', '1906 territory Papua island Australian'),
466: GenericQuery('TREC_Entity-20', 'Scotch whisky distilleries on the island of Islay.'),
})
self._test_queries('beir/dbpedia-entity/dev', count=67, items={
0: GenericQuery('INEX_LD-20120112', 'vietnam war facts'),
9: GenericQuery('INEX_LD-2012336', '1906 territory Papua island Australian'),
66: GenericQuery('TREC_Entity-17', 'Chefs with a show on the Food Network.'),
})
self._test_queries('beir/dbpedia-entity/test', count=400, items={
0: GenericQuery('INEX_LD-20120111', 'vietnam war movie'),
9: GenericQuery('INEX_LD-20120312', 'tango culture countries'),
399: GenericQuery('TREC_Entity-20', 'Scotch whisky distilleries on the island of Islay.'),
})
self._test_queries('beir/fever', count=123142, items={
0: GenericQuery('75397', 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'),
9: GenericQuery('76253', 'There is a movie called The Hunger Games.'),
123141: GenericQuery('81957', 'Trouble with the Curve is a television show.'),
})
self._test_queries('beir/fever/dev', count=6666, items={
0: GenericQuery('137334', 'Fox 2000 Pictures released the film Soul Food.'),
9: GenericQuery('18708', 'Charles Manson has been proven innocent of all crimes.'),
6665: GenericQuery('46064', 'The NAACP Image Award for Outstanding Supporting Actor in a Drama Series was first given in 1996.'),
})
self._test_queries('beir/fever/test', count=6666, items={
0: GenericQuery('163803', 'Ukrainian Soviet Socialist Republic was a founding participant of the UN.'),
9: GenericQuery('134850', 'Ice-T refused to ever make hip-hop music.'),
6665: GenericQuery('81957', 'Trouble with the Curve is a television show.'),
})
self._test_queries('beir/fever/train', count=109810, items={
0: GenericQuery('75397', 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'),
9: GenericQuery('76253', 'There is a movie called The Hunger Games.'),
109809: GenericQuery('152180', 'Susan Sarandon is an award winner.'),
})
self._test_queries('beir/fiqa', count=6648, items={
0: GenericQuery('0', 'What is considered a business expense on a business trip?'),
9: GenericQuery('14', "What are 'business fundamentals'?"),
6647: GenericQuery('2399', 'Where do web sites get foreign exchange currency rate / quote information?'),
})
self._test_queries('beir/fiqa/dev', count=500, items={
0: GenericQuery('7208', 'Could an ex-employee of a company find themself stranded with shares they cannot sell (and a tax bill)?'),
9: GenericQuery('7526', 'First time investor wanting to invest in index funds especially Vanguard'),
499: GenericQuery('4872', 'Taking a car loan vs cash and effect on credit score'),
})
self._test_queries('beir/fiqa/test', count=648, items={
0: GenericQuery('4641', 'Where should I park my rainy-day / emergency fund?'),
9: GenericQuery('6715', 'What does it mean if “IPOs - normally are sold with an `underwriting discount` (a built in commission)”'),
647: GenericQuery('2399', 'Where do web sites get foreign exchange currency rate / quote information?'),
})
self._test_queries('beir/fiqa/train', count=5500, items={
0: GenericQuery('0', 'What is considered a business expense on a business trip?'),
9: GenericQuery('14', "What are 'business fundamentals'?"),
5499: GenericQuery('11104', 'Selling a stock for gain to offset other stock loss'),
})
self._test_queries('beir/hotpotqa', count=97852, items={
0: GenericQuery('5ab6d31155429954757d3384', 'What country of origin does House of Cosbys and Bill Cosby have in common?'),
9: GenericQuery('5adcd29a5542992c1e3a241d', "The 2015 Kids' Choice Sports Awards was hosted by an American footbal quarterback who was born on November 29th of what year?"),
97851: GenericQuery('5ac132a755429964131be17c', 'Blackfin is a family of processors developed by the company that is headquartered in what city?'),
})
self._test_queries('beir/hotpotqa/dev', count=5447, items={
0: GenericQuery('5ae81fbf55429952e35eaa37', 'Daniel Márcio Fernandes plays for a club founded in which year ?'),
9: GenericQuery('5a7fbc715542995d8a8ddf08', "The Monkey's Uncle and Benji the Hunted, are what form of entertainment?"),
5446: GenericQuery('5a8bae0c5542996e8ac889b5', 'The director of "An American Tragedy" emigrated permanently to the United States at what age?'),
})
self._test_queries('beir/hotpotqa/test', count=7405, items={
0: GenericQuery('5a8b57f25542995d1e6f1371', 'Were Scott Derrickson and Ed Wood of the same nationality?'),
9: GenericQuery('5a8db19d5542994ba4e3dd00', 'Are Local H and For Against both from the United States?'),
7404: GenericQuery('5ac132a755429964131be17c', 'Blackfin is a family of processors developed by the company that is headquartered in what city?'),
})
self._test_queries('beir/hotpotqa/train', count=85000, items={
0: GenericQuery('5ab6d31155429954757d3384', 'What country of origin does House of Cosbys and Bill Cosby have in common?'),
9: GenericQuery('5adcd29a5542992c1e3a241d', "The 2015 Kids' Choice Sports Awards was hosted by an American footbal quarterback who was born on November 29th of what year?"),
84999: GenericQuery('5a7543b155429916b01642cd', 'What is the title of the book that documents the involvement of the president of the BioProducts Division at Archer Daniels Midland in a conspiracy case?'),
})
self._test_queries('beir/msmarco', count=509962, items={
0: GenericQuery('1185869', ')what was the immediate impact of the success of the manhattan project?'),
9: GenericQuery('186154', 'feeding rice cereal how many times per day'),
509961: GenericQuery('195199', 'glioma meaning'),
})
self._test_queries('beir/msmarco/dev', count=6980, items={
0: GenericQuery('300674', 'how many years did william bradford serve as governor of plymouth colony?'),
9: GenericQuery('54544', 'blood diseases that are sexually transmitted'),
6979: GenericQuery('195199', 'glioma meaning'),
})
self._test_queries('beir/msmarco/test', count=43, items={
0: GenericQuery('19335', 'anthropological definition of environment'),
9: GenericQuery('156493', 'do goldfish grow'),
42: GenericQuery('1133167', 'how is the weather in jamaica'),
})
self._test_queries('beir/msmarco/train', count=502939, items={
0: GenericQuery('1185869', ')what was the immediate impact of the success of the manhattan project?'),
9: GenericQuery('186154', 'feeding rice cereal how many times per day'),
502938: GenericQuery('405466', 'is carbonic acid soluble'),
})
self._test_queries('beir/nfcorpus', count=3237, items={
0: BeirUrlQuery('PLAIN-3', 'Breast Cancer Cells Feed on Cholesterol', 'http://nutritionfacts.org/2015/07/14/breast-cancer-cells-feed-on-cholesterol/'),
9: BeirUrlQuery('PLAIN-15', 'Why Do Heart Doctors Favor Surgery and Drugs Over Diet?', 'http://nutritionfacts.org/2015/06/02/why-do-heart-doctors-favor-surgery-and-drugs-over-diet/'),
3236: BeirUrlQuery('PLAIN-3472', 'How Doctors Responded to Being Named a Leading Killer', 'http://nutritionfacts.org/video/how-doctors-responded-to-being-named-a-leading-killer/'),
})
self._test_queries('beir/nfcorpus/dev', count=324, items={
0: BeirUrlQuery('PLAIN-1', 'Why Deep Fried Foods May Cause Cancer', 'http://nutritionfacts.org/2015/07/21/why-deep-fried-foods-may-cause-cancer/'),
9: BeirUrlQuery('PLAIN-101', 'How to Treat Multiple Sclerosis With Diet', 'http://nutritionfacts.org/2014/07/22/how-to-treat-multiple-sclerosis-with-diet/'),
323: BeirUrlQuery('PLAIN-3471', 'Uprooting the Leading Causes of Death', 'http://nutritionfacts.org/video/uprooting-the-leading-causes-of-death/'),
})
self._test_queries('beir/nfcorpus/test', count=323, items={
0: BeirUrlQuery('PLAIN-2', 'Do Cholesterol Statin Drugs Cause Breast Cancer?', 'http://nutritionfacts.org/2015/07/16/do-cholesterol-statin-drugs-cause-breast-cancer/'),
9: BeirUrlQuery('PLAIN-102', 'Stopping Heart Disease in Childhood', 'http://nutritionfacts.org/2014/07/15/stopping-heart-disease-in-childhood/'),
322: BeirUrlQuery('PLAIN-3472', 'How Doctors Responded to Being Named a Leading Killer', 'http://nutritionfacts.org/video/how-doctors-responded-to-being-named-a-leading-killer/'),
})
self._test_queries('beir/nfcorpus/train', count=2590, items={
0: BeirUrlQuery('PLAIN-3', 'Breast Cancer Cells Feed on Cholesterol', 'http://nutritionfacts.org/2015/07/14/breast-cancer-cells-feed-on-cholesterol/'),
9: BeirUrlQuery('PLAIN-15', 'Why Do Heart Doctors Favor Surgery and Drugs Over Diet?', 'http://nutritionfacts.org/2015/06/02/why-do-heart-doctors-favor-surgery-and-drugs-over-diet/'),
2589: BeirUrlQuery('PLAIN-3474', 'Fish Consumption and Suicide', 'http://nutritionfacts.org/video/fish-consumption-and-suicide/'),
})
self._test_queries('beir/nq', count=3452, items={
0: GenericQuery('test0', 'what is non controlling interest on balance sheet'),
9: GenericQuery('test9', 'who makes the decisions about what to produce in a market economy'),
3451: GenericQuery('test3451', 'when will notre dame played michigan state again'),
})
self._test_queries('beir/quora', count=15000, items={
0: GenericQuery('318', 'How does Quora look to a moderator?'),
9: GenericQuery('784', 'Why should one hate Shahrukh Khan?'),
14999: GenericQuery('537876', 'How do Russian politics and geostrategy affect Australia and New Zealand?'),
})
self._test_queries('beir/quora/dev', count=5000, items={
0: GenericQuery('318', 'How does Quora look to a moderator?'),
9: GenericQuery('784', 'Why should one hate Shahrukh Khan?'),
4999: GenericQuery('537790', 'What are the most interesting books on the side of atheism?'),
})
self._test_queries('beir/quora/test', count=10000, items={
0: GenericQuery('46', 'Which question should I ask on Quora?'),
9: GenericQuery('616', 'Which are the best books to understand calculus?'),
9999: GenericQuery('537876', 'How do Russian politics and geostrategy affect Australia and New Zealand?'),
})
self._test_queries('beir/scidocs', count=1000, items={
0: BeirSciQuery('78495383450e02c5fe817e408726134b3084905d', 'A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect', ['50306438', '15303316', '1976596'], 2014, ['38e78343cfd5c013decf49e8cf008ddf6458200f'], ['632589828c8b9fca2c3a59e97451fde8fa7d188d', '4cf296b9d4ef79b838dc565e6e84ab9b089613de', '86e87db2dab958f1bd5877dc7d5b8105d6e31e46', '4b031fa8bf63e17e2100cf31ba6e11d8f80ff2a8', 'a718c6ca7a1db49bb2328d43f775783e8ec6f985', 'cf51cfb5b221500b882efee60b794bc11635267e', '6329874126a4e753f98c40eaa74b666d0f14eaba', 'a27b6025d147febb54761345eafdd73954467aca']),
9: BeirSciQuery('ae0fb9c6ebb8ce12610c477d2388447a13dc4694', 'Distributed Privacy-Preserving Collaborative Intrusion Detection Systems for VANETs', ['49104949', '1709793'], 2018, ['a1e81122931a5e96ced6569d0ee22b174db1ebb7', '96bbb9c86cdd9d19643686f623898367f9efb0bc', '228c40580e888fc9df003a16b8b7abb5d854a6eb', 'ab95903604d7fb8c03148b1a4f56af3c6de6fde1', '4875ac38970c742d6bfa760ca26ab7a629fde8da'], ['24d800e6681a129b7787cbb05d0e224acad70e8d', '216d7c407109f5557ae525b292856c4ab56996ca', '6e63c4a8320be712e3067eef3f042bb3df38a8e1', '49934d08d42ed9e279a82cbad2086377443c8a75', 'b45d9d5957416f363635025630d53bf593d3dd5c', '11861442e7b59669d630aed8c3b5d5290a70687e', '0dacd4593ba6bce441bae37fc3ff7f3b70408ee1', '8ef2a5e3dffb0a155a14575c8333b175b61e0675', '32334506f746e83367cecb91a0ab841e287cd958', '61efdc56bc6c034e9d13a0c99d0b651a78bfc596']),
999: BeirSciQuery('89e58773fa59ef5b57f229832c2a1b3e3efff37e', 'Analyzing EEG signals to detect unexpected obstacles during walking', ['2492849', '6622542', '2927560', '40259975', '3334492', '46629632'], 2015, ['37512f0a2d5ea940f4debe84593ec2c054126c1e', '5181fe04756a2481d44bad5ec7f26461e41eaca0', '858e561895faadc6d6300948f06fd018a56c6775', '46f3cf9ff98c02b382079ec2d514c47379c3ffaa', '26fc69fb8cc5969b515e3b7d2bdc6ff83f68ac58', 'f9e11a43ccb47b58bc08937750f65d6306e6961a', 'fcd16ea07b9f35a851444f9933ca72535015d46c', '5fc1491937224b215a543196fe2514794b329c03', 'ac9e0bb99f12d697137b2373e1d5ba6f8babf355'], ['f1277592f221ea26fa1d2321a38b64c58b33d75b', '42ad00c8ed436f6b8f0a4a73f55018210181e4a3', '22ff979fafd58acea3b838036fdc55ed60b1a265', 'a20369f96ca4d73fbe25cc9e099b0f9ad57eb4a9', '94485df9a4a975ac8ae32e7f539c8a4f77d88f12', 'a5e6a3fb9bbfc4e494427b4f3a1782b9aefcab92', '4933491737750764aa304288f004f05a06f68704', 'f24a222de1e81b4dd5d9e3a6c5feb4499b095d4d', '57abdefc6e05d475cf5f34d190b8225a74de79f0', '0989bbd8c15f9aac24e8832327df560dc8ec5324']),
})
self._test_queries('beir/scifact', count=1109, items={
0: GenericQuery('0', '0-dimensional biomaterials lack inductive properties.'),
9: GenericQuery('15', '50% of patients exposed to radiation have activated markers of mesenchymal stem cells.'),
1108: GenericQuery('1395', 'p16INK4A accumulation is linked to an abnormal wound response caused by the microinvasive step of advanced Oral Potentially Malignant Lesions (OPMLs).'),
})
self._test_queries('beir/scifact/test', count=300, items={
0: GenericQuery('1', '0-dimensional biomaterials show inductive properties.'),
9: GenericQuery('51', 'ALDH1 expression is associated with better breast cancer outcomes.'),
299: GenericQuery('1395', 'p16INK4A accumulation is linked to an abnormal wound response caused by the microinvasive step of advanced Oral Potentially Malignant Lesions (OPMLs).'),
})
self._test_queries('beir/scifact/train', count=809, items={
0: GenericQuery('0', '0-dimensional biomaterials lack inductive properties.'),
9: GenericQuery('15', '50% of patients exposed to radiation have activated markers of mesenchymal stem cells.'),
808: GenericQuery('1407', 'β1/Ketel is able to bind microtubules.'),
})
self._test_queries('beir/trec-covid', count=50, items={
0: BeirCovidQuery('1', 'what is the origin of COVID-19', 'coronavirus origin', "seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans"),
9: BeirCovidQuery('10', 'has social distancing had an impact on slowing the spread of COVID-19?', 'coronavirus social distancing impact', "seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches"),
49: BeirCovidQuery('50', 'what is known about an mRNA vaccine for the SARS-CoV-2 virus?', 'mRNA vaccine coronavirus', 'Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.'),
})
self._test_queries('beir/webis-touche2020', count=49, items={
0: BeirToucheQuery('1', 'Should teachers get tenure?', "A user has heard that some countries do give teachers tenure and others don't. Interested in the reasoning for or against tenure, the user searches for positive and negative arguments. The situation of school teachers vs. university professors is of interest.", "Highly relevant arguments make a clear statement about tenure for teachers in schools or universities. Relevant arguments consider tenure more generally, not specifically for teachers, or, instead of talking about tenure, consider the situation of teachers' financial independence."),
9: BeirToucheQuery('10', 'Should any vaccines be required for children?', 'Anti-vaccination movements are on the rise, and so are pathogens like measles again. The freedom to not vaccinate paired with rampant disinformation may be a threat to society at large, and for children in particular. A users thus wonders, whether there are vaccines that should be mandatory.', 'Highly relevant arguments name one or more vaccines and reason about the (un)necessity to administer them to children. Relevant arguments talk about vaccination for children in general.'),
48: BeirToucheQuery('50', 'Should everyone get a universal basic income?', 'Redistribution of wealth is a fundamental concept of many economies and social systems. A key component might be a universal basic income, however, a user wonders whether this truly would help.', 'Highly relevant arguments take a clear stance toward the universal basic income, giving clear premises. Relevant arguments offer only emotional arguments, or talk about minimum wages, mentioning universal basic income only in passing.'),
})
self._test_queries('beir/webis-touche2020/v2', count=49, items={
0: BeirToucheQuery('1', 'Should teachers get tenure?', "A user has heard that some countries do give teachers tenure and others don't. Interested in the reasoning for or against tenure, the user searches for positive and negative arguments. The situation of school teachers vs. university professors is of interest.", "Highly relevant arguments make a clear statement about tenure for teachers in schools or universities. Relevant arguments consider tenure more generally, not specifically for teachers, or, instead of talking about tenure, consider the situation of teachers' financial independence."),
9: BeirToucheQuery('10', 'Should any vaccines be required for children?', 'Anti-vaccination movements are on the rise, and so are pathogens like measles again. The freedom to not vaccinate paired with rampant disinformation may be a threat to society at large, and for children in particular. A users thus wonders, whether there are vaccines that should be mandatory.', 'Highly relevant arguments name one or more vaccines and reason about the (un)necessity to administer them to children. Relevant arguments talk about vaccination for children in general.'),
48: BeirToucheQuery('50', 'Should everyone get a universal basic income?', 'Redistribution of wealth is a fundamental concept of many economies and social systems. A key component might be a universal basic income, however, a user wonders whether this truly would help.', 'Highly relevant arguments take a clear stance toward the universal basic income, giving clear premises. Relevant arguments offer only emotional arguments, or talk about minimum wages, mentioning universal basic income only in passing.'),
})
self._test_queries('beir/cqadupstack/android', count=699, items={
0: BeirCqaQuery('11546', 'Android chroot ubuntu - is it possible to get ubuntu to recognise usb devices', ['linux', 'development']),
9: BeirCqaQuery('20256', 'Does Android hide some amount of RAM from the User?', ['linux', 'development']),
698: BeirCqaQuery('61210', 'Can you remotely download AndroidLost to your phone if your phone battery is dead?', ['linux', 'development']),
})
self._test_queries('beir/cqadupstack/english', count=1570, items={
0: BeirCqaQuery('19399', 'Is "a wide range of features" singular or plural?', ['meaning', 'etymology', 'grammar', 'latin', 'roots']),
9: BeirCqaQuery('21616', 'How are "yes" and "no" formatted in sentences?', ['meaning', 'etymology', 'grammar', 'latin', 'roots']),
1569: BeirCqaQuery('76823', 'When to use articles and when not to?', ['meaning', 'etymology', 'grammar', 'latin', 'roots']),
})
self._test_queries('beir/cqadupstack/gaming', count=1595, items={
0: BeirCqaQuery('82449', 'Can the trophy system protect me against bullets?', ['skyrim']),
9: BeirCqaQuery('176686', 'Please instruct me on how to light myself on fire', ['skyrim']),
1594: BeirCqaQuery('146551', 'How can I fix a corrupted solo world?', ['skyrim']),
})
self._test_queries('beir/cqadupstack/gis', count=885, items={
0: BeirCqaQuery('52462', 'Calculating mean upslope aspect from each cell in DEM using Python?', ['gui']),
9: BeirCqaQuery('12833', 'How to smooth a DEM?', ['gui']),
884: BeirCqaQuery('104332', 'MODIS MOD13Q1 extract ndvi value', ['gui']),
})
self._test_queries('beir/cqadupstack/mathematica', count=804, items={
0: BeirCqaQuery('35544', 'How to use Automorphisms[] on a graph?', ['list-manipulation', 'performance-tuning', 'map']),
9: BeirCqaQuery('37414', 'limit calculation step by step', ['list-manipulation', 'performance-tuning', 'map']),
803: BeirCqaQuery('25260', 'NDSolve with vector function', ['list-manipulation', 'performance-tuning', 'map']),
})
self._test_queries('beir/cqadupstack/physics', count=1039, items={
0: BeirCqaQuery('110554', 'Magnetic field resistance material: are there any?', ['homework', 'newtonian-mechanics', 'friction']),
9: BeirCqaQuery('12012', 'Is spacetime simply connected?', ['homework', 'newtonian-mechanics', 'friction']),
1038: BeirCqaQuery('16082', 'How do I find the frictional force using a free body diagram?', ['homework', 'newtonian-mechanics', 'friction']),
})
self._test_queries('beir/cqadupstack/programmers', count=876, items={
0: BeirCqaQuery('88392', 'Why is closure important for JavaScript?', ['web-development', 'python', 'sql', 'concurrency', 'sqlite']),
9: BeirCqaQuery('210327', "What is the one or the few major changes from Java 6 to Java 7, couldn't JBoss do that already with Java 5?", ['web-development', 'python', 'sql', 'concurrency', 'sqlite']),
875: BeirCqaQuery('133937', 'Methods to rewrite a program', ['web-development', 'python', 'sql', 'concurrency', 'sqlite']),
})
self._test_queries('beir/cqadupstack/stats', count=652, items={
0: BeirCqaQuery('11546', 'Tool to confirm Gaussian fit', ['self-study', 'consistency']),
9: BeirCqaQuery('59955', 'Variance of superset from variance of subsets', ['self-study', 'consistency']),
651: BeirCqaQuery('35719', 'Improvement of regression model', ['self-study', 'consistency']),
})
self._test_queries('beir/cqadupstack/tex', count=2906, items={
0: BeirCqaQuery('197555', 'How can I learn to make my own packages?', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']),
9: BeirCqaQuery('57481', 'Aliasing issues using beamer with pdfLaTeX', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']),
2905: BeirCqaQuery('84944', 'How I can delete frametitle after pagebreak in mdframed box?', ['tikz-pgf', 'horizontal-alignment', 'tikz-trees']),
})
self._test_queries('beir/cqadupstack/unix', count=1072, items={
0: BeirCqaQuery('103549', 'Yanked USB Key During Move', ['shell', 'virtualization', 'storage', 'cluster']),
9: BeirCqaQuery('111331', 'Evolution of the shell', ['shell', 'virtualization', 'storage', 'cluster']),
1071: BeirCqaQuery('20536', 'reformatting output with aligned columns', ['shell', 'virtualization', 'storage', 'cluster']),
})
self._test_queries('beir/cqadupstack/webmasters', count=506, items={
0: BeirCqaQuery('28994', 'Someone else is using our Google Analytics Tracking code number. What do we do?', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']),
9: BeirCqaQuery('30705', 'Redirecting from blogger to custom domain', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']),
505: BeirCqaQuery('65733', 'Does removing ID from url improve SEO?', ['seo', 'keywords', 'tools', 'ranking', 'google-ranking']),
})
self._test_queries('beir/cqadupstack/wordpress', count=541, items={
0: BeirCqaQuery('120122', "How to enqueue script or style in a theme's template file?", ['theme-development', 'css', 'maximized-width']),
9: BeirCqaQuery('23263', 'Syntax highlighting for post/page editor', ['theme-development', 'css', 'maximized-width']),
540: BeirCqaQuery('90939', 'All-in-One Event Calendar: Custom Query - Getting each event Instance', ['theme-development', 'css', 'maximized-width']),
})
def test_qrels(self):
    """Spot-check qrel counts and sampled records for every BEIR subset.

    Each ``_test_qrels`` call asserts the total number of qrels for the
    named dataset and pins the records at a few fixed positions (first,
    tenth, last) as ``TrecQrel(query_id, doc_id, relevance, iteration)``
    tuples, guarding against silent changes in the downloaded data or in
    the parsing code.
    """
    self._test_qrels('beir/arguana', count=1406, items={
        0: TrecQrel('test-environment-aeghhgwpe-pro02a', 'test-environment-aeghhgwpe-pro02b', 1, '0'),
        9: TrecQrel('test-environment-assgbatj-pro01a', 'test-environment-assgbatj-pro01b', 1, '0'),
        1405: TrecQrel('test-society-epsihbdns-con01a', 'test-society-epsihbdns-con01b', 1, '0'),
    })
    self._test_qrels('beir/climate-fever', count=4681, items={
        0: TrecQrel('0', 'Habitat_destruction', 1, '0'),
        9: TrecQrel('9', 'Carbon_dioxide', 1, '0'),
        4680: TrecQrel('3134', 'Global_warming', 1, '0'),
    })
    # NOTE(review): the dbpedia-entity doc_ids below are recorded as empty
    # strings -- TODO confirm this matches the source qrels rather than a
    # fixture-generation artifact.
    self._test_qrels('beir/dbpedia-entity/dev', count=5673, items={
        0: TrecQrel('INEX_LD-2009096', '', 0, '0'),
        9: TrecQrel('INEX_LD-2009096', '', 0, '0'),
        5672: TrecQrel('TREC_Entity-17', '', 0, '0'),
    })
    self._test_qrels('beir/dbpedia-entity/test', count=43515, items={
        0: TrecQrel('INEX_LD-2009022', '', 0, '0'),
        9: TrecQrel('INEX_LD-2009022', '', 0, '0'),
        43514: TrecQrel('TREC_Entity-9', '', 0, '0'),
    })
    self._test_qrels('beir/fever/dev', count=8079, items={
        0: TrecQrel('137334', 'Soul_Food_(film)', 1, '0'),
        9: TrecQrel('105095', 'Carrie_Mathison', 1, '0'),
        8078: TrecQrel('46064', 'NAACP_Image_Award_for_Outstanding_Supporting_Actor_in_a_Drama_Series', 1, '0'),
    })
    self._test_qrels('beir/fever/test', count=7937, items={
        0: TrecQrel('163803', 'Ukrainian_Soviet_Socialist_Republic', 1, '0'),
        9: TrecQrel('54298', 'Electric_chair', 1, '0'),
        7936: TrecQrel('81957', 'Trouble_with_the_Curve', 1, '0'),
    })
    self._test_qrels('beir/fever/train', count=140085, items={
        0: TrecQrel('75397', 'Fox_Broadcasting_Company', 1, '0'),
        9: TrecQrel('226034', 'Tetris', 1, '0'),
        140084: TrecQrel('152180', 'Susan_Sarandon', 1, '0'),
    })
    self._test_qrels('beir/fiqa/dev', count=1238, items={
        0: TrecQrel('1', '14255', 1, '0'),
        9: TrecQrel('29', '189642', 1, '0'),
        1237: TrecQrel('11023', '579370', 1, '0'),
    })
    self._test_qrels('beir/fiqa/test', count=1706, items={
        0: TrecQrel('8', '566392', 1, '0'),
        9: TrecQrel('42', '331981', 1, '0'),
        1705: TrecQrel('11088', '437100', 1, '0'),
    })
    self._test_qrels('beir/fiqa/train', count=14166, items={
        0: TrecQrel('0', '18850', 1, '0'),
        9: TrecQrel('11', '596427', 1, '0'),
        14165: TrecQrel('11104', '518310', 1, '0'),
    })
    self._test_qrels('beir/hotpotqa/dev', count=10894, items={
        0: TrecQrel('5ae81fbf55429952e35eaa37', '6607768', 1, '0'),
        9: TrecQrel('5ae142a4554299422ee9964a', '1216600', 1, '0'),
        10893: TrecQrel('5a8bae0c5542996e8ac889b5', '690481', 1, '0'),
    })
    self._test_qrels('beir/hotpotqa/test', count=14810, items={
        0: TrecQrel('5a8b57f25542995d1e6f1371', '2816539', 1, '0'),
        9: TrecQrel('5a8e3ea95542995a26add48d', '5382358', 1, '0'),
        14809: TrecQrel('5ac132a755429964131be17c', '644341', 1, '0'),
    })
    self._test_qrels('beir/hotpotqa/train', count=170000, items={
        0: TrecQrel('5ab6d31155429954757d3384', '2921047', 1, '0'),
        9: TrecQrel('5adec8ad55429975fa854f8f', '202525', 1, '0'),
        169999: TrecQrel('5a7543b155429916b01642cd', '20527', 1, '0'),
    })
    self._test_qrels('beir/msmarco/dev', count=7437, items={
        0: TrecQrel('300674', '7067032', 1, '0'),
        9: TrecQrel('54544', '7068203', 1, '0'),
        7436: TrecQrel('195199', '8009377', 1, '0'),
    })
    self._test_qrels('beir/msmarco/test', count=9260, items={
        0: TrecQrel('19335', '1017759', 0, '0'),
        9: TrecQrel('19335', '1274615', 0, '0'),
        9259: TrecQrel('1133167', '977421', 0, '0'),
    })
    self._test_qrels('beir/msmarco/train', count=532751, items={
        0: TrecQrel('1185869', '0', 1, '0'),
        9: TrecQrel('186154', '1160', 1, '0'),
        532750: TrecQrel('405466', '8841735', 1, '0'),
    })
    self._test_qrels('beir/nfcorpus/dev', count=11385, items={
        0: TrecQrel('PLAIN-1', 'MED-2421', 2, '0'),
        9: TrecQrel('PLAIN-1', 'MED-4070', 1, '0'),
        11384: TrecQrel('PLAIN-3471', 'MED-5342', 2, '0'),
    })
    self._test_qrels('beir/nfcorpus/test', count=12334, items={
        0: TrecQrel('PLAIN-2', 'MED-2427', 2, '0'),
        9: TrecQrel('PLAIN-2', 'MED-2434', 1, '0'),
        12333: TrecQrel('PLAIN-3472', 'MED-3627', 1, '0'),
    })
    self._test_qrels('beir/nfcorpus/train', count=110575, items={
        0: TrecQrel('PLAIN-3', 'MED-2436', 1, '0'),
        9: TrecQrel('PLAIN-3', 'MED-2431', 1, '0'),
        110574: TrecQrel('PLAIN-3474', 'MED-4634', 1, '0'),
    })
    self._test_qrels('beir/nq', count=4201, items={
        0: TrecQrel('test0', 'doc0', 1, '0'),
        9: TrecQrel('test6', 'doc63', 1, '0'),
        4200: TrecQrel('test3451', 'doc117680', 1, '0'),
    })
    self._test_qrels('beir/quora/dev', count=7626, items={
        0: TrecQrel('318', '317', 1, '0'),
        9: TrecQrel('399', '364917', 1, '0'),
        7625: TrecQrel('537790', '537789', 1, '0'),
    })
    self._test_qrels('beir/quora/test', count=15675, items={
        0: TrecQrel('46', '134031', 1, '0'),
        9: TrecQrel('187', '188', 1, '0'),
        15674: TrecQrel('537876', '537875', 1, '0'),
    })
    self._test_qrels('beir/scidocs', count=29928, items={
        0: TrecQrel('78495383450e02c5fe817e408726134b3084905d', '632589828c8b9fca2c3a59e97451fde8fa7d188d', 1, '0'),
        9: TrecQrel('78495383450e02c5fe817e408726134b3084905d', '305c45fb798afdad9e6d34505b4195fa37c2ee4f', 0, '0'),
        29927: TrecQrel('89e58773fa59ef5b57f229832c2a1b3e3efff37e', 'dec997b20ebe2b867f68cc5c123d9cb9eafad6bb', 0, '0'),
    })
    self._test_qrels('beir/scifact/test', count=339, items={
        0: TrecQrel('1', '31715818', 1, '0'),
        9: TrecQrel('50', '12580014', 1, '0'),
        338: TrecQrel('1395', '17717391', 1, '0'),
    })
    self._test_qrels('beir/scifact/train', count=919, items={
        0: TrecQrel('0', '31715818', 1, '0'),
        9: TrecQrel('15', '22080671', 1, '0'),
        918: TrecQrel('1407', '29863668', 1, '0'),
    })
    self._test_qrels('beir/trec-covid', count=66336, items={
        0: TrecQrel('1', '005b2j4b', 2, '0'),
        9: TrecQrel('1', '05vx82oo', 0, '0'),
        66335: TrecQrel('50', 'zz8wvos9', 1, '0'),
    })
    # v1 of webis-touche2020 carries graded judgments including negative
    # relevance values (see the -2 below); v2 uses a different grading.
    self._test_qrels('beir/webis-touche2020', count=2962, items={
        0: TrecQrel('1', '197beaca-2019-04-18T11:28:59Z-00001-000', 4, '0'),
        9: TrecQrel('1', '24e47090-2019-04-18T19:22:46Z-00003-000', 3, '0'),
        2961: TrecQrel('50', '799d051-2019-04-18T11:47:02Z-00000-000', -2, '0'),
    })
    self._test_qrels('beir/webis-touche2020/v2', count=2214, items={
        0: TrecQrel('1', '197beaca-2019-04-18T11:28:59Z-00001-000', 0, '0'),
        9: TrecQrel('1', '4fb4627-2019-04-18T18:47:37Z-00003-000', 1, '0'),
        2213: TrecQrel('50', '4d1037f0-2019-04-18T11:08:29Z-00002-000', 2, '0'),
    })
    self._test_qrels('beir/cqadupstack/android', count=1696, items={
        0: TrecQrel('11546', '18572', 1, '0'),
        9: TrecQrel('82440', '78789', 1, '0'),
        1695: TrecQrel('61210', '61212', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/english', count=3765, items={
        0: TrecQrel('19399', '102236', 1, '0'),
        9: TrecQrel('19399', '4501', 1, '0'),
        3764: TrecQrel('76823', '31410', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/gaming', count=2263, items={
        0: TrecQrel('82449', '53562', 1, '0'),
        9: TrecQrel('46138', '42968', 1, '0'),
        2262: TrecQrel('146551', '28158', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/gis', count=1114, items={
        0: TrecQrel('52462', '49462', 1, '0'),
        9: TrecQrel('46866', '46762', 1, '0'),
        1113: TrecQrel('104332', '104331', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/mathematica', count=1358, items={
        0: TrecQrel('35544', '14789', 1, '0'),
        9: TrecQrel('48026', '47994', 1, '0'),
        1357: TrecQrel('25260', '26583', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/physics', count=1933, items={
        0: TrecQrel('110554', '21138', 1, '0'),
        9: TrecQrel('89378', '36242', 1, '0'),
        1932: TrecQrel('16082', '16081', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/programmers', count=1675, items={
        0: TrecQrel('88392', '203507', 1, '0'),
        9: TrecQrel('145437', '229691', 1, '0'),
        1674: TrecQrel('133937', '27335', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/stats', count=913, items={
        0: TrecQrel('11546', '66109', 1, '0'),
        9: TrecQrel('57083', '91074', 1, '0'),
        912: TrecQrel('35719', '35716', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/tex', count=5154, items={
        0: TrecQrel('197555', '12668', 1, '0'),
        9: TrecQrel('89372', '80', 1, '0'),
        5153: TrecQrel('84944', '84946', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/unix', count=1693, items={
        0: TrecQrel('103549', '2677', 1, '0'),
        9: TrecQrel('103549', '48253', 1, '0'),
        1692: TrecQrel('20536', '17664', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/webmasters', count=1395, items={
        0: TrecQrel('28994', '53865', 1, '0'),
        9: TrecQrel('11544', '52031', 1, '0'),
        1394: TrecQrel('65733', '65118', 1, '0'),
    })
    self._test_qrels('beir/cqadupstack/wordpress', count=744, items={
        0: TrecQrel('120122', '21561', 1, '0'),
        9: TrecQrel('114225', '78428', 1, '0'),
        743: TrecQrel('90939', '105803', 1, '0'),
    })
})
# Allow running this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()
================================================
FILE: test/integration/c4.py
================================================
import re
import unittest
from ir_datasets.datasets.c4 import C4Doc, MisinfoQuery
from .base import DatasetIntegrationTest
class TestCar(DatasetIntegrationTest):
    """Integration tests for the ``c4/en-noclean-tr`` dataset registration."""
    # NOTE(review): class name looks copy-pasted from test/integration/car.py --
    # this module tests C4, so "TestC4" would be clearer. Renaming keeps
    # unittest discovery working (Test* prefix) but would break any explicit
    # ``...c4.TestCar`` selectors, so flagging instead of changing.

    def test_docs(self):
        """Verify sampled C4 documents: id, regex-matched body, URL, timestamp."""
        self._test_docs('c4/en-noclean-tr/trec-misinfo-2021', items={
            0: C4Doc('en.noclean.c4-train.00000-of-07168.0', re.compile('^November 24, 2016 – World News, Breaking News\nWednesday, April 24, 2019\nLatest:\nFitbit introduced “s.{3832}World News, Breaking News\\. All rights reserved\\.\nTheme: ColorMag by ThemeGrill\\. Powered by WordPress\\.$', flags=48), 'http://sevendaynews.com/2016/11/24/', '2019-04-24T16:35:11Z'),
            9: C4Doc('en.noclean.c4-train.00000-of-07168.9', re.compile('^Best Books Market\nBest Books Market\nCategories\nBook\nToy\nfree ftp mac client :: :: эффективные средст.{735}e Eleven Rival Regional Cultures of North America\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\nNext\n© 2019 Best Books Market$', flags=48), 'http://books.amzgr.com/', '2019-04-26T05:44:42Z'),
            99: C4Doc('en.noclean.c4-train.00000-of-07168.99', re.compile('^Volunteers Needed for Assembly for Children \\| Church of God of Prophecy\n\\+1\\.423\\.559\\.5100 info@cogop\\.o.{2453}ture\nSocial Media\nResources\nAssembly\nTreasurer’s Report\nFacebook\nInstagram\nVimeo\nYoutube\nTwitter\nRSS$', flags=48), 'https://cogop.org/blog/volunteers-needed-for-assembly-for-children/', '2019-04-26T08:07:35Z'),
            999: C4Doc('en.noclean.c4-train.00000-of-07168.999', re.compile('^타임폴리오자산운용\nINVESTMENT HIGHLIGHTS\nMulti Manager\nMulti Asset\nMulti Strategy\nTMS\nQAS\nMMS\nCOMPANY\nIntrodu.{300}x\\] 해상도에 최적화 되어 있습니다\\.\nTel\\. \\(02\\) 533\\-8940\nFax\\. \\(02\\) 534\\-3305\nE\\-mail\\. tf@timefolio\\.co\\.kr\n개인정보처리방침 > TOP$', flags=48), 'http://timefolio.co.kr/gallery/gallery_view.php?num=39&page=1&search_keyword=&search_field=', '2019-04-25T06:54:54Z'),
            9999: C4Doc('en.noclean.c4-train.00000-of-07168.9999', re.compile('^Unex Avid Juicy 3/5/7/Ca Metal Ceramic Disc Brake Pad\nJavaScript seems to be disabled in your browse.{10024}rice\\}\\}\nApply\nCancel\n\\{\\{carrier\\.method_title\\}\\}\n\\+ \\{\\{\\$parent\\.currency\\}\\}\\{\\{carrier\\.price\\}\\}\nApply\nCancel\n\\-\\-$', flags=48), 'https://www.bicyclehero.com/us/unex-avid-juicy-3-5-7-ca-metal-ceramic-disc-brake-pad.html', '2019-04-23T16:41:09Z'),
            99999: C4Doc('en.noclean.c4-train.00000-of-07168.99999', re.compile("^The truth about SHA1, SHA\\-256, dual\\-signing and Code Signing Certificates : K Software\nWelcome to th.{5597}ouldn't be helpful\\. Help us improve this article with your feedback\\.\nRelated Articles\nHome Solutions$", flags=48), 'https://support.ksoftware.net/support/solutions/articles/215805-the-truth-about-sha1-sha-256-dual-signing-and-code-signing-certificates-', '2019-04-20T09:00:23Z'),
            # Cross-shard sample: document 999999 falls in shard 00006.
            999999: C4Doc('en.noclean.c4-train.00006-of-07168.109537', re.compile('^Results \\- Race Walking Association\nHome \\| Fixtures \\| Results \\| Rankings \\| Athletes \\| Clubs \\| Newslet.{400}und points: 0\n2012: 2 races 2,000 metres completed\\.\n\\(c\\) RACE WALKING ASSOCIATION 1907 \\- 2019 Sitemap$', flags=48), 'http://racewalkingassociation.com/AthleteDetails.asp?mode=edit&id=11300&athlete=Emily_Wyman', '2019-04-25T20:24:19Z'),
        })

    def test_queries(self):
        """Verify the 50 TREC Misinformation 2021 topics (fields regex-matched)."""
        self._test_queries('c4/en-noclean-tr/trec-misinfo-2021', count=50, items={
            0: MisinfoQuery('101', re.compile('ankl.*nitis', flags=48), re.compile('Will.*kle bra.*heal ac.*', flags=48), re.compile('Achil.*kle braces, or both.', flags=48), re.compile('We do no.*professional advice.', flags=48), 'unhelpful', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3134723/'),
            9: MisinfoQuery('110', re.compile('birt.*tment', flags=48), re.compile('Wil.*trol pil.*arian c.*', flags=48), re.compile('Functi.*lth issues, or both.', flags=48), re.compile('We do no.*professional advice.', flags=48), 'unhelpful', 'https://pubmed.ncbi.nlm.nih.gov/24782304/'),
            49: MisinfoQuery('150', re.compile('antiox.*ity', flags=48), re.compile('Wil.*oxida.*ments.*blems.* ', flags=48), re.compile("Coupl.*ether. ", flags=48), re.compile('We do no.*professional advice.', flags=48), 'unhelpful', 'https://pubmed.ncbi.nlm.nih.gov/32851663/'),
        })
# Allow running this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()
================================================
FILE: test/integration/car.py
================================================
import re
import unittest
from ir_datasets.datasets.car import CarQuery
from ir_datasets.formats import TrecQrel, GenericDoc
from .base import DatasetIntegrationTest
class TestCar(DatasetIntegrationTest):
    """Integration tests for the TREC CAR (Complex Answer Retrieval) datasets.

    Covers both paragraph-collection versions (v1.5 and v2.0) and the
    v1.5 query/qrel subsets (trec-y1, test200, train folds 0-4).
    """

    def test_docs(self):
        """Verify corpus sizes and sampled paragraphs for CAR v1.5 and v2.0."""
        self._test_docs('car/v1.5', count=29678367, items={
            0: GenericDoc('0000000e7e72cafb61a9f356b7dceb25c5e028db', re.compile("^Ukraine was one of the most dangerous places for journalists in the world during the euromaidan demo.{311}ened in Donetsk in April 2014\\. In July 2014 a firebomb was thrown at the TV channel ''112 Ukraine''\\.$", flags=48)),
            9: GenericDoc('000006d5c22f4efbb6b963ea819e976a4b28600b', re.compile('^To mark the 40th anniversary of "Bohemian Rhapsody", the song was released on a limited edition 12" .{174}on CD, DVD \\& Blu\\-ray\\. This includes the first ever live recorded performance of "Bohemian Rhapsody"\\.$', flags=48)),
            29678366: GenericDoc('ffffffb9eec6224bef5da06e829eef59a37748c6', re.compile('^Fisher recommended Louis as First Sea Lord: "He is the most capable administrator in the Admiralty\'s.{472}that would prepare the navy\'s plans in case of war\\. He was promoted to full admiral on 13 July 1912\\.$', flags=48)),
        })
        self._test_docs('car/v2.0', count=29794697, items={
            0: GenericDoc('00000047dc43083f49b68399c6deeed5c0e81c1f', re.compile('^On 28 October 1943, Fuller sailed from Efate, New Hebrides, for the initial landings on Bougainville.{456}damage, and twice more during the following month and a half carried reinforcements to Bougainville\\.$', flags=48)),
            9: GenericDoc('0000070402dbaf074bc1e3ba487036322ef8ce86', re.compile('^In 1662, the then Governor of Jamaica, Lord Windsor, received royal instructions to protect the "Ca.{527} its landward side to five feet on its seaward side, with the walls being about five feet in height\\.$', flags=48)),
            29794696: GenericDoc('ffffffb9eec6224bef5da06e829eef59a37748c6', re.compile('^Fisher recommended Louis as First Sea Lord: "He is the most capable administrator in the Admiralty\'s.{472}that would prepare the navy\'s plans in case of war\\. He was promoted to full admiral on 13 July 1912\\.$', flags=48)),
        })

    def test_queries(self):
        """Verify query counts and sampled CarQuery records.

        CarQuery fields: (query_id, text, title, headings tuple); query_ids
        are percent-encoded Wikipedia page/heading paths.
        """
        self._test_queries('car/v1.5/trec-y1', count=2287, items={
            0: CarQuery('Fudge/History', 'Fudge History', 'Fudge', ('History',)),
            9: CarQuery('Glass%20ceiling/Glass%20Ceiling%20Index', 'Glass ceiling Glass Ceiling Index', 'Glass ceiling', ('Glass Ceiling Index',)),
            2286: CarQuery('Global%20catastrophic%20risk/Organizations', 'Global catastrophic risk Organizations', 'Global catastrophic risk', ('Organizations',)),
        })
        self._test_queries('car/v1.5/test200', count=1987, items={
            0: CarQuery('Hog-dog%20rodeo/Typical%20match', 'Hog-dog rodeo Typical match', 'Hog-dog rodeo', ('Typical match',)),
            9: CarQuery('Infield%20fly%20rule/The%20rule/Foul%20balls', 'Infield fly rule The rule Foul balls', 'Infield fly rule', ('The rule', 'Foul balls')),
            1986: CarQuery('Structural%20information%20theory/Visual%20regularity', 'Structural information theory Visual regularity', 'Structural information theory', ('Visual regularity',)),
        })
        self._test_queries('car/v1.5/train/fold0', count=467946, items={
            0: CarQuery('Kindertotenlieder/Text%20and%20music', 'Kindertotenlieder Text and music', 'Kindertotenlieder', ('Text and music',)),
            9: CarQuery('Northrop%20YB-35/Variants', 'Northrop YB-35 Variants', 'Northrop YB-35', ('Variants',)),
            467945: CarQuery('1987%E2%80%9388%20Greek%20Cup/Final', '1987–88 Greek Cup Final', '1987–88 Greek Cup', ('Final',)),
        })
        self._test_queries('car/v1.5/train/fold1', count=466596, items={
            0: CarQuery('Roderick%20Spode/Overview', 'Roderick Spode Overview', 'Roderick Spode', ('Overview',)),
            9: CarQuery('Alan%20Hale%20Jr./Personal%20life', 'Alan Hale Jr. Personal life', 'Alan Hale Jr.', ('Personal life',)),
            466595: CarQuery('Brian%20Eno/Personal%20life%20and%20beliefs', 'Brian Eno Personal life and beliefs', 'Brian Eno', ('Personal life and beliefs',)),
        })
        self._test_queries('car/v1.5/train/fold2', count=469323, items={
            0: CarQuery('Lost%20in%20Space%20(film)/Plot', 'Lost in Space (film) Plot', 'Lost in Space (film)', ('Plot',)),
            9: CarQuery('Dick%20&%20Dom%20in%20da%20Bungalow/Bungalow%20Games/Forfeit%20Auction', 'Dick & Dom in da Bungalow Bungalow Games Forfeit Auction', 'Dick & Dom in da Bungalow', ('Bungalow Games', 'Forfeit Auction')),
            469322: CarQuery('Erick%20van%20Egeraat/Awards%20and%20recognition', 'Erick van Egeraat Awards and recognition', 'Erick van Egeraat', ('Awards and recognition',)),
        })
        self._test_queries('car/v1.5/train/fold3', count=463314, items={
            0: CarQuery('Bradford,%20Ontario/History', 'Bradford, Ontario History', 'Bradford, Ontario', ('History',)),
            9: CarQuery('CBBC/Scheduling', 'CBBC Scheduling', 'CBBC', ('Scheduling',)),
            463313: CarQuery('Br%C3%BCel%20&%20Kj%C3%A6r/Organisational%20developments', 'Brüel & Kjær Organisational developments', 'Brüel & Kjær', ('Organisational developments',)),
        })
        self._test_queries('car/v1.5/train/fold4', count=468789, items={
            0: CarQuery('Status%20symbol/By%20region%20and%20time', 'Status symbol By region and time', 'Status symbol', ('By region and time',)),
            9: CarQuery('History%20of%20Greece/Ancient%20Greece%20(1100%E2%80%93146%20BC)/Iron%20Age%20(1100%E2%80%93800%20BC)', 'History of Greece Ancient Greece (1100–146 BC) Iron Age (1100–800 BC)', 'History of Greece', ('Ancient Greece (1100–146 BC)', 'Iron Age (1100–800 BC)')),
            468788: CarQuery('Manchester%20International%20Organ%20Competition/1986%20-%20Fifth%20competition', 'Manchester International Organ Competition 1986 - Fifth competition', 'Manchester International Organ Competition', ('1986 - Fifth competition',)),
        })

    def test_qrels(self):
        """Verify qrel counts and sampled records for the v1.5 subsets.

        The trec-y1/manual qrels are graded and include negative values
        (see the -1 judgments below); all other subsets shown use binary
        relevance.
        """
        self._test_qrels('car/v1.5/trec-y1/auto', count=5820, items={
            0: TrecQrel('Aftertaste/Aftertaste%20processing%20in%20the%20cerebral%20cortex', '38c1bd25ddca2705164677a3f598c46df85afba7', 1, '0'),
            9: TrecQrel('Aftertaste/Temporal%20taste%20perception', '8a41a87100d139bb9c108c8cab2ac3baaabea3ce', 1, '0'),
            5819: TrecQrel('Yellowstone%20National%20Park/Recreation', 'e80b5185da1493edde41bea19a389a3f62167369', 1, '0'),
        })
        self._test_qrels('car/v1.5/trec-y1/manual', count=29571, items={
            0: TrecQrel('Hadley%20cell/Hadley%20cell%20expansion', '389c8a699f4db2f0278700d1c32e63ac369906cd', -1, '0'),
            9: TrecQrel('Water%20cycle/Effects%20on%20biogeochemical%20cycling', '844a0a0d5860ff1da8a9fcfb16cc4ce04ffb963f', 1, '0'),
            29570: TrecQrel('Rancidification/Reducing%20rancidification', '20a4e9af2853803a08854a1cc8973534e2235658', -1, '0'),
        })
        self._test_qrels('car/v1.5/test200', count=4706, items={
            0: TrecQrel('ASME/ASME%20codes%20and%20standards', '16d8f62407d2cdd283a71735e5c83f7d7947b93a', 1, '0'),
            9: TrecQrel('Activity%20theory/An%20explanation', 'c0ee784b8f0eb3b80aaf85f42d5148655192cc1d', 1, '0'),
            4705: TrecQrel('Zang-fu/Yin/yang%20and%20the%20Five%20Elements', 'fe6f4dd186037e09bf00f0f08bf172babac7930b', 1, '0'),
        })
        self._test_qrels('car/v1.5/train/fold0', count=1054369, items={
            0: TrecQrel("$pread/''$pread''%20Book", '2f545ffad1581dea4a2e4720aa9feb7389e1956a', 1, '0'),
            9: TrecQrel('%22Wild%20Bill%22%20Hickok/Death/Burial', '528b68a3355672c9b8bd5003428b72f54074b3fb', 1, '0'),
            1054368: TrecQrel('Zygmunt%20Szcz%C4%99sny%20Feli%C5%84ski/Views%20on%20Poland', 'fd77154f625ca721e554cbd0e4f33b51d4d92af6', 1, '0'),
        })
        self._test_qrels('car/v1.5/train/fold1', count=1052398, items={
            0: TrecQrel('$100,000%20infield/Eddie%20Collins', 'c7aa3c7821a112a149d85f650cbca4ec23c63617', 1, '0'),
            9: TrecQrel("%60Abdu'l-Bah%C3%A1/Acre/Marriage%20and%20family%20life", '4da4ea634ccae1173e553129b368e95962969ec8', 1, '0'),
            1052397: TrecQrel('Zygosity/Types/Nullizygous', '36186e2655db62fd9c31701302f86636b03d2511', 1, '0'),
        })
        self._test_qrels('car/v1.5/train/fold2', count=1061162, items={
            0: TrecQrel("$h*!%20My%20Dad%20Says/''Surviving%20Jack''", 'dc4866e5b230ffb48b6f808f41ccf8063fbdc9fa', 1, '0'),
            9: TrecQrel('%22Left-Wing%22%20Communism:%20An%20Infantile%20Disorder/%22Left-wing%22%20communism%20in%20Germany', '22ec581e3e1c5397e64bc6f0066dc8aea12fc71f', 1, '0'),
            1061161: TrecQrel('ZynAddSubFX/Windows%20version', 'b9d1be10b54e5efcbf3e6f1e5f2fbaf7c8af303c', 1, '0'),
        })
        self._test_qrels('car/v1.5/train/fold3', count=1046784, items={
            0: TrecQrel('$2%20billion%20arms%20deal/Confessional%20statements', '0e512b5962fa5ea838a578cbf414ae09b863a33f', 1, '0'),
            9: TrecQrel('$2%20billion%20arms%20deal/Investigative%20committee', '812cb64a35f482bd60f82c1d67204c73612cb6a7', 1, '0'),
            1046783: TrecQrel('Zyuden%20Sentai%20Kyoryuger/Video%20game', '844b90cf6f7c62e5bf51625a4d216baec2825bf9', 1, '0'),
        })
        self._test_qrels('car/v1.5/train/fold4', count=1061911, items={
            0: TrecQrel('$1,000%20genome/Additional%20Resources', '67ea5eae967657a8f0282066e3086573e41726d5', 1, '0'),
            9: TrecQrel('$1,000%20genome/Commercial%20efforts', 'a7ac9041cd833d6b09cc5270b495e9f94704027f', 1, '0'),
            1061910: TrecQrel('Zyron/Products', 'f355f98b4e3d5b08f60abe61022e9393202b9718', 1, '0'),
        })
# Allow running this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()
================================================
FILE: test/integration/clinicaltrials.py
================================================
import re
import unittest
import ir_datasets
from ir_datasets.datasets.clinicaltrials import ClinicalTrialsDoc
from ir_datasets.datasets.medline import TrecPmQuery, TrecPm2017Query
from ir_datasets.formats import GenericQuery, TrecQrel
from .base import DatasetIntegrationTest
_logger = ir_datasets.log.easy()
class TestClinicalTrials(DatasetIntegrationTest):
def test_docs(self):
    """Spot-check document counts and sampled trial records for the
    2017, 2019, and 2021 ClinicalTrials.gov snapshots.

    ClinicalTrialsDoc fields (as constructed here): doc_id (NCT id),
    title, a field recorded empty in all samples, summary, detailed
    description, and eligibility criteria; long free-text fields are
    matched with anchored regexes rather than full literals.
    """
    self._test_docs('clinicaltrials/2017', count=241006, items={
        0: ClinicalTrialsDoc('NCT00530868', 'Comparing Letrozole Given Alone to Letrozole Given With Avastin in Post-Menopausal Women Breast Cancer', '', re.compile('^\n          This purpose of this trial is to show that the combination of Avastin and hormone therap.{3} should be more effective than hormone therapy alone for the treatment of breast cancer\\.\n      $', flags=48), re.compile('^\n          Preclinical and clinical data have demonstrated that up\\-regulation of tumor cell VEGF is.{329}ould be more effective than hormonal therapy alone for the\n      treatment of breast cancer\\.\n    $', flags=48), re.compile('^\n        Inclusion Criteria:\n\n          All patients must meet the following criteria to be eli.{4158}and carcinoma in\\-situ of uterine cervix\\.\n\n          \\-  Patients with metastatic disease\\.\n      $', flags=48)),
        9: ClinicalTrialsDoc('NCT00530101', 'The Magnetic Resonance Imaging Evaluation of Doxorubicin Cardiotoxicity', '', re.compile('^\n      The purpose of this research study is to evaluate MR imaging in subjects receiving\n      .{166}ely 10 subjects over 12 months at the\n      University of Miami / Miller School of Medicine\\.\n    $', flags=48), re.compile('^\n      Doxorubicin \\(Adriamycin\\) is one of the most widely used chemotherapy agents, despite its.{9660}ocardium\\.\n\n      Medical records will provide data regarding cardiac morbidity or mortality\\.\n    $', flags=48), re.compile('^\n        Inclusion Criteria:\n\n          \\-  Subject must have breast cancer and undergoing rad.{112} \\-  Healthy subjects\n\n          \\-  Males\n\n          \\-  Subjects under the age of 18\n      $', flags=48)),
        241005: ClinicalTrialsDoc('NCT00074646', 'Phase I Trial of CC-8490 for the Treatment of Subjects With Recurrent/Refractory High-Grade Gliomas', '', '\n      Phase I trial of CC-8490 for the treatment of subjects with recurrent/refractory high-grade\n      gliomas\n    ', '', re.compile('^\n        Inclusion Criteria:\n\n          \\-  Patients with glioblastoma multiforme \\(GBM\\), glios.{4283}lism\\.\n\n          \\-  Use of other experimental study drug within 28 days of registration\\.\n      $', flags=48)),
    })
    self._test_docs('clinicaltrials/2019', count=306238, items={
        0: ClinicalTrialsDoc('NCT00704457', 'Impact Of Sacral Neuromodulation On Urine Markers For Interstitial Cystitis (IC)', '', '\n      Urine will be collected and sent to the University of Maryland. Urines will be analyzed for\n      urine markers.\n    ', re.compile('^\n      Urine will be collected and flash frozen in liquid nitrogen then placed in a \\-70 C freez.{376} in urine\n      marker levels will be analyzed and correlated with change in symptom scores\\.\n    $', flags=48), '\n        Inclusion Criteria:\n\n          -  Patients will be drawn from Dr. Peters patient base that covers Southeast Michigan.\n\n        Exclusion Criteria:\n\n          -  Male\n      '),
        9: ClinicalTrialsDoc('NCT00705887', 'A Motivational Enhancement Approach to Skin Cancer Prevention', '', re.compile('^\n      The specific aims of this research are:\n\n      Aim 1 \\- To describe the UV protection beh.{579} protection stages of change, UV protection self\\-efficacy, and UV protection attitudes\\.\n    $', flags=48), re.compile('^\n      Although skin cancer is the most common form of cancer in the United States, it is highl.{806}nvestigated the application of these techniques to\n      skin cancer prevention discussions\\.\n    $', flags=48), re.compile('^\n        Inclusion Criteria:\n\n          \\-  Dermatology patient presenting for scheduled appoi.{157}lish\n\n          \\-  Having previously received medical treatment from the interventionist\n      $', flags=48)),
        306237: ClinicalTrialsDoc('NCT03548415', 'Safety, Tolerability, and Efficacy of IONIS-GHR-LRx in up to 42 Adult Patients With Acromegaly Being Treated With Long-acting Somatostatin Receptor Ligands', '', '\n      The purpose is to assess the Safety, Tolerability, and Efficacy of IONIS-GHR-LRx in up to 42\n      Patients with Acromegaly\n    ', re.compile('^\n      This short\\-term study will assess changes in serum insulin\\-like growth factor 1 \\(IGF\\-1\\) .{68}sed with Acromegaly being treated\n      with Long\\-acting Somatostatin Receptor Ligands \\(SRL\\)\n    $', flags=48), re.compile('^\n        Inclusion Criteria:\n\n          1\\. Males or females with documented diagnosis of Acro.{2002} stable dose and regimen for >= 3 months prior to screening and throughout the trial\n      $', flags=48)),
    })
    self._test_docs('clinicaltrials/2021', count=375580, items={
        0: ClinicalTrialsDoc('NCT00000102', 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets', '', re.compile('^\n      This study will test the ability of extended release nifedipine \\(Procardia XL\\), a blood\\\r.{66}ucocorticoid medication children\\\r\n      take to treat congenital adrenal hyperplasia \\(CAH\\)\\.\\\r\n    $', flags=48), re.compile('^\n      This protocol is designed to assess both acute and chronic effects of the calcium channe.{716}e would, in turn, reduce the deleterious effects of glucocorticoid\\\r\n      treatment in CAH\\.\\\r\n    $', flags=48), re.compile('^\n        Inclusion Criteria:\\\r\n\\\r\n          \\-  diagnosed with Congenital Adrenal Hyperplasia \\(C.{126}ase, or elevated liver function tests\\\r\n\\\r\n          \\-  history of cardiovascular disease\\\r\n      $', flags=48)),
        9: ClinicalTrialsDoc('NCT00000113', 'Correction of Myopia Evaluation Trial (COMET)', '', re.compile('^\n      To evaluate whether progressive addition lenses \\(PALs\\) slow the rate of progression of\\\r\n.{291}opia in a group of children receiving\\\r\n      conventional treatment \\(single vision lenses\\)\\.\\\r\n    $', flags=48), re.compile('^\n      Myopia \\(nearsightedness\\) is an important public health problem, which entails substantia.{3027}e secondary outcome of the study is axial length measured by A\\-scan\\\r\n      ultrasonography\\.\\\r\n    $', flags=48), re.compile('^\n        Children between the ages of 6 and 12 years with myopia in both eyes \\(defined as sph.{484}ssive\\\r\n        addition lenses, or any conditions precluding adherence to the protocol\\.\\\r\n      $', flags=48)),
        375579: ClinicalTrialsDoc('NCT04862312', 'Video Chat During Meals to Improve Nutritional Intake in Older Adults', '', re.compile('^\n      The VideoDining study is a Stage IB behavioral intervention development project\\. The\\\r\n      .{184}to\\\r\n      evaluate changes in nutritional intake and loneliness in response to VideoDining\\.\\\r\n    $', flags=48), re.compile('^\n      The U\\.S\\. population is growing older and more adults are aging at home alone, by choice,.{1998}efinement of the\\\r\n      VideoDining intervention are additional key outcomes of this study\\.\\\r\n    $', flags=48), re.compile('^\n        Inclusion Criteria:\\\r\n\\\r\n          1\\. Receive Meals\\-on\\-Wheels meals from Foodnet in To.{370} and participate in the study\\.\\\r\n\\\r\n          5\\. Already own and use an Amazon Echo Show\\.\\\r\n    $', flags=48)),
    })
    def test_queries(self):
        """Spot-check query parsing for the TREC PM 2017-2019 and TREC CT 2021/2022 tracks.

        For each dataset id, verifies the total query count and the exact
        parsed content of the first, tenth, and last queries. Note that the
        first path component names the document collection, not the track
        year: the 2017 corpus also serves the 2018 track, and the 2021
        corpus also serves the 2022 track.
        """
        # TREC PM 2017 queries carry a fifth field (e.g. 'GERD') that later years drop.
        self._test_queries('clinicaltrials/2017/trec-pm-2017', count=30, items={
            0: TrecPm2017Query('1', 'Liposarcoma', 'CDK4 Amplification', '38-year-old male', 'GERD'),
            9: TrecPm2017Query('10', 'Lung adenocarcinoma', 'KRAS (G12C)', '61-year-old female', 'Hypertension, Hypercholesterolemia'),
            29: TrecPm2017Query('30', 'Pancreatic adenocarcinoma', 'RB1, TP53, KRAS', '57-year-old female', 'None'),
        })
        # 2018/2019 use the 4-field TrecPmQuery (disease, gene, demographic).
        self._test_queries('clinicaltrials/2017/trec-pm-2018', count=50, items={
            0: TrecPmQuery('1', 'melanoma', 'BRAF (V600E)', '64-year-old male'),
            9: TrecPmQuery('10', 'melanoma', 'KIT (L576P)', '65-year-old female'),
            49: TrecPmQuery('50', 'acute myeloid leukemia', 'FLT3', '13-year-old male'),
        })
        self._test_queries('clinicaltrials/2019/trec-pm-2019', count=40, items={
            0: TrecPmQuery('1', 'melanoma', 'BRAF (E586K)', '64-year-old female'),
            9: TrecPmQuery('10', 'mucosal melanoma', 'KIT (L576P), KIT amplification', '62-year-old female'),
            39: TrecPmQuery('40', 'malignant hyperthermia', 'RYR1', '54-year-old male'),
        })
        # TREC CT queries are free-text patient case descriptions (GenericQuery).
        self._test_queries('clinicaltrials/2021/trec-ct-2021', count=75, items={
            0: GenericQuery('1', '\nPatient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. \n'),
            9: GenericQuery('10', "\nPt is a 22yo F otherwise healthy with a 5 yr history of the systemic mastocytosis, with flares normally 3/year, presenting with flushing and tachycardia concerning for another flare. This is patient's 3rd flare in 2 months, while still on steroid taper which is new for her. She responded well to 125 mg IV steroids q 8 hrs and IV diphenydramine in addition to her continuing home regimen. CBC was at her baseline, w/normal differential. Serum tryptase revealed a high value at 84. The patient failed aspirin challenge due to adverse reaction. She was stabilized on IV steroids and IV benadryl and transferred back to the medical floor. She continued on her home histamine receptor blockers and was transitioned from IV to PO steroids and benadryl and observed overnight and was discharged on her home meds, prednisone taper, GI prophylaxis with PPI, Calcium and vitamin D, and SS bactrim for PCP.\n"),
            74: GenericQuery('75', "\nThe patient is a 55-year-old man who was recently diagnosed with Parkinson's disease. He is complaining of slowness of movement and tremors. His disease is ranked as mild, Hoehn-Yahr Stage I. His past medical history is significant for hypertension and hypercholesterolemia. He lives with his wife. They have three children. He used to be active with gardening before his diagnosis. He complains of shaking and slow movement. He had difficulty entering through a door, as he was frozen and needed guidance to step in. His handwriting is getting smaller. He is offered Levodopa and Trihexyphenidyl. He is an alert and cooperative man who does not have any signs of dementia. He does not smoke or use any illicit drugs.\n"),
        })
        self._test_queries('clinicaltrials/2021/trec-ct-2022', count=50, items={
            0: GenericQuery('1', '\nA 19-year-old male came to clinic with some sexual concern. He recently engaged in a relationship and is worried about the satisfaction of his girlfriend. He has a "baby face" according to his girlfriend\'s statement and he is not as muscular as his classmates. On physical examination, there is some pubic hair and poorly developed secondary sexual characteristics. He is unable to detect coffee smell during the examination, but the visual acuity is normal. Ultrasound reveals the testes volume of 1-2 ml. The hormonal evaluation showed serum testosterone level of 65 ng/dL with low levels of GnRH.\n'),
            9: GenericQuery('10', '\nA 19-year-old girl comes to the clinic due to a left wrist mass. She noticed swelling on the top of her wrist about 4 months ago and came to the clinic due to cosmetic concerns. Examination shows a nontender, rounded mass on the dorsal wrist that transilluminates with a penlight. Vital signs are normal. The patient needs to type on her computer almost all day. She is left-handed. She does not smoke or use illicit drugs. She is in sexual relationship with two male partners and uses condoms. \n'),
            49: GenericQuery('50', '\nA 70-year-old man comes to the office accompanied by his wife. The patient has experienced progressive memory loss over the last years. He needs help with some of his routine activities, such as paying bills. The patient\'s wife says, "He used to be such an independent person, but now he needs help with many things, even finding direction to home!" Medical history includes hypertension, hyperlipidemia, and type 2 diabetes mellitus. Family history includes Alzheimer disease in his father. MRI reveals diffuse cortical and hippocampal atrophy. The diagnosis of AD is made using the National Institute on Aging and the Alzheimer\'s Association (NIA-AA) criteria.\n'),
        })
def test_qrels(self):
self._test_qrels('clinicaltrials/2017/trec-pm-2017', count=13019, items={
0: TrecQrel('1', 'NCT00001188', 0, '0'),
9: TrecQrel('1', 'NCT00002898', 0, '0'),
13018: TrecQrel('30', 'NCT03080974', 0, '0'),
})
self._test_qrels('clinicaltrials/2017/trec-pm-2018', count=14188, items={
0: TrecQrel('1', 'NCT00001452', 0, '0'),
9: TrecQrel('1', 'NCT00341991', 0, '0'),
14187: TrecQrel('50', 'NCT03096782', 0, '0'),
})
self._test_qrels('clinicaltrials/2019/trec-pm-2019', count=12996, items={
0: TrecQrel('1', 'NCT00001685', 0, '0'),
9: TrecQrel('1', 'NCT00119249', 1, '0'),
12995: TrecQrel('40', 'NCT03955640', 0, '0'),
})
self._test_qrels('clinicaltrials/2021/trec-ct-2021', count=35832, items={
0: TrecQrel('1', 'NCT00002569', 1, '0'),
9: TrecQrel('1', 'NCT00003466', 0, '0'),
35831: TrecQrel('75', 'NCT04858074', 1, '0'),
})
# Allow running this test module directly (outside the full test runner).
if __name__ == '__main__':
    unittest.main()
================================================
FILE: test/integration/clirmatrix.py
================================================
import re
import unittest
import ir_datasets
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
from .base import DatasetIntegrationTest
_logger = ir_datasets.log.easy()
# Note: there are > 100k combinations here, so we are only testing a few cases
class TestCLIRMatrix(DatasetIntegrationTest):
    """Integration tests for the CLIRMatrix cross-lingual retrieval datasets.

    Dataset ids follow ``clirmatrix/<doc-lang>[/<variant>/<query-lang>/<split>]``:
    documents are in the first language and queries in the second (e.g.
    ``clirmatrix/af/bi139-base/en/train`` pairs Afrikaans documents with
    English queries). Only a few of the many language-pair combinations are
    exercised here.
    """
    def test_docs(self):
        """Verify per-language corpus sizes and the first/tenth/last documents.

        Long document bodies are matched with compiled regexes that pin a
        prefix, a fixed-length gap (e.g. ``.{873}``), and a suffix rather
        than the full text.
        """
        self._test_docs('clirmatrix/af', count=87705, items={
            0: GenericDoc('123393', 'Weeskindertjies (plant) weeskind'),
            9: GenericDoc('14515', re.compile('^Die Groot Beer \\(Latyn: Ursa Major\\) is ’n sterrebeeld wat heeljaar in die Noordelike Halfrond sigbaar.{873}8\xa0mag\\. 47\xa0Ursae Majoris het twee bevestigde planete, wat 2,54 en 0,76 keer die massa van Jupiter is\\.$', flags=48)),
            87704: GenericDoc('18801', re.compile('^Die Suid\\-Afrikaanse Leër is die landmagkomponent van die Suid\\-Afrikaanse Nasionale Weermag en van sy.{964}Amptelike webwerf Hierdie artikel is ’n saadjie\\. Voel vry om Wikipedia te help deur dit uit te brei\\.$', flags=48)),
        })
        self._test_docs('clirmatrix/en', count=5984197, items={
            0: GenericDoc('4274592', re.compile('^Transtar was the model name given to the line of trucks produced by the Studebaker Corporation of So.{910}asons, the Transtar name was dropped for the 1959 4E series Studebaker trucks and changed to Deluxe\\.$', flags=48)),
            9: GenericDoc('23065547', re.compile('^Standard sea\\-level conditions \\(SSL\\), also known as sea\\-level standard \\(SLS\\), defines a set of atmosp.{827}orda, Introduction to Aerospace Engineering with a Flight Test Perspective, John Wiley \\& Sons, 2017\\.$', flags=48)),
            5984196: GenericDoc('2160901', re.compile('^Resentment \\(also called ranklement or bitterness\\) is a complex, multilayered emotion that has been d.{1021}of by others; and having achievements go unrecognized, while others succeed without working as hard\\.$', flags=48)),
        })
        self._test_docs('clirmatrix/simple', count=153408, items={
            0: GenericDoc('12559', re.compile('^A superlative, in grammar, is an adjective describing a noun that is the best example of a given qua.{684}the adverb "most" before the adjective\\. For instance, you do not say "funnest," or "interestingest"\\.$', flags=48)),
            9: GenericDoc('120355', re.compile('^Occult refers to an area of knowledge or thought that is hidden\\. The word occult has many uses in th.{1069}pretation of Hinduism within Theosophy or the various occult interpretations of the Jewish Kabbalah\\.$', flags=48)),
            153407: GenericDoc('54463', re.compile('^The history of the Christian religion and the Christian church began with Jesus and his apostles\\. Ch.{934}t\\. Peter, was that they did not, and the matter was further addressed with the Council of Jerusalem\\.$', flags=48)),
        })
        self._test_docs('clirmatrix/zh', count=1089043, items={
            0: GenericDoc('449241', '虿盆,商朝时酷刑之一。将作弊官人跣剥干净,送下坑中,餵毒蛇、毒蝎等物。相传商朝最后一任君主纣王曾在大将黄飞虎之妻与纣王之妃子苏妲己发生口角之后将其推下虿盆,令其惨死。此刑罚在历史上使用较少。'),
            9: GenericDoc('664068', re.compile('^篡位是一個貶义詞,即不合法或有爭議地取得王位\\(皇位\\)。包括殺上任皇帝/太子/廢立/逼迫上現任皇帝或君主交出皇位 在非君主制语境下,亦可泛指非法谋夺更高权力的行为(例如違反憲法而推行独裁,或在權限以外越.{29}为在元武宗\\(1307年\\)至元寧宗\\(1332年\\)的25年間,竟然換了八個皇帝,当中有三位皇帝\\(元天順帝、元明宗、元寧宗\\)在位時間甚至不足一年。 在同一王朝中通过杀害或逼退合法继承人或在位者的篡位者 政变$', flags=48)),
            1089042: GenericDoc('6844113', re.compile('^谷風隧道為台灣的一條公路隧道,屬「台9線蘇花公路山區路段改善計劃」\\(蘇花改\\)南澳\\~和平段的其中一座隧道,北起鼓音橋,南接漢本高架橋,它穿越中央山脈鼓音溪至花蓮縣漢本的山區。谷風隧道南下及北上線均為45.{425}作、避難聯絡通道襯砌、通風隔板施作、新建通風機房,此外還須在避難聯絡通道內安裝照明系統及通訊設備,主隧道亦須安裝隧道照明燈具結線,安裝水霧支管,安裝噴流風機,此外隧道的所有土建工程及機電工程同步施工。$', flags=48)),
        })
    def test_queries(self):
        """Verify query counts and the first/tenth/last queries for sample splits.

        Covers the bi139-base train/dev/test1/test2 splits for one language
        pair, plus bi139-full and multi8 variants. Queries are Wikipedia
        titles in the query language (the second language in the id).
        """
        self._test_queries('clirmatrix/af/bi139-base/en/train', count=9999, items={
            0: GenericQuery('690', 'Aruba'),
            9: GenericQuery('5615', 'Cretaceous'),
            9998: GenericQuery('62732112', 'Efrain Gusquiza'),
        })
        self._test_queries('clirmatrix/af/bi139-base/en/dev', count=1000, items={
            0: GenericQuery('2038', 'August Horch'),
            9: GenericQuery('77606', 'Charles VIII of France'),
            999: GenericQuery('62708410', '2020 in Morocco'),
        })
        self._test_queries('clirmatrix/af/bi139-base/en/test1', count=1000, items={
            0: GenericQuery('3649', 'Geography of the British Virgin Islands'),
            9: GenericQuery('107443', 'Coalinga, California'),
            999: GenericQuery('62716625', 'Kevin Hall (disambiguation)'),
        })
        self._test_queries('clirmatrix/af/bi139-base/en/test2', count=1000, items={
            0: GenericQuery('6011', 'Chomsky hierarchy'),
            9: GenericQuery('97597', 'Flag of San Marino'),
            999: GenericQuery('62707449', 'Machiel Kiel'),
        })
        self._test_queries('clirmatrix/en/bi139-base/af/train', count=10000, items={
            0: GenericQuery('3', 'Lys van Afrikaanse skrywers'),
            9: GenericQuery('95', 'Geskiedenis'),
            9999: GenericQuery('285953', 'Jean-Claude Casadesus'),
        })
        # bi139-full includes many more queries than bi139-base for the same pair.
        self._test_queries('clirmatrix/en/bi139-full/af/train', count=58745, items={
            0: GenericQuery('3', 'Lys van Afrikaanse skrywers'),
            9: GenericQuery('26', 'Benue-Kongo-tale'),
            58744: GenericQuery('286010', 'Lugmag van die Volksbevrydingsleër'),
        })
        self._test_queries('clirmatrix/en/multi8/fr/train', count=10000, items={
            0: GenericQuery('45187', 'Mort'),
            9: GenericQuery('7740', 'Lituanie'),
            9999: GenericQuery('28573', 'Chiffres arabes'),
        })
        # multi8 shares the same query topics across languages: note fr->en and
        # de->en below yield identical English query texts with matching ids.
        self._test_queries('clirmatrix/fr/multi8/en/train', count=10000, items={
            0: GenericQuery('8221', 'Death'),
            9: GenericQuery('17675', 'Lithuania'),
            9999: GenericQuery('1786', 'Arabic numerals'),
        })
        self._test_queries('clirmatrix/de/multi8/en/train', count=10000, items={
            0: GenericQuery('8221', 'Death'),
            9: GenericQuery('17675', 'Lithuania'),
            9999: GenericQuery('1786', 'Arabic numerals'),
        })
    def test_qrels(self):
        """Verify qrel counts and first/tenth/last judgments for sample splits.

        Relevance grades range 0-6 in the checked records (graded, not binary).
        """
        self._test_qrels('clirmatrix/af/bi139-base/en/train', count=999900, items={
            0: TrecQrel('690', '14013', 6, '0'),
            9: TrecQrel('690', '15050', 0, '0'),
            999899: TrecQrel('62732112', '259879', 0, '0'),
        })
        self._test_qrels('clirmatrix/af/bi139-base/en/dev', count=100000, items={
            0: TrecQrel('2038', '13762', 3, '0'),
            9: TrecQrel('2038', '272786', 0, '0'),
            99999: TrecQrel('62708410', '258719', 0, '0'),
        })
        self._test_qrels('clirmatrix/af/bi139-base/en/test1', count=100000, items={
            0: TrecQrel('3649', '50129', 5, '0'),
            9: TrecQrel('3649', '93300', 0, '0'),
            99999: TrecQrel('62716625', '140128', 0, '0'),
        })
        self._test_qrels('clirmatrix/af/bi139-base/en/test2', count=100000, items={
            0: TrecQrel('6011', '11475', 6, '0'),
            9: TrecQrel('6011', '69338', 0, '0'),
            99999: TrecQrel('62707449', '112726', 0, '0'),
        })
        self._test_qrels('clirmatrix/en/bi139-base/af/train', count=1000000, items={
            0: TrecQrel('3', '1617690', 5, '0'),
            9: TrecQrel('3', '3943287', 3, '0'),
            999999: TrecQrel('285953', '43443609', 0, '0'),
        })
        self._test_qrels('clirmatrix/en/bi139-full/af/train', count=3011938, items={
            0: TrecQrel('3', '1617690', 5, '0'),
            9: TrecQrel('3', '3943287', 3, '0'),
            3011937: TrecQrel('286010', '400853', 1, '0'),
        })
        self._test_qrels('clirmatrix/en/multi8/fr/train', count=1000000, items={
            0: TrecQrel('45187', '49703357', 5, '0'),
            9: TrecQrel('45187', '12161221', 3, '0'),
            999999: TrecQrel('28573', '40255894', 0, '0'),
        })
        self._test_qrels('clirmatrix/fr/multi8/en/train', count=1000000, items={
            0: TrecQrel('8221', '45187', 6, '0'),
            9: TrecQrel('8221', '1331378', 4, '0'),
            999999: TrecQrel('1786', '9567503', 0, '0'),
        })
        self._test_qrels('clirmatrix/de/multi8/en/train', count=1000000, items={
            0: TrecQrel('8221', '5204', 6, '0'),
            9: TrecQrel('8221', '1092811', 4, '0'),
            999999: TrecQrel('1786', '10264293', 0, '0'),
        })
# Allow running this test module directly (outside the full test runner).
if __name__ == '__main__':
    unittest.main()
================================================
FILE: test/integration/clueweb09.py
================================================
import re
import unittest
import ir_datasets
from ir_datasets.datasets.clueweb09 import TrecWebTrackQuery, TrecPrel
from ir_datasets.formats import TrecQrel, TrecSubtopic, GenericDoc, GenericQuery, WarcDoc
from .base import DatasetIntegrationTest
_logger = ir_datasets.log.easy()
class TestClueWeb09(DatasetIntegrationTest):
def test_clueweb09_docs(self):
self._test_docs('clueweb09', items={
0: WarcDoc('clueweb09-ar0000-00-00000', 'http://0098shop.com/product_"EH24_A\'1C3_Forex_(\'2\'131E\'�G.html', '2009-03-84T15:35:08-0700', re.compile(b'^HTTP/1\\.1 200 OK\nServer: Apache/2\\.2\\.11 \\(Unix\\) mod_ssl/2\\.2\\.11 OpenSSL/0\\.9\\.8b DAV/2 mod_auth_passthroug.{92}\nConnection: close\nContent\\-Type: text/html\nDate: Fri, 27 Feb 2009 16:04:39 GMT\nContent\\-Length: 38889$', flags=16), re.compile(b'^\\\r\n\n\\\t\n