Full Code of allenai/ir_datasets for AI

master ae24b5302c56 cached
269 files
3.1 MB
812.3k tokens
2346 symbols
1 requests
Download .txt
Showing preview only (3,332K chars total). Download the full file or copy to clipboard to get everything.
Repository: allenai/ir_datasets
Branch: master
Commit: ae24b5302c56
Files: 269
Total size: 3.1 MB

Directory structure:
gitextract_2j6ggfs5/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── dataset-addition.md
│   │   ├── documentation.md
│   │   └── feature_request.md
│   └── workflows/
│       ├── deploy.yml
│       └── test.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples/
│   ├── adding_datasets.ipynb
│   ├── clirmatrix_example.py
│   ├── ir_datasets.ipynb
│   └── ir_datasets_cli.ipynb
├── ir_datasets/
│   ├── __init__.py
│   ├── __main__.py
│   ├── commands/
│   │   ├── __init__.py
│   │   ├── build_c4_checkpoints.py
│   │   ├── build_clueweb_warc_indexes.py
│   │   ├── build_download_cache.py
│   │   ├── clean.py
│   │   ├── doc_fifos.py
│   │   ├── export.py
│   │   ├── generate_metadata.py
│   │   ├── list.py
│   │   └── lookup.py
│   ├── datasets/
│   │   ├── __init__.py
│   │   ├── antique.py
│   │   ├── aol_ia.py
│   │   ├── aquaint.py
│   │   ├── argsme.py
│   │   ├── base.py
│   │   ├── beir.py
│   │   ├── c4.py
│   │   ├── car.py
│   │   ├── clinicaltrials.py
│   │   ├── clirmatrix.py
│   │   ├── clueweb09.py
│   │   ├── clueweb12.py
│   │   ├── codec.py
│   │   ├── codesearchnet.py
│   │   ├── cord19.py
│   │   ├── cranfield.py
│   │   ├── csl.py
│   │   ├── disks45.py
│   │   ├── dpr_w100.py
│   │   ├── gov.py
│   │   ├── gov2.py
│   │   ├── hc4.py
│   │   ├── highwire.py
│   │   ├── istella22.py
│   │   ├── kilt.py
│   │   ├── lotte.py
│   │   ├── medline.py
│   │   ├── miracl.py
│   │   ├── mmarco.py
│   │   ├── mr_tydi.py
│   │   ├── msmarco_document.py
│   │   ├── msmarco_document_v2.py
│   │   ├── msmarco_passage.py
│   │   ├── msmarco_passage_v2.py
│   │   ├── msmarco_qna.py
│   │   ├── nano_beir.py
│   │   ├── natural_questions.py
│   │   ├── neuclir.py
│   │   ├── neumarco.py
│   │   ├── nfcorpus.py
│   │   ├── nyt.py
│   │   ├── pmc.py
│   │   ├── sara.py
│   │   ├── touche.py
│   │   ├── touche_image.py
│   │   ├── trec_arabic.py
│   │   ├── trec_cast.py
│   │   ├── trec_fair.py
│   │   ├── trec_mandarin.py
│   │   ├── trec_robust04.py
│   │   ├── trec_spanish.py
│   │   ├── trec_tot.py
│   │   ├── trec_tot_2025.py
│   │   ├── tripclick.py
│   │   ├── tweets2013_ia.py
│   │   ├── vaswani.py
│   │   ├── wapo.py
│   │   ├── wikiclir.py
│   │   └── wikir.py
│   ├── docs/
│   │   ├── antique.yaml
│   │   ├── aol-ia.yaml
│   │   ├── aquaint.yaml
│   │   ├── argsme.yaml
│   │   ├── beir.yaml
│   │   ├── bibliography.bib
│   │   ├── c4.yaml
│   │   ├── car.yaml
│   │   ├── clinicaltrials.yaml
│   │   ├── clirmatrix.yaml
│   │   ├── clueweb09.yaml
│   │   ├── clueweb12.yaml
│   │   ├── codec.yaml
│   │   ├── codesearchnet.yaml
│   │   ├── cord19.yaml
│   │   ├── cranfield.yaml
│   │   ├── csl.yaml
│   │   ├── disks45.yaml
│   │   ├── dpr-w100.yaml
│   │   ├── gov.yaml
│   │   ├── gov2.yaml
│   │   ├── hc4.yaml
│   │   ├── highwire.yaml
│   │   ├── istella22.yaml
│   │   ├── kilt.yaml
│   │   ├── lotte.yaml
│   │   ├── medline.yaml
│   │   ├── miracl.yaml
│   │   ├── mmarco.yaml
│   │   ├── mr-tydi.yaml
│   │   ├── msmarco-document-v2.yaml
│   │   ├── msmarco-document.yaml
│   │   ├── msmarco-passage-v2.yaml
│   │   ├── msmarco-passage.yaml
│   │   ├── msmarco-qna.yaml
│   │   ├── nano-beir.yaml
│   │   ├── natural-questions.yaml
│   │   ├── neuclir.yaml
│   │   ├── neumarco.yaml
│   │   ├── nfcorpus.yaml
│   │   ├── nyt.yaml
│   │   ├── pmc.yaml
│   │   ├── sara.yaml
│   │   ├── touche-image.yaml
│   │   ├── touche.yaml
│   │   ├── trec-arabic.yaml
│   │   ├── trec-cast.yaml
│   │   ├── trec-fair.yaml
│   │   ├── trec-mandarin.yaml
│   │   ├── trec-robust04.yaml
│   │   ├── trec-spanish.yaml
│   │   ├── trec-tot-2025.yaml
│   │   ├── trec-tot.yaml
│   │   ├── tripclick.yaml
│   │   ├── tweets2013-ia.yaml
│   │   ├── vaswani.yaml
│   │   ├── wapo.yaml
│   │   ├── wikiclir.yaml
│   │   └── wikir.yaml
│   ├── etc/
│   │   ├── downloads.json
│   │   └── metadata.json
│   ├── formats/
│   │   ├── __init__.py
│   │   ├── argsme.py
│   │   ├── base.py
│   │   ├── clirmatrix.py
│   │   ├── csv_fmt.py
│   │   ├── extracted_cc.py
│   │   ├── jsonl.py
│   │   ├── ntcir.py
│   │   ├── touche.py
│   │   ├── touche_image.py
│   │   ├── trec.py
│   │   ├── tsv.py
│   │   └── webarc.py
│   ├── indices/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── cache_docstore.py
│   │   ├── clueweb_warc.py
│   │   ├── indexed_tsv_docstore.py
│   │   ├── lz4_pickle.py
│   │   ├── numpy_sorted_index.py
│   │   └── zpickle_docstore.py
│   ├── lazy_libs.py
│   ├── log.py
│   ├── util/
│   │   ├── __init__.py
│   │   ├── docs/
│   │   │   ├── __init__.py
│   │   │   ├── lazy.py
│   │   │   ├── multiple.py
│   │   │   └── subset.py
│   │   ├── download.py
│   │   ├── fileio.py
│   │   ├── hash.py
│   │   ├── html_parsing.py
│   │   ├── metadata.py
│   │   └── registry.py
│   └── wrappers/
│       ├── __init__.py
│       └── html_extractor.py
├── pyproject.toml
├── requirements-test.txt
├── requirements.txt
└── test/
    ├── __init__.py
    ├── downloads.py
    ├── dummy/
    │   ├── docs.tsv
    │   ├── qrels
    │   └── queries.tsv
    ├── formats/
    │   ├── __init__.py
    │   ├── test_trec.py
    │   └── test_tsv.py
    ├── indices/
    │   ├── __init__.py
    │   ├── lz4_pickle.py
    │   └── numpy_sorted.py
    ├── integration/
    │   ├── __init__.py
    │   ├── antique.py
    │   ├── aol_ia.py
    │   ├── aquaint.py
    │   ├── argsme.py
    │   ├── base.py
    │   ├── beir.py
    │   ├── c4.py
    │   ├── car.py
    │   ├── clinicaltrials.py
    │   ├── clirmatrix.py
    │   ├── clueweb09.py
    │   ├── clueweb12.py
    │   ├── codec.py
    │   ├── codesearchnet.py
    │   ├── cord19.py
    │   ├── cranfield.py
    │   ├── csl.py
    │   ├── disks45.py
    │   ├── dpr_w100.py
    │   ├── dummy.py
    │   ├── gov.py
    │   ├── gov2.py
    │   ├── hc4.py
    │   ├── highwire.py
    │   ├── istella22.py
    │   ├── kilt.py
    │   ├── lotte.py
    │   ├── medline.py
    │   ├── miracl.py
    │   ├── mmarco.py
    │   ├── mr_tydi.py
    │   ├── msmarco_document.py
    │   ├── msmarco_document_v2.py
    │   ├── msmarco_passage.py
    │   ├── msmarco_passage_v2.py
    │   ├── msmarco_qna.py
    │   ├── nano_beir.py
    │   ├── natural_questions.py
    │   ├── neuclir.py
    │   ├── neumarco.py
    │   ├── nfcorpus.py
    │   ├── nyt.py
    │   ├── pmc.py
    │   ├── sara.py
    │   ├── touche.py
    │   ├── touche_image.py
    │   ├── trec_arabic.py
    │   ├── trec_cast.py
    │   ├── trec_fair.py
    │   ├── trec_mandarin.py
    │   ├── trec_robust04.py
    │   ├── trec_spanish.py
    │   ├── trec_tot.py
    │   ├── trec_tot_2024.py
    │   ├── trec_tot_2025/
    │   │   ├── test_docs_iter.py
    │   │   ├── test_docs_store.py
    │   │   ├── test_qrel_iter.py
    │   │   └── test_queries_iter.py
    │   ├── tripclick.py
    │   ├── tweets2013_ia.py
    │   ├── vaswani.py
    │   ├── wapo.py
    │   ├── wikiclir.py
    │   └── wikir.py
    ├── metadata.py
    ├── test_defaulttext.py
    ├── util/
    │   └── docs/
    │       ├── __init__.py
    │       ├── data.py
    │       ├── test_multiple.py
    │       └── test_subset.py
    └── util.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Errors in behavior or functionality
title: ''
labels: bug
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**Affected dataset(s)**

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Additional context**
Add any other context about the problem here.


================================================
FILE: .github/ISSUE_TEMPLATE/dataset-addition.md
================================================
---
name: Dataset Addition
about: Propose adding a new dataset, collection of related datasets, or feature to
  existing dataset
title: ''
labels: add-dataset
assignees: ''

---

**Dataset Information:**

<brief description>

**Links to Resources:**

<links including data websites, repositories, papers, etc. that would help in adding the dataset.>

**Dataset ID(s) & supported entities:**

 - <propose dataset ID(s), and where they fit in the hierarchy, and specify which entity types each will provide (docs, queries, qrels, scoreddocs, docpairs, qlogs)>

**Checklist**

Mark each task once completed. All should be checked prior to merging a new dataset.
  
 - [ ] Dataset definition (in `ir_datasets/datasets/[topid].py`)
 - [ ] Tests (in `tests/integration/[topid].py`)
 - [ ] Metadata generated (using `ir_datasets generate_metadata` command, should appear in `ir_datasets/etc/metadata.json`)
 - [ ] Documentation (in `ir_datasets/etc/[topid].yaml`)
   - [ ] Documentation generated in https://github.com/seanmacavaney/ir-datasets.com/
 - [ ] Downloadable content (in `ir_datasets/etc/downloads.json`)
   - [ ] Download verification action (in `.github/workflows/verify_downloads.yml`). Only one needed per `topid`.
   - [ ] Any small public files from NIST (or other potentially troublesome files) mirrored in https://github.com/seanmacavaney/irds-mirror/. Mirrored status properly reflected in `downloads.json`.
  
**Additional comments/concerns/ideas/etc.**


================================================
FILE: .github/ISSUE_TEMPLATE/documentation.md
================================================
---
name: Documentation
about: Additions to or improvements to the documentation
title: ''
labels: documentation
assignees: ''

---

**Dataset(s)**


**Describe the proposed change**


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.


================================================
FILE: .github/workflows/deploy.yml
================================================
name: deploy

on:
  release:
    types: [created]

jobs:
  pypi:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2
    - uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: install-deps
      run: |
        python -m pip install --upgrade pip
        pip install build setuptools wheel twine
    - name: build
      run: |
        python -m build
    - name: upload
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        twine upload dist/*


================================================
FILE: .github/workflows/test.yml
================================================
name: test

on:
  push: {branches: [master]} # pushes to master
  pull_request: {} # all PRs

jobs:
  pytest:
    strategy:
      matrix:
        python-version: ['3.10', '3.12']
        os: ['ubuntu-latest', 'windows-latest', 'macos-latest']

    runs-on: ${{ matrix.os }}
    steps:

    - name: Checkout
      uses: actions/checkout@v4

    - name: Install Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install Dependencies
      run: |
        pip install --upgrade -r requirements.txt -r requirements-test.txt
        pip install -e '.[all]'

    - name: Unit Test
      if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'
      run: |
        pip install pytest
        pytest test/util.py test/metadata.py test/integration/dummy.py test/integration/vaswani.py test/formats/ test/test_defaulttext.py

    - name: Unit Test (Windows)
      if: matrix.os == 'windows-latest'
      shell: cmd
      run: |
        pip install pytest
        pytest test\util.py test\metadata.py test\integration\dummy.py test\integration\vaswani.py test\formats\ test\test_defaulttext.py
      env:
        PATH: 'C:/Program Files/zlib/bin/'


================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

.DS_Store


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MANIFEST.in
================================================
recursive-include ir_datasets *.yaml
recursive-include ir_datasets *.bib
recursive-include ir_datasets *.json


================================================
FILE: README.md
================================================
# ir_datasets

`ir_datasets` is a python package that provides a common interface to many IR ad-hoc ranking
benchmarks, training datasets, etc.

The package takes care of downloading datasets (including documents, queries, relevance judgments,
etc.) when available from public sources. Instructions on how to obtain datasets are provided when
they are not publicly available.

`ir_datasets` provides a common iterator format to allow them to be easily used in python. It
attempts to provide the data in an unaltered form (i.e., keeping all fields and markup), while
handling differences in file formats, encoding, etc. Adapters provide extra functionality, e.g., to
allow quick lookups of documents by ID.

A command line interface is also available.

You can find a list of datasets and their features [here](https://ir-datasets.com/).
Want a new dataset, added functionality, or a bug fixed? Feel free to post an issue or make a pull request! 

## Getting Started

For a quick start with the Python API, check out our Colab tutorials:
[Python](https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets.ipynb)
[Command Line](https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets_cli.ipynb)

Install via pip:

```
pip install ir_datasets
```

If you want the main branch, you install as such:

```
pip install git+https://github.com/allenai/ir_datasets.git
```

If you want to run an editable version locally:

```
$ git clone https://github.com/allenai/ir_datasets
$ cd ir_datasets
$ pip install -e .    
```

Tested with python versions 3.7, 3.8, 3.9, and 3.10. (Minimum python version is 3.7.)

## Features

**Python and Command Line Interfaces**. Access datasets both through a simple Python API and
via the command line.

```python
import ir_datasets
dataset = ir_datasets.load('msmarco-passage/train')
# Documents
for doc in dataset.docs_iter():
    print(doc)
# GenericDoc(doc_id='0', text='The presence of communication amid scientific minds was equa...
# GenericDoc(doc_id='1', text='The Manhattan Project and its atomic bomb helped bring an en...
# ...
```

```bash
ir_datasets export msmarco-passage/train docs | head -n2
0 The presence of communication amid scientific minds was equally important to the success of the Manh...
1 The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peacefu...
```

**Automatically downloads source files** (when available). Will download and verify the source
files for queries, documents, qrels, etc. when they are publicly available, as they are needed.
A CI build checks weekly to ensure that all the downloadable content is available and correct:
[![Downloadable Content](https://github.com/seanmacavaney/ir-datasets.com/actions/workflows/verify_downloads.yml/badge.svg)](https://github.com/seanmacavaney/ir-datasets.com/actions/workflows/verify_downloads.yml).
We mirror some troublesome files on [mirror.ir-datasets.com](https://mirror.ir-datasets.com/), and
automatically switch to the mirror when the original source is not available.

```python
import ir_datasets
dataset = ir_datasets.load('msmarco-passage/train')
for doc in dataset.docs_iter(): # Will download and extract MS-MARCO's collection.tar.gz the first time
    ...
for query in dataset.queries_iter(): # Will download and extract MS-MARCO's queries.tar.gz the first time
    ...
```

**Instructions for dataset access** (when not publicly available). Provides instructions on how
to get a copy of the data when it is not publicly available online (e.g., when it requires a
data usage agreement).

```python
import ir_datasets
dataset = ir_datasets.load('trec-arabic')
for doc in dataset.docs_iter():
    ...
# Provides the following instructions:
# The dataset is based on the Arabic Newswire corpus. It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2001T55>
# To proceed, symlink the source file here: [gives path]
```

**Support for datasets big and small**. By using iterators, supports large datasets that may
not fit into system memory, such as ClueWeb.

```python
import ir_datasets
dataset = ir_datasets.load('clueweb09')
for doc in dataset.docs_iter():
    ... # will iterate through all ~1B documents
```

**Fixes known dataset issues**. For instance, automatically corrects the document UTF-8 encoding
problem in the MS-MARCO passage collection.

```python
import ir_datasets
dataset = ir_datasets.load('msmarco-passage')
docstore = dataset.docs_store()
docstore.get('243').text
# "John Maynard Keynes, 1st Baron Keynes, CB, FBA (/ˈkeɪnz/ KAYNZ; 5 June 1883 – 21 April [SNIP]"
# Naïve UTF-8 decoding yields double-encoding artifacts like:
# "John Maynard Keynes, 1st Baron Keynes, CB, FBA (/Ë\x88keɪnz/ KAYNZ; 5 June 1883 â\x80\x93 21 April [SNIP]"
#                                                  ~~~~~~  ~~                       ~~~~~~~~~
```

**Fast Random Document Access.** Builds data structures that allow fast and efficient lookup of
document content. For large datasets, such as ClueWeb, uses
[checkpoint files](https://ir-datasets.com/clueweb_warc_checkpoints.md) to load documents from
source 40x faster than normal. Results are cached for even faster subsequent accesses.

```python
import ir_datasets
dataset = ir_datasets.load('clueweb12')
docstore = dataset.docs_store()
docstore.get_many(['clueweb12-0000tw-05-00014', 'clueweb12-0000tw-05-12119', 'clueweb12-0106wb-18-19516'])
# {'clueweb12-0000tw-05-00014': ..., 'clueweb12-0000tw-05-12119': ..., 'clueweb12-0106wb-18-19516': ...}
```

**Fancy Iter Slicing.** Sometimes it's helpful to be able to select ranges of data (e.g., for processing
document collections in parallel on multiple devices). Efficient implementations of slicing operations
allow for much faster dataset partitioning than using `itertools.islice`.

```python
import ir_datasets
dataset = ir_datasets.load('clueweb12')
dataset.docs_iter()[500:1000] # normal slicing behavior
# WarcDoc(doc_id='clueweb12-0000tw-00-00502', ...), WarcDoc(doc_id='clueweb12-0000tw-00-00503', ...), ...
dataset.docs_iter()[-10:-8] # includes negative indexing
# WarcDoc(doc_id='clueweb12-1914wb-28-24245', ...), WarcDoc(doc_id='clueweb12-1914wb-28-24246', ...)
dataset.docs_iter()[::100] # includes support for skip (only positive values)
# WarcDoc(doc_id='clueweb12-0000tw-00-00000', ...), WarcDoc(doc_id='clueweb12-0000tw-00-00100', ...), ...
dataset.docs_iter()[1/3:2/3] # supports proportional slicing (this takes the middle third of the collection)
# WarcDoc(doc_id='clueweb12-0605wb-28-12714', ...), WarcDoc(doc_id='clueweb12-0605wb-28-12715', ...), ...
```

## Datasets

Available datasets include:
 - [ANTIQUE](https://ir-datasets.com/antique.html)
 - [AQUAINT](https://ir-datasets.com/aquaint.html)
 - [BEIR (benchmark suite)](https://ir-datasets.com/beir.html)
 - [TREC CAR](https://ir-datasets.com/car.html)
 - [C4](https://ir-datasets.com/c4.html)
 - [ClueWeb09](https://ir-datasets.com/clueweb09.html)
 - [ClueWeb12](https://ir-datasets.com/clueweb12.html)
 - [CLIRMatrix](https://ir-datasets.com/clirmatrix.html)
 - [CodeSearchNet](https://ir-datasets.com/codesearchnet.html)
 - [CORD-19](https://ir-datasets.com/cord19.html)
 - [DPR Wiki100](https://ir-datasets.com/dpr-w100.html)
 - [GOV](https://ir-datasets.com/gov.html)
 - [GOV2](https://ir-datasets.com/gov2.html)
 - [HC4](https://ir-datasets.com/hc4.html)
 - [Highwire (TREC Genomics 2006-07)](https://ir-datasets.com/highwire.html)
 - [Medline](https://ir-datasets.com/medline.html)
 - [MSMARCO (document)](https://ir-datasets.com/msmarco-document.html)
 - [MSMARCO (passage)](https://ir-datasets.com/msmarco-passage.html)
 - [MSMARCO (QnA)](https://ir-datasets.com/msmarco-qna.html)
 - [Natural Questions](https://ir-datasets.com/natural-questions.html)
 - [NFCorpus (NutritionFacts)](https://ir-datasets.com/nfcorpus.html)
 - [NYT](https://ir-datasets.com/nyt.html)
 - [PubMed Central (TREC CDS)](https://ir-datasets.com/pmc.html)
 - [TREC Arabic](https://ir-datasets.com/trec-arabic.html)
 - [TREC Fair Ranking 2021](https://ir-datasets.com/trec-fair-2021.html)
 - [TREC Mandarin](https://ir-datasets.com/trec-mandarin.html)
 - [TREC Robust 2004](https://ir-datasets.com/trec-robust04.html)
 - [TREC Spanish](https://ir-datasets.com/trec-spanish.html)
 - [TripClick](https://ir-datasets.com/tripclick.html)
 - [Tweets 2013 (Internet Archive)](https://ir-datasets.com/tweets2013-ia.html)
 - [Vaswani](https://ir-datasets.com/vaswani.html)
 - [Washington Post](https://ir-datasets.com/wapo.html)
 - [WikIR](https://ir-datasets.com/wikir.html)

There are "subsets" under each dataset. For instance, `clueweb12/b13/trec-misinfo-2019` provides the
queries and judgments from the [2019 TREC misinformation track](https://trec.nist.gov/data/misinfo2019.html),
and `msmarco-document/orcas` provides the [ORCAS dataset](https://microsoft.github.io/msmarco/ORCAS). They
tend to be organized with the document collection at the top level.

See the ir_datasets docs ([ir-datasets.com](https://ir-datasets.com/)) for details about each
dataset, its available subsets, and what data they provide.

## Environment variables

 - `IR_DATASETS_HOME`: Home directory for ir_datasets data (default `~/.ir_datasets/`). Contains directories
   for each top-level dataset.
 - `IR_DATASETS_TMP`: Temporary working directory (default `/tmp/ir_datasets/`).
 - `IR_DATASETS_DL_TIMEOUT`: Download stream read timeout, in seconds (default `15`). If no data is received
   within this duration, the connection will be assumed to be dead, and another download may be attempted.
 - `IR_DATASETS_DL_TRIES`: Default number of download attempts before exception is thrown (default `3`).
   When the server accepts Range requests, uses them. Otherwise, will download the entire file again.
 - `IR_DATASETS_DL_DISABLE_PBAR`: Set to `true` to disable the progress bar for downloads. Useful in settings
   where an interactive console is not available.
 - `IR_DATASETS_DL_SKIP_SSL`: Set to `true` to disable checking SSL certificates when downloading files.
   Useful as a short-term solution when SSL certificates expire or are otherwise invalid. Note that this
   does not disable hash verification of the downloaded content.
 - `IR_DATASETS_SKIP_DISK_FREE`: Set to `true` to disable checks for enough free space on disk before
   downloading content or otherwise creating large files.
 - `IR_DATASETS_SMALL_FILE_SIZE`: The size of files that are considered "small", in bytes. Instructions for
   linking small files rather than downloading them are not shown. Defaults to 5000000 (5MB).

## Citing

When using datasets provided by this package, be sure to properly cite them. Bibtex for each dataset
can be found on the [datasets documentation page](https://ir-datasets.com/).

If you use this tool, please cite [our SIGIR resource paper](https://arxiv.org/pdf/2103.02280.pdf):

```
@inproceedings{macavaney:sigir2021-irds,
  author = {MacAvaney, Sean and Yates, Andrew and Feldman, Sergey and Downey, Doug and Cohan, Arman and Goharian, Nazli},
  title = {Simplified Data Wrangling with ir_datasets},
  year = {2021},
  booktitle = {SIGIR}
}
```

## Credits

Contributors to this repository:

 - Sean MacAvaney (University of Glasgow)
 - Shuo Sun (Johns Hopkins University)
 - Thomas Jänich (University of Glasgow)
 - Jan Heinrich Reimer (Martin Luther University Halle-Wittenberg)
 - Maik Fröbe (Martin Luther University Halle-Wittenberg)
 - Eugene Yang (Johns Hopkins University)
 - Augustin Godinot (NAVERLABS Europe, ENS Paris-Saclay)


================================================
FILE: examples/adding_datasets.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ir_datasets - Adding Datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This tutorial covers the process for adding a new dataset to the `ir_datasets` package.\n",
    "\n",
    "This tutorial is for datasets that are intended to be added to the main package. For an example of an extension, see [this example extension](https://github.com/seanmacavaney/dummy-irds-ext).\n",
    "\n",
    "Before starting, we recommend [opening an issue](https://github.com/allenai/ir_datasets/issues/new/choose) so various decisions about how to support the dataset can be discussed.\n",
    "\n",
    "There are four files involved in adding a dataset to the `ir_datasets` package:\n",
    " - `ir_datasets/datasets/[dataset-id].py` - Contains the definition of the dataset and any specialized code for handling it.\n",
    " - `ir_datasets/etc/downloads.json` - Contains information about how to download and verify dataset source files.\n",
    " - `ir_datasets/docs/[dataset-id].yaml` - Contains documentation of the dataset.\n",
    " - `test/integration/[dataset-id].py` - Contains automated tests to ensure the dataset is processed as expected.\n",
    " \n",
    "We will now show examples of each of these files for a toy dataset called `dummy`, with files hosted here: https://github.com/seanmacavaney/dummy-irds-ext/tree/master/data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "File: `ir_datasets/datasets/dummy.py`\n",
    "\n",
    "```python\n",
    "import ir_datasets\n",
    "from ir_datasets.formats import TsvDocs, TsvQueries, TrecQrels\n",
    "from ir_datasets.util import DownloadConfig\n",
    "from ir_datasets.datasets.base import Dataset, YamlDocumentation\n",
    "\n",
    "# A unique identifier for this dataset. This should match the file name (with \"-\" instead of \"_\")\n",
    "NAME = 'dummy'\n",
    "\n",
    "# What do the relevance levels in qrels mean?\n",
    "QREL_DEFS = {\n",
    "    1: 'relevant',\n",
    "    0: 'not relevant',\n",
    "}\n",
    "\n",
    "# This message is shown to the user before downloads are started\n",
    "DUA = 'Please confirm that you agree to the data usage agreement at <https://some-url/>'\n",
    "\n",
    "# An initialization function is used to keep the namespace clean\n",
    "def _init():\n",
    "    # The directory where this dataset's data files will be stored\n",
    "    base_path = ir_datasets.util.home_path() / NAME\n",
    "    \n",
    "    # Load an object that is used for providing the documentation\n",
    "    documentation = YamlDocumentation(f'docs/{NAME}.yaml')\n",
    "    \n",
    "    # A reference to the downloads file, under the key \"dummy\". (DLC stands for DownLoadable Content)\n",
    "    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)\n",
    "    \n",
    "    # How to process the documents. Since they are in a typical TSV format, we'll use TsvDocs.\n",
    "    # Note that other dataset formats may require you to write a custom docs handler (BaseDocs).\n",
    "    # Note that this doesn't process the documents now; it just defines how they are processed.\n",
    "    docs = TsvDocs(dlc['docs'], namespace=NAME, lang='en')\n",
    "    \n",
    "    # How to process the queries. Similar to the documents, you may need to write a custom\n",
    "    # queries handler (BaseQueries).\n",
    "    queries = TsvQueries(dlc['queries'], namespace=NAME, lang='en')\n",
    "    \n",
    "    # Qrels: The qrels file is in the TREC format, so we'll use TrecQrels to process them\n",
    "    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)\n",
    "    \n",
    "    # Package the docs, queries, qrels, and documentation into a Dataset object\n",
    "    dataset = Dataset(docs, queries, qrels, documentation('_'))\n",
    "    \n",
    "    # Register the dataset in ir_datasets\n",
    "    ir_datasets.registry.register(NAME, dataset)\n",
    "    \n",
    "    return dataset # used for exposing dataset to the namespace\n",
    "\n",
    "dataset = _init()\n",
    "```\n",
    "\n",
    "Note that you also need to add this file to `ir_datasets/datasets/__init__.py`:\n",
    "\n",
    "```python\n",
    "from . import dummy\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "File: `ir_datasets/etc/downloads.json`\n",
    "\n",
    "(add lines like these to the file)\n",
    "\n",
    "```json\n",
    "\"dummy\": {\n",
    "  \"docs\": {\n",
    "    \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv\",\n",
    "    \"expected_md5\": \"c7bb5a1a3a07d51de50e8414245c2be4\",\n",
    "    \"cache_path\": \"docs.tsv\"\n",
    "  },\n",
    "  \"queries\": {\n",
    "    \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv\",\n",
    "    \"expected_md5\": \"08ba86d990cbe6890f727946346964db\",\n",
    "    \"cache_path\": \"queries.tsv\"\n",
    "  },\n",
    "  \"qrels\": {\n",
    "    \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels\",\n",
    "    \"expected_md5\": \"79ed359fe0afa0f67eb39f468d162920\",\n",
    "    \"cache_path\": \"qrels\"\n",
    "  }\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "File: `ir_datasets/docs/dummy.yaml`\n",
    "\n",
    "```yaml\n",
    "_: # matches documentation key above\n",
    "  pretty_name: 'Dummy' # a more human-readable way to present this dataset than the dataset-id\n",
    "  desc: '\n",
    "<p>\n",
    "HTML-encoded and human-readable information about this dataset.\n",
    "Include a brief description of the dataset.\n",
    "Be sure to include important decisions made when processing it.\n",
    "Also, link to more information, e.g. websites, papers, etc.\n",
    "</p>\n",
    "<ul>\n",
    "  <li><a href=\"https://github.com/seanmacavaney/dummy-irds-ext\">Link to the source</a></li>\n",
    "</ul>' \n",
    "  bibtex: |\n",
    "    @misc{dummy,\n",
    "      title={Dummy: a made-up dataset},\n",
    "      year={2021}\n",
    "    }\n",
    "```\n",
    "\n",
    "To generate the HTML documentation files, run `python -m ir_datasets documentation`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "File: `test/integration/dummy.py`\n",
    "\n",
    "```python\n",
    "from ir_datasets.formats import GenericQuery, GenericDoc, TrecQrel\n",
    "from .base import DatasetIntegrationTest\n",
    "\n",
    "class TestDummy(DatasetIntegrationTest):\n",
    "    def test_docs(self):\n",
    "        # Test that the dataset 'dummy' has 15 documents, and test the specific docs at indices 0, 9, and 14\n",
    "        self._test_docs('dummy', count=15, items={\n",
    "            0: GenericDoc('T1', 'CUT, CAP AND BALANCE. TAXED ENOUGH ALREADY!'),\n",
    "            9: GenericDoc('T10', 'Perhaps this is the kind of thinking we need in Washington ...'),\n",
    "            14: GenericDoc('T15', \"I've been visiting Trump Int'l Golf Links Scotland and the course will be unmatched anywhere in the world. Spectacular!\"),\n",
    "        })\n",
    "\n",
    "    def test_queries(self):\n",
    "        # Test that the dataset 'dummy' has 4 queries, and test the specific queries at indices 0 and 3\n",
    "        self._test_queries('dummy', count=4, items={\n",
    "            0: GenericQuery('1', 'republican party'),\n",
    "            3: GenericQuery('4', 'media'),\n",
    "        })\n",
    "\n",
    "    def test_qrels(self):\n",
    "        # Test that the dataset 'dummy' has 60 qrels, and test the specific qrels at indices 0, 9, and 59\n",
    "        self._test_qrels('dummy', count=60, items={\n",
    "            0: TrecQrel('1', 'T1', 0, '0'),\n",
    "            9: TrecQrel('1', 'T10', 0, '0'),\n",
    "            59: TrecQrel('4', 'T15', 0, '0'),\n",
    "        })\n",
    "```\n",
    "\n",
    "Note that within a DatasetIntegrationTest, you can use `self._build_test_docs('dummy')`, `self._build_test_queries('dummy')`, `self._build_test_qrels('dummy')` to generate sample test cases. But be sure to check that the tests they generate are properly processed, and feel free to add additional test cases, especially to test dataset-specific handlers."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: examples/clirmatrix_example.py
================================================
"""Examples of loading CLIRMatrix datasets with ir_datasets.

dataset name
clirmatrix/[query language code]/dataset/[doc language code]/[split]

options:
--------
    dataset: bi139-base/bi139-full/multi8
    supported query/doc language codes:
        bi139-base/bi139-full: ['af', 'als', 'am', 'an', 'ar', 'arz', 'ast', 'az', 'azb', 'ba', 'bar', 'be', 'bg', 'bn', 'bpy', 'br', 'bs', 'bug', 'ca', 'cdo', 'ce', 'ceb', 'ckb', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'he', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ilo', 'io', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'li', 'lmo', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'my', 'mzn', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc', 'or', 'os', 'pa', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'ro', 'ru', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', 'tg', 'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vec', 'vi', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zh']
        multi8: ['ar', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'zh']

    split: train/dev/test1/test2
"""
# NOTE: the docstring must precede the import to actually serve as the module
# docstring; in the original it followed the import and was a dead expression.
import ir_datasets


def _show_first_items(dataset_id):
    """Load *dataset_id* and print the doc of its first qrel and its first query.

    Triggers downloads on first use (network I/O). Kept as a helper so the
    same demo pattern is not copy-pasted once per dataset.
    """
    dataset = ir_datasets.load(dataset_id)
    docstore = dataset.docs_store()

    # Print the document referenced by the first qrel, then stop.
    for qrel in dataset.qrels_iter():
        print(docstore.get(qrel.doc_id))
        break

    # Print the first query, then stop.
    for query in dataset.queries_iter():
        print(query)
        break


# examples
# reference python notebook: https://colab.research.google.com/github/allenai/ir_datasets/blob/master/examples/ir_datasets.ipynb#scrollTo=n7mY16MRH0hx
_show_first_items("clirmatrix/en/bi139-base/zh/test1")
_show_first_items("clirmatrix/en/multi8/zh/train")
_show_first_items("clirmatrix/an/bi139-full/zh/dev")


================================================
FILE: examples/ir_datasets.ipynb
================================================
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ir-datasets.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "snL2s_xoHpph"
      },
      "source": [
        "# ir_datasets - Tutorial"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n7mY16MRH0hx"
      },
      "source": [
        "## Getting Started\n",
        "\n",
        "We'll start out by installing the package. The package is available on pypi,\n",
        "so you can install it with your favorite package manager."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "M_6mg0PbHaFD",
        "outputId": "0764869d-bb51-4a9e-edb2-35c9cf56a876"
      },
      "source": [
        "!pip install ir_datasets"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: ir_datasets in /usr/local/lib/python3.6/dist-packages (0.2.0)\n",
            "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (5.4.1)\n",
            "Requirement already satisfied: trec-car-tools>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (2.5.3)\n",
            "Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.1.3)\n",
            "Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (2.23.0)\n",
            "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (1.19.5)\n",
            "Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.6.2)\n",
            "Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.41.1)\n",
            "Requirement already satisfied: lz4>=3.1.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (3.1.3)\n",
            "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (4.6.3)\n",
            "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.2.5)\n",
            "Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.6/dist-packages (from ir_datasets) (0.2.3)\n",
            "Requirement already satisfied: typing>=3.6.2 in /usr/local/lib/python3.6/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (3.7.4.3)\n",
            "Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (1.0.0)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (1.24.3)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (3.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (2020.12.5)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.22.0->ir_datasets) (2.10)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DH_aBA7hIDZ4"
      },
      "source": [
        "You can now load up your favorite dataset. You can find the full listing of datasets [here](https://ir-datasets.com/all.html). Here's an example for `cord19/trec-covid`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dFIuPyqdHVQ0"
      },
      "source": [
        "import ir_datasets\n",
        "dataset = ir_datasets.load('cord19/trec-covid')"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ILomHf8CIdOf"
      },
      "source": [
        "## Documents\n",
        "\n",
        "`doc` entities map a `doc_id` to one or more text fields.\n",
        "\n",
        "Let's see how many documents are in this collection. The first time you run this command, it will need to download and process the collection, which may take some time:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "v3rCW-JUHpFz",
        "outputId": "c2cba6ee-3f55-4369-de41-17972b570ad8"
      },
      "source": [
        "dataset.docs_count()"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "192509"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bd2f31HzI2s5"
      },
      "source": [
        "Now let's see some documents. You can iterate through the documents in the collection using `docs_iter`. Since there are so many, we'll just look at the top 10:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "odfCkvALHXzz",
        "outputId": "3f4241e6-7828-4fc1-d18b-9610b7874eec"
      },
      "source": [
        "for doc in dataset.docs_iter()[:10]:\n",
        "  print(doc)"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. 
CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.')\n",
            "Cord19Doc(doc_id='02tnwd4m', title='Nitric oxide: a pro-inflammatory mediator in lung disease?', doi='10.1186/rr14', date='2000-08-15', abstract='Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung.')\n",
            "Cord19Doc(doc_id='ejv2xln0', title='Surfactant protein-D and pulmonary host defense', doi='10.1186/rr19', date='2000-08-25', abstract='Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.')\n",
            "Cord19Doc(doc_id='2b73a28n', title='Role of endothelin-1 in lung disease', doi='10.1186/rr44', date='2001-02-22', abstract='Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.')\n",
            "Cord19Doc(doc_id='9785vg6d', title='Gene expression in epithelial cells in response to pneumovirus infection', doi='10.1186/rr61', date='2001-05-11', abstract='Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.')\n",
            "Cord19Doc(doc_id='zjufx4fo', title='Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis', doi='10.1093/emboj/20.24.7220', date='2001-12-17', abstract='Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to different sequences (‘bodies’) derived from the 3′ end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader–body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.')\n",
            "Cord19Doc(doc_id='5yhe786e', title='Debate: Transfusing to normal haemoglobin levels will not improve outcome', doi='10.1186/cc987', date='2001-03-08', abstract='Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.')\n",
            "Cord19Doc(doc_id='8zchiykl', title='The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001', doi='10.1186/cc1013', date='2001-05-02', abstract=\"The 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\")\n",
            "Cord19Doc(doc_id='8qnrcgnk', title='Heme oxygenase-1 and carbon monoxide in pulmonary medicine', doi='10.1186/1465-9921-4-7', date='2003-08-07', abstract='Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.')\n",
            "Cord19Doc(doc_id='jg13scgo', title='Technical Description of RODS: A Real-time Public Health Surveillance System', doi='10.1197/jamia.m1345', date='2003-09-01', abstract='This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states—Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lUB0AUfWJESJ"
      },
      "source": [
        "You can see each document is represented as a `Cord19Doc`, which is a `namedtuple`. Named tuples are a light-weight data structure that consists of a pre-defined sequence of named fields.\n",
        "\n",
        "If you want more information about what document fields are available in this collection, you can\n",
        "[check the documentation](https://ir-datasets.com/cord19.html#cord19) or inspect the dataset's `docs_cls()`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Cej2STMCI_eh",
        "outputId": "55e06f14-390f-4dce-9ba2-576a33c50b6c"
      },
      "source": [
        "dataset.docs_cls()"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "ir_datasets.datasets.cord19.Cord19Doc"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "p2BxO7-vJWg7",
        "outputId": "ea763b20-59e6-4d97-b28b-0f7bbabb2f65"
      },
      "source": [
        "dataset.docs_cls()._fields"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('doc_id', 'title', 'doi', 'date', 'abstract')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2ORa7nztJXyq",
        "outputId": "a2e836be-ebe4-4f71-f272-2536961ef271"
      },
      "source": [
        "dataset.docs_cls().__annotations__"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "OrderedDict([('doc_id', str),\n",
              "             ('title', str),\n",
              "             ('doi', str),\n",
              "             ('date', str),\n",
              "             ('abstract', str)])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jwcOyKP5Juct"
      },
      "source": [
        "Did you notice the `[:10]` above? We can do all sorts of fancy slicing on document iterators. Here, we select every other document from the top 10:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VDYPMpOVJZmM",
        "outputId": "402d5201-ceb9-4bcb-a985-e0052d650994"
      },
      "source": [
        "for doc in dataset.docs_iter()[:10:2]:\n",
        "  print(doc)"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. 
CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.')\n",
            "Cord19Doc(doc_id='ejv2xln0', title='Surfactant protein-D and pulmonary host defense', doi='10.1186/rr19', date='2000-08-25', abstract='Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.')\n",
            "Cord19Doc(doc_id='9785vg6d', title='Gene expression in epithelial cells in response to pneumovirus infection', doi='10.1186/rr61', date='2001-05-11', abstract='Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.')\n",
            "Cord19Doc(doc_id='5yhe786e', title='Debate: Transfusing to normal haemoglobin levels will not improve outcome', doi='10.1186/cc987', date='2001-03-08', abstract='Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.')\n",
            "Cord19Doc(doc_id='8qnrcgnk', title='Heme oxygenase-1 and carbon monoxide in pulmonary medicine', doi='10.1186/1465-9921-4-7', date='2003-08-07', abstract='Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iizcVwqmJ-TW"
      },
      "source": [
        "Or the last 10 documents:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "iVOvbOGOJ4A6",
        "outputId": "ca1daed3-3394-472f-8bbd-60a3e2faf0a3"
      },
      "source": [
        "for doc in dataset.docs_iter()[-10:]:\n",
        "  print(doc)"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cord19Doc(doc_id='7e8r61e7', title='Can Pediatric COVID-19 Testing Sensitivity Be Improved With Sequential Tests?', doi='10.1213/ane.0000000000004982', date='2020-05-26', abstract='')\n",
            "Cord19Doc(doc_id='r3ud8t8w', title='rAre graphene and graphene-derived products capable of preventing COVID-19 infection?', doi='10.1016/j.mehy.2020.110031', date='2020-06-24', abstract=\"The Severe Acute Respiratory Syndrome CoronaVirus 2 (SARS-CoV-2) causes the new coronavirus disease 2019 (COVID-19). This disease is a severe respiratory tract infection that spread rapidly around the world. In this pandemic situation, the researchers' effort is to understand the targets of the virus, mechanism of their cause, and transmission from animal to human and vice-versa. Therefore, to support COVID-19 research and development, we have proposed approaches based on graphene and graphene-derived nanomaterials against COVID-19.\")\n",
            "Cord19Doc(doc_id='6jittbis', title='Heterogeneity and plasticity of porcine alveolar macrophage and pulmonary interstitial macrophage isolated from healthy pigs in vitro', doi='10.1242/bio.046342', date='2019-10-15', abstract='This study investigated the heterogeneity and plasticity of porcine alveolar macrophages (PAM) and pulmonary interstitial macrophages (IM) isolated from healthy pigs, including phenotype, function and gene expression. Dynamic changes of nitric oxide (NO) levels secreted by PAM and IM with stimulation of different doses of lipopolysaccharide (LPS) were investigated by Griess method, and the viability of the PAM and IM cells was investigated by MTT assay. Flow cytometry, fluorescence quantitative PCR and ELISA techniques were used to measure cell phenotype, gene expression and cytokine secretion, respectively. The PAM and IM cells in normal healthy pigs showed heterogeneity with 95.42±1.51% and 31.99±5.84% of CD163+ macrophage, respectively. The NO level in IM was significantly higher versus PAM after LPS treatment. Consistently, the ratio of Arg I/iNOS in IM was much lower than that in PAM, suggesting that the PAM belong to M2 macrophages and the IM belong to M1 macrophages. The PAM and IM cells in normal healthy pigs also showed plasticity. The Arg I/iNOS ratio and TIMP1/MMP12 ratio were significantly decreased in LPS- or LPS+IFNγ-treated PAM and IM, suggesting that cells were polarized towards M1 macrophages under LPS or LPS+IFNγ stimulation. On the contrary, IL-4 and IL-13 stimulation on PAM and IM lead to M2 polarization. A similar result was found in IL-1β gene expression and TNFα secretion. In conclusion, porcine macrophages have shown heterogeneity and plasticity on polarization under the stimulation of LPS, IFNγ, IL-4 and IL-13.')\n",
            "Cord19Doc(doc_id='kaku49xd', title='Review of Current Advances in Serologic Testing for COVID-19', doi='10.1093/ajcp/aqaa112', date='2020-06-25', abstract='OBJECTIVES: To examine and summarize the current literature on serologic methods for the detection of antibodies to severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). METHODS: A literature review was performed using searches in databases including PubMed, medRxiv, and bioRxiv. Thirty-two peer-reviewed papers and 23 preprints were examined. RESULTS: The studies included lateral flow immunoassay, enzyme-linked immunosorbent assay, chemiluminescence immunoassay, and neutralizing antibody assays. The use of all major SARS-CoV-2 antigens was demonstrated to have diagnostic value. Assays measuring total antibody reactivity had the highest sensitivity. In addition, all the methods provided opportunities to characterize the humoral immune response by isotype. The combined use of IgM and IgG detection resulted in a higher sensitivity than that observed when detecting either isotype alone. Although IgA was rarely studied, it was also demonstrated to be a sensitive marker of infection, and levels correlated with disease severity and neutralizing activity. CONCLUSIONS: The use of serologic testing, in conjunction with reverse transcription polymerase chain reaction testing, was demonstrated to significantly increase the sensitivity of detection of patients infected with SARS-CoV-2. There was conflicting evidence regarding whether antibody titers correlated with clinical severity. However, preliminary investigations indicated some immunoassays may be a surrogate for the prediction of neutralizing antibody titers and the selection of recovered patients for convalescent serum donation.')\n",
            "Cord19Doc(doc_id='ni94qi4r', title='Liver tests abnormalities in COVID-19: trick or treat?', doi='10.1016/j.jhep.2020.05.033', date='2020-05-27', abstract='')\n",
            "Cord19Doc(doc_id='z4ro6lmh', title='Rapid radiological improvement of COVID-19 pneumonia after treatment with tocilizumab', doi='10.1007/s15010-020-01449-w', date='2020-06-15', abstract='')\n",
            "Cord19Doc(doc_id='hi8k8wvb', title='SARS E protein in phospholipid bilayers: an anomalous X-ray reflectivity study', doi='10.1016/j.physb.2004.11.015', date='2005-02-28', abstract='Abstract We report on an anomalous X-ray reflectivity study to locate a labelled residue of a membrane protein with respect to the lipid bilayer. From such experiments, important constraints on the protein or peptide conformation can be derived. Specifically, our aim is to localize an iodine-labelled phenylalanine in the SARS E protein, incorporated in DMPC phospholipid bilayers, which are deposited in the form of thick multilamellar stacks on silicon surfaces. Here, we discuss the experimental aspects and the difficulties associated with the Fourier synthesis analysis that gives the electron density profile of the membranes.')\n",
            "Cord19Doc(doc_id='ma3ndg41', title='Italian Society of Interventional Cardiology (GISE) position paper for Cath lab‐specific preparedness recommendations for healthcare providers in case of suspected, probable or confirmed cases of COVID‐19', doi='10.1002/ccd.28888', date='2020-04-11', abstract='COVID‐19 pandemic raised the issue to guarantee the proper level of care to patients with acute cardiovascular diseases and concomitant suspected or confirmed COVID‐19 and, in the meantime safety and protection of healthcare providers. The aim of this position paper is to provide standards to healthcare facilities and healthcare providers on infection prevention and control measures during the management of suspected and confirmed cases of 2019‐nCoV infection accessing in cath‐lab. The document represents the view of the Italian Society of Interventional Cardiology (GISE), and it is based on recommendations from the main World and European Health Organizations (WHO, and ECDC) as well as from the Italian Society of Anesthesia, Analgesia, Resuscitation and Intensive Care (SIAARTI).')\n",
            "Cord19Doc(doc_id='wh10285j', title=\"Nimble, Together: A Training Program's Response to the COVID-19 Pandemic\", doi='10.1097/sla.0000000000003994', date='2020-04-29', abstract='')\n",
            "Cord19Doc(doc_id='pnl9th2c', title='Vascular Life during the COVID-19 Pandemic Reminds Us to Prepare for the Unexpected', doi='10.1016/j.ejvs.2020.04.040', date='2020-05-12', abstract='')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nm8hUpVWKOwM"
      },
      "source": [
        "You can also select by percentages, e.g., `[:1/3]` selects the first third, `[1/3:2/3]` selects the second third, and `[2/3:]` selects the final third. This is handy when splitting document processing across processes, machines, or GPUs.\n",
        "\n",
        "These slices are smart: they avoid processing each document in the collection and jump to the right position in the source files to process."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kZjhG-5XKqPR"
      },
      "source": [
        "Now let's say you know a document's ID and want to find its text. You can use `docs_store()` to accomplish this."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hXp1nxooJ6uP",
        "outputId": "fc52d452-754c-42fd-ae3f-2f37364c4462"
      },
      "source": [
        "docstore = dataset.docs_store()\n",
        "docstore.get('3wuh6k6g')"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "m7IN1f9_LMS1"
      },
      "source": [
        "Or, a list of IDs. Maybe you're re-ranking these documents."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oXvKBt6-LPtS",
        "outputId": "733320e0-2762-44ba-ce4c-226295d7878d"
      },
      "source": [
        "docstore.get_many(['ax6v6ham', '44l5q07k', '8xm0kacj', '3wuh6k6g', 'fiievwy7'])"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'3wuh6k6g': Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining'),\n",
              " '44l5q07k': Cord19Doc(doc_id='44l5q07k', title='Rôle des animaux vertébrés dans l’épidémiologie des zoonoses', doi='10.1016/s1773-035x(15)30110-6', date='2015-05-31', abstract='Résumé Les zoonoses, distinguées ici des maladies humaines d’origine animale, représentent un ensemble d’entités pathologiques dont les agents responsables circulent régulièrement entre l’espèce humaine et de nombreuses espèces de vertébrés. L’analyse de divers exemples, quelles que soient les voies de transmission et les causes favorisantes de la contamination, met en avant une régulière rareté du passage direct du réservoir animal vers l’espèce humaine, à opposer à la diversité et surtout à la gravité possible des évolutions sanitaires ultérieures possibles, parfois liées à des comportements humains peu adaptés. D’un point de vue pratique, il semblerait plus pertinent de se pencher d’abord sur un meilleur contrôle de la diffusion des agents pathogènes au sein des populations humaines que d’agir a priori sur le réservoir animal, qu’il soit domestique ou sauvage, source potentielle de ces agents pathogènes mais selon des modalités difficiles à anticiper. La relation entre biodiversité et santé est abordée à l’occasion de la discussion. Summary Zoonoses, to be distinguished here from human diseases with an animal origin, represent a large quantity of pathological entities the corresponding pathogens of which are regularly shared between human beings and many different vertebrates species. Working on selected examples and situations, whatever the contamination routes and the facilitating reasons, puts into light a real rarity of a direct transmission from the animal reservoir to human beings. On the opposite, the diversity and the possible severity of some of the sanitary consequences, quite often in relation to human behaviours, must be stressed. 
On a practical point of view, it seems more adapted to try first to improve the control of pathogens diffusion within human populations after a contamination than to try to start to work first on the reservoir, be it domestic or wild, the potential source of these pathogens, but following routes quite difficult to anticipate. The relationship between biodiversity and health is discussed.'),\n",
              " '8xm0kacj': Cord19Doc(doc_id='8xm0kacj', title='Host range of SARS-CoV-2 and implications for public health', doi='10.1016/s2666-5247(20)30069-0', date='2020-06-18', abstract=''),\n",
              " 'ax6v6ham': Cord19Doc(doc_id='ax6v6ham', title='Close relationship between SARS-coronavirus and group 2 coronavirus.', doi='', date='2006', abstract='The sudden appearance and potential lethality of severe acute respiratory syndrome (SARS)-associated coronavirus (SARS-CoV) in humans has resulted in a focusing of new attention on the determination of both its origins and evolution. The relationship existing between SARS-CoV and other groups of coronaviruses was determined via analyses of phylogenetic trees and comparative genomic analyses of the coronavirus genes: polymerase (Orf1ab), spike (S), envelope (E), membrane (M) and nucleocapsid (N). Although the coronaviruses are traditionally classed into 3 groups, with SARS-CoV forming a 4th group, the phylogenetic position and origins of SARS-CoV remain a matter of some controversy. Thus, we conducted extensive phylogenetic analyses of the genes common to all coronavirus groups, using the Neighbor-joining, Maximum-likelihood, and Bayesian methods. Our data evidenced largely identical topology for all of the obtained phylogenetic trees, thus supporting the hypothesis that the relationship existing between SARS-CoV and group 2 coronavirus is a monophyletic one. Additional comparative genomic studies, including sequence similarity and protein secondary structure analyses, suggested that SARS-CoV may bear a closer relationship with group 2 than with the other coronavirus groups. Although our data strongly suggest that group 2 coronaviruses are most closely related with SARS-CoV, further and more detailed analyses may provide us with an increased amount of information regarding the origins and evolution of the coronaviruses, most notably SARS-CoV.'),\n",
              " 'fiievwy7': Cord19Doc(doc_id='fiievwy7', title='SARS-CoV-2 will continue to circulate in the human population: an opinion from the point of view of the virus-host relationship', doi='10.1007/s00011-020-01352-y', date='2020-04-30', abstract='At the population level, the virus-host relationship is not set up to end with the complete elimination of either or both. Pathogen-resistant individuals will always remain in the host population. In turn, the virus can never completely eliminate the host population, because evolutionarily such an event is a dead end for the virus as an obligate intracellular parasite. A certain existential balance exists in the virus-host relationship. Against this backdrop, viral epidemics and pandemics only become manifest and egregious to human beings when tens and hundreds of thousands of people die and the question emerges what caused the high mortality peaks on the death chart. The answer seems clear; the emerging strain of the virus is new to the host population, and new mutations of the virus and natural selection will lead to a survival of only genetically resistant individuals in a host population. The dangers inherent to a novel virus are due to new features generally inthe molecular structure of proteins, which enable the virus to infect the cells of the host organism more intensively, dramatically challenging host immunity, and thus be transmitted more readily in the host population. In this article, we will concentrate on the facts currently available about severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which has caused COVID-19 (coronavirus disease 2019) pandemic and try to predict its development and consequences based on the virus-host relationship. 
In fact, only two scenarios will occur simultaneously in the very near future: people who are genetically resistant to the virus will get sick, recover, and develop immunity, while people who are sensitive to the virus will need drugs and vaccines, which will have to be researched and developed if they are to recover. If the pandemic does not stop, in a few decades it is anticipated that SARS-CoV-2 will become as safe as the four non-severe acute respiratory syndrome human coronaviruses (HCoV-NL63, HCoV-HKU1, HCoV-OC43, and HCoV-229E) currently circulating but causing low mortality in the human population.')}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 11
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "d_oB4z4ALcO0"
      },
      "source": [
        "If you don't care about the order they are returned in, you can use `get_many_iter()`. This avoids keeping all the results in memory, and reads them in the order in which they appear on disk."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uBIivxhyK2kz",
        "outputId": "3423a7cd-e9ca-448c-e46b-98a057153601"
      },
      "source": [
        "for doc in docstore.get_many_iter(['ax6v6ham', '44l5q07k', '8xm0kacj', '3wuh6k6g', 'fiievwy7']):\n",
        "  print(doc)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cord19Doc(doc_id='3wuh6k6g', title='Understand Research Hotspots Surrounding COVID-19 and Other Coronavirus Infections Using Topic Modeling', doi='10.1101/2020.03.26.20044164', date='2020-03-30', abstract='Background: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a virus that causes severe respiratory illness in humans, which eventually results in the current outbreak of novel coronavirus disease (COVID-19) around the world. The research community is interested to know what are the hotspots in coronavirus (CoV) research and how much is known about COVID-19. This study aimed to evaluate the characteristics of publications involving coronaviruses as well as COVID-19 by using a topic modeling analysis. Methods: We extracted all abstracts and retained the most informative words from the COVID-19 Open Research Dataset, which contains all the 35,092 pieces of coronavirus related literature published up to March 20, 2020. Using Latent Dirichlet Allocation modeling, we trained an eight-topic model from the corpus. We then analyzed the semantic relationships between topics and compared the topic distribution between COVID-19 and other CoV infections. Results: Eight topics emerged overall: clinical characterization, pathogenesis research, therapeutics research, epidemiological study, virus transmission, vaccines research, virus diagnostics, and viral genomics. It was observed that COVID-19 research puts more emphasis on clinical characterization, epidemiological study, and virus transmission at present. In contrast, topics about diagnostics, therapeutics, vaccines, genomics and pathogenesis only accounted for less than 10% or even 4% of all the COVID-19 publications, much lower than those of other CoV infections. Conclusions: These results identified knowledge gaps in the area of COVID-19 and offered directions for future research. Keywords: COVID-19, coronavirus, topic modeling, hotspots, text mining')\n",
            "Cord19Doc(doc_id='ax6v6ham', title='Close relationship between SARS-coronavirus and group 2 coronavirus.', doi='', date='2006', abstract='The sudden appearance and potential lethality of severe acute respiratory syndrome (SARS)-associated coronavirus (SARS-CoV) in humans has resulted in a focusing of new attention on the determination of both its origins and evolution. The relationship existing between SARS-CoV and other groups of coronaviruses was determined via analyses of phylogenetic trees and comparative genomic analyses of the coronavirus genes: polymerase (Orf1ab), spike (S), envelope (E), membrane (M) and nucleocapsid (N). Although the coronaviruses are traditionally classed into 3 groups, with SARS-CoV forming a 4th group, the phylogenetic position and origins of SARS-CoV remain a matter of some controversy. Thus, we conducted extensive phylogenetic analyses of the genes common to all coronavirus groups, using the Neighbor-joining, Maximum-likelihood, and Bayesian methods. Our data evidenced largely identical topology for all of the obtained phylogenetic trees, thus supporting the hypothesis that the relationship existing between SARS-CoV and group 2 coronavirus is a monophyletic one. Additional comparative genomic studies, including sequence similarity and protein secondary structure analyses, suggested that SARS-CoV may bear a closer relationship with group 2 than with the other coronavirus groups. Although our data strongly suggest that group 2 coronaviruses are most closely related with SARS-CoV, further and more detailed analyses may provide us with an increased amount of information regarding the origins and evolution of the coronaviruses, most notably SARS-CoV.')\n",
            "Cord19Doc(doc_id='fiievwy7', title='SARS-CoV-2 will continue to circulate in the human population: an opinion from the point of view of the virus-host relationship', doi='10.1007/s00011-020-01352-y', date='2020-04-30', abstract='At the population level, the virus-host relationship is not set up to end with the complete elimination of either or both. Pathogen-resistant individuals will always remain in the host population. In turn, the virus can never completely eliminate the host population, because evolutionarily such an event is a dead end for the virus as an obligate intracellular parasite. A certain existential balance exists in the virus-host relationship. Against this backdrop, viral epidemics and pandemics only become manifest and egregious to human beings when tens and hundreds of thousands of people die and the question emerges what caused the high mortality peaks on the death chart. The answer seems clear; the emerging strain of the virus is new to the host population, and new mutations of the virus and natural selection will lead to a survival of only genetically resistant individuals in a host population. The dangers inherent to a novel virus are due to new features generally inthe molecular structure of proteins, which enable the virus to infect the cells of the host organism more intensively, dramatically challenging host immunity, and thus be transmitted more readily in the host population. In this article, we will concentrate on the facts currently available about severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which has caused COVID-19 (coronavirus disease 2019) pandemic and try to predict its development and consequences based on the virus-host relationship. 
In fact, only two scenarios will occur simultaneously in the very near future: people who are genetically resistant to the virus will get sick, recover, and develop immunity, while people who are sensitive to the virus will need drugs and vaccines, which will have to be researched and developed if they are to recover. If the pandemic does not stop, in a few decades it is anticipated that SARS-CoV-2 will become as safe as the four non-severe acute respiratory syndrome human coronaviruses (HCoV-NL63, HCoV-HKU1, HCoV-OC43, and HCoV-229E) currently circulating but causing low mortality in the human population.')\n",
            "Cord19Doc(doc_id='44l5q07k', title='Rôle des animaux vertébrés dans l’épidémiologie des zoonoses', doi='10.1016/s1773-035x(15)30110-6', date='2015-05-31', abstract='Résumé Les zoonoses, distinguées ici des maladies humaines d’origine animale, représentent un ensemble d’entités pathologiques dont les agents responsables circulent régulièrement entre l’espèce humaine et de nombreuses espèces de vertébrés. L’analyse de divers exemples, quelles que soient les voies de transmission et les causes favorisantes de la contamination, met en avant une régulière rareté du passage direct du réservoir animal vers l’espèce humaine, à opposer à la diversité et surtout à la gravité possible des évolutions sanitaires ultérieures possibles, parfois liées à des comportements humains peu adaptés. D’un point de vue pratique, il semblerait plus pertinent de se pencher d’abord sur un meilleur contrôle de la diffusion des agents pathogènes au sein des populations humaines que d’agir a priori sur le réservoir animal, qu’il soit domestique ou sauvage, source potentielle de ces agents pathogènes mais selon des modalités difficiles à anticiper. La relation entre biodiversité et santé est abordée à l’occasion de la discussion. Summary Zoonoses, to be distinguished here from human diseases with an animal origin, represent a large quantity of pathological entities the corresponding pathogens of which are regularly shared between human beings and many different vertebrates species. Working on selected examples and situations, whatever the contamination routes and the facilitating reasons, puts into light a real rarity of a direct transmission from the animal reservoir to human beings. On the opposite, the diversity and the possible severity of some of the sanitary consequences, quite often in relation to human behaviours, must be stressed. 
On a practical point of view, it seems more adapted to try first to improve the control of pathogens diffusion within human populations after a contamination than to try to start to work first on the reservoir, be it domestic or wild, the potential source of these pathogens, but following routes quite difficult to anticipate. The relationship between biodiversity and health is discussed.')\n",
            "Cord19Doc(doc_id='8xm0kacj', title='Host range of SARS-CoV-2 and implications for public health', doi='10.1016/s2666-5247(20)30069-0', date='2020-06-18', abstract='')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Vis7u-VeMX70"
      },
      "source": [
        "## Queries\n",
        "\n",
        "`queries` (topics) map a `query_id` to one or more text fields. Akin to `docs`, you can iterate over queries for a collection using `queries_iter()`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-Tl3npjZLEWB",
        "outputId": "57bb154e-cb8a-4e6e-b57d-7ae3e5261c22"
      },
      "source": [
        "for query in dataset.queries_iter():\n",
        "  print(query)"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "TrecQuery(query_id='1', title='coronavirus origin', description='what is the origin of COVID-19', narrative=\"seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans\")\n",
            "TrecQuery(query_id='2', title='coronavirus response to weather changes', description='how does the coronavirus respond to changes in the weather', narrative='seeking range of information about the SARS-CoV-2 virus viability in different weather/climate conditions as well as information related to transmission of the virus in different climate conditions')\n",
            "TrecQuery(query_id='3', title='coronavirus immunity', description='will SARS-CoV2 infected people develop immunity? Is cross protection possible?', narrative='seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained due to infection with other coronavirus types')\n",
            "TrecQuery(query_id='4', title='how do people die from the coronavirus', description='what causes death from Covid-19?', narrative='Studies looking at mechanisms of death from Covid-19.')\n",
            "TrecQuery(query_id='5', title='animal models of COVID-19', description='what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?', narrative='Papers that describe the results  of testing drugs that bind to spike proteins of the virus or any other drugs in any animal models. Papers about SARS-CoV-2 infection in cell culture assays are also relevant.')\n",
            "TrecQuery(query_id='6', title='coronavirus test rapid testing', description='what types of rapid testing for Covid-19 have been developed?', narrative='Looking for studies identifying ways to diagnose Covid-19 more rapidly.')\n",
            "TrecQuery(query_id='7', title='serological tests for coronavirus', description='are there serological tests that detect antibodies to coronavirus?', narrative='Looking for assays that measure immune response to COVID-19 that will help determine past infection and subsequent possible immunity.')\n",
            "TrecQuery(query_id='8', title='coronavirus under reporting', description='how has lack of testing availability led to underreporting of true incidence of Covid-19?', narrative='Looking for studies answering questions of impact of lack of complete testing for Covid-19 on incidence and prevalence of Covid-19.')\n",
            "TrecQuery(query_id='9', title='coronavirus in Canada', description='how has COVID-19 affected Canada', narrative='seeking data related to infections (confirm, suspected, and projected) and health outcomes (symptoms, hospitalization, intensive care, mortality)')\n",
            "TrecQuery(query_id='10', title='coronavirus social distancing impact', description='has social distancing had an impact on slowing the spread of COVID-19?', narrative=\"seeking specific information on studies that have measured COVID-19's transmission in one or more social distancing (or non-social distancing) approaches\")\n",
            "TrecQuery(query_id='11', title='coronavirus hospital rationing', description='what are the guidelines for triaging patients infected with coronavirus?', narrative='Seeking information on any guidelines for prioritizing COVID-19 patients infected with coronavirus based on demographics, clinical signs, serology and other tests.')\n",
            "TrecQuery(query_id='12', title='coronavirus quarantine', description='what are best practices in hospitals and at home in maintaining quarantine?', narrative='Seeking information on best practices for activities and duration of quarantine for those exposed and/ infected to COVID-19 virus.')\n",
            "TrecQuery(query_id='13', title='how does coronavirus spread', description='what are the transmission routes of coronavirus?', narrative='Looking for information on all possible ways to contract COVID-19 from people, animals and objects')\n",
            "TrecQuery(query_id='14', title='coronavirus super spreaders', description='what evidence is there related to COVID-19 super spreaders', narrative='seeking range of information related to the number and proportion of super spreaders, their patterns of behavior that lead to spread, and potential prevention strategies targeted specifically toward super spreaders')\n",
            "TrecQuery(query_id='15', title='coronavirus outside body', description='how long can the coronavirus live outside the body', narrative=\"seeking range of information on the SARS-CoV-2's virus's survival in different environments (surfaces, liquids, etc.) outside the human body while still being viable for transmission to another human\")\n",
            "TrecQuery(query_id='16', title='how long does coronavirus survive on surfaces', description='how long does coronavirus remain stable  on surfaces?', narrative='Studies of time SARS-CoV-2 remains stable after being deposited from an infected person on everyday surfaces in a household or hospital setting, such as through coughing or touching objects.')\n",
            "TrecQuery(query_id='17', title='coronavirus clinical trials', description='are there any clinical trials available for the coronavirus', narrative='seeking specific COVID-19 clinical trials ranging from trials in recruitment to completed trials with results')\n",
            "TrecQuery(query_id='18', title='masks prevent coronavirus', description='what are the best masks for preventing infection by Covid-19?', narrative='What types of masks should or should not be used to prevent infection by Covid-19?')\n",
            "TrecQuery(query_id='19', title='what alcohol sanitizer kills coronavirus', description='what type of hand sanitizer is needed to destroy Covid-19?', narrative='Studies assessing chemicals and their concentrations needed to destroy the Covid-19 virus.')\n",
            "TrecQuery(query_id='20', title='coronavirus and ACE inhibitors', description='are patients taking Angiotensin-converting enzyme inhibitors (ACE) at increased risk for COVID-19?', narrative='Looking for information on interactions between  coronavirus and  angiotensin converting enzyme 2 (ACE2) receptors, risk for patients taking these medications, and recommendations for these patients.')\n",
            "TrecQuery(query_id='21', title='coronavirus mortality', description='what are the mortality rates overall and in specific populations', narrative='Seeking  information on COVID-19 fatality rates in different countries and in different population groups based on gender, blood types, or other factors')\n",
            "TrecQuery(query_id='22', title='coronavirus heart impacts', description='are cardiac complications likely in patients with COVID-19?', narrative='Seeking information on the types, frequency and mechanisms of cardiac complications caused by coronavirus.')\n",
            "TrecQuery(query_id='23', title='coronavirus hypertension', description='what kinds of complications related to COVID-19 are associated with hypertension?', narrative='seeking specific outcomes that hypertensive  (any type) patients are more/less likely to face if infected with the virus')\n",
            "TrecQuery(query_id='24', title='coronavirus diabetes', description='what kinds of complications related to COVID-19 are associated with diabetes', narrative='seeking specific outcomes that diabetic (any type) patients are more/less likely to face if infected with the virus')\n",
            "TrecQuery(query_id='25', title='coronavirus biomarkers', description='which biomarkers predict the severe clinical course of 2019-nCOV infection?', narrative='Looking for information on biomarkers that predict disease outcomes in people infected with coronavirus, specifically those that predict severe and fatal outcomes.')\n",
            "TrecQuery(query_id='26', title='coronavirus early symptoms', description='what are the initial symptoms of Covid-19?', narrative='Studies of patients and the first clinical manifestations they develop upon active infection?')\n",
            "TrecQuery(query_id='27', title='coronavirus asymptomatic', description='what is known about those infected with Covid-19 but are asymptomatic?', narrative='Studies of people who are known to be infected with Covid-19 but show no symptoms?')\n",
            "TrecQuery(query_id='28', title='coronavirus hydroxychloroquine', description='what evidence is there for the value of hydroxychloroquine in treating Covid-19?', narrative='Basic science or clinical studies assessing the benefit and harms of treating Covid-19 with hydroxychloroquine.')\n",
            "TrecQuery(query_id='29', title='coronavirus drug repurposing', description='which SARS-CoV-2 proteins-human proteins interactions indicate potential for drug targets. Are there approved drugs that can be repurposed based on this information?', narrative='Seeking information about protein-protein interactions for any of the SARS-CoV-2  structural proteins that represent a promising therapeutic target,  and the drug  molecules that may inhibit the virus and the host cell receptors at entry step.')\n",
            "TrecQuery(query_id='30', title='coronavirus remdesivir', description='is remdesivir an effective treatment for COVID-19', narrative='seeking specific information on clinical outcomes in COVID-19 patients treated with remdesivir')\n",
            "TrecQuery(query_id='31', title='difference between coronavirus and flu', description='How does the coronavirus differ from seasonal flu?', narrative='Includes studies ranging from those focusing on genomic differences to global public health impacts, but must draw direct comparisons between COVID-19 and seasonal influenza.')\n",
            "TrecQuery(query_id='32', title='coronavirus subtypes', description='Does SARS-CoV-2 have any subtypes, and if so what are they?', narrative='Papers that discuss subtypes of the virus, from named subtypes to speculative subtypes based on genomic or geographic clustering.')\n",
            "TrecQuery(query_id='33', title='coronavirus vaccine candidates', description='What vaccine candidates are being tested for Covid-19?', narrative='Seeking studies that discuss possible, but specific, COVID-19 vaccines. Includes articles from those describing the mechanisms of action of specific proposed vaccines to actual clinical trials, but excluding articles that do not name a specific vaccine candidate.')\n",
            "TrecQuery(query_id='34', title='coronavirus recovery', description='What are the longer-term complications of those who recover from COVID-19?', narrative='Seeking information on the health outcomes for those that recover from the virus. Excludes studies only focusing on adverse effects related to a particular COVID-19 drug.')\n",
            "TrecQuery(query_id='35', title='coronavirus public datasets', description='What new public datasets are available related to COVID-19?', narrative='Seeking articles that specifically release new data related to SARS-CoV-2 or COVID-19, including genomic data, patient data, public health data, etc. Articles that reference previously existing datasets are not relevant.')\n",
            "TrecQuery(query_id='36', title='SARS-CoV-2 spike structure', description='What is the protein structure of the SARS-CoV-2 spike?', narrative='Looking for studies of the structure of the spike protein on the virus using any methods, such as cryo-EM or crystallography')\n",
            "TrecQuery(query_id='37', title='SARS-CoV-2 phylogenetic analysis', description='What is the result of phylogenetic analysis of SARS-CoV-2 genome sequence?', narrative='Looking for a range of studies which provide the results of phylogenetic network analysis on the SARS-CoV-2 genome')\n",
            "TrecQuery(query_id='38', title='COVID inflammatory response', description='What is the mechanism of inflammatory response and pathogenesis of COVID-19 cases?', narrative='Looking for a range of studies which describes the inflammatory response cells and pathogenesis during the Coronavirus Disease 2019 (COVID-19) outbreak, including the mechanism of anti-inflammatory drugs, corticosteroids, and vitamin supplements')\n",
            "TrecQuery(query_id='39', title='COVID-19 cytokine storm', description='What is the mechanism of cytokine storm syndrome on the COVID-19?', narrative='Looking for studies that describes mechanism of development of cytokine storm syndrome among COVID-19 cases and the range of drugs used for the therapy of cytokine storm')\n",
            "TrecQuery(query_id='40', title='coronavirus mutations', description='What are the observed mutations in the SARS-CoV-2 genome and how often do the mutations occur?', narrative='Looking for studies that describes the emergence of genomic diversity of the coronavirus due to recurrent mutations which explore the potential genomic site of the mutation, mechanisms and its potential or observed clinical implications in the pathogenicity of the virus')\n",
            "TrecQuery(query_id='41', title='COVID-19 in African-Americans', description='What are the impacts of COVID-19 among African-Americans that differ from the rest of the U.S. population?', narrative='Looking for studies that analyze burden of illness and death among the African-American/black racial/ethnic group. This includes potential reasons for transmission, morbidity, and mortality. This may include discussion of other minority groups, but all studies should contain specific information on the health disparities faced by African-Americans in this pandemic.')\n",
            "TrecQuery(query_id='42', title='Vitamin D and COVID-19', description='Does Vitamin D impact COVID-19 prevention and treatment?', narrative='This includes studies describing possible role of Vitamin D in prevention of COVID-19, suppression of cytokine storm, clinical outcomes, and associations between Vitamin D status and COVID-19 mortality.')\n",
            "TrecQuery(query_id='43', title='violence during pandemic', description='How has the COVID-19 pandemic impacted violence in society, including violent crimes?', narrative='Looking for analyses and data on how the pandemic is impacting rates of violence, including domestic/family violence related to quarantine.')\n",
            "TrecQuery(query_id='44', title='impact of masks on coronavirus transmission', description='How much impact do masks have on preventing the spread of the COVID-19?', narrative='Looking for studies of how masks slow SARS-CoV-2 transmission, including impact on R0. Studies can include both lab and population studies.')\n",
            "TrecQuery(query_id='45', title='coronavirus mental health impact', description='How has the COVID-19 pandemic impacted mental health?', narrative='Includes increasing/decreasing rates of depression, anxiety, panic disorder, and other psychiatric and mental health conditions.')\n",
            "TrecQuery(query_id='46', title='dexamethasone coronavirus', description='what evidence is there for dexamethasone as a treatment for COVID-19?', narrative='Looking for studies on the impact of dexamethasone treatment in COVID-19 patients, including health benefits as well as adverse effects. This also includes specific populations that are benefitted/harmed by dexamethasone.')\n",
            "TrecQuery(query_id='47', title='COVID-19 outcomes in children', description='what are the health outcomes for children who contract COVID-19?', narrative='Looking for studies on health outcomes in children related to COVID-19. This includes studies attempting to explain the underlying biological mechanisms for why children differ from adults in response to infection.')\n",
            "TrecQuery(query_id='48', title='school reopening coronavirus', description='what are the benefits and risks of re-opening schools in the midst of the COVID-19 pandemic?', narrative='With the possibility of schools re-opening while the COVID-19 pandemic is still ongoing, this topic is looking for evidence or projections on what the potential implications of this are in terms of COVID-19 cases, hospitalizations, or deaths, as well as other benefits or harms to re-opening schools. This includes both the impact on students, teachers, families, and the wider community.')\n",
            "TrecQuery(query_id='49', title='post-infection COVID-19 immunity', description='do individuals who recover from COVID-19 show sufficient immune response, including antibody levels and T-cell mediated immunity, to prevent re-infection?', narrative='There is concern about re-infection for COVID-19, so this topic is looking for studies suggesting post-infection immunity, including post-infection antibody levels (over time) and evidence for individuals who have been infected more than once.')\n",
            "TrecQuery(query_id='50', title='mRNA vaccine coronavirus', description='what is known about an mRNA vaccine for the SARS-CoV-2 virus?', narrative='Looking for studies specifically focusing on mRNA vaccines for COVID-19, including how mRNA vaccines work, why they are promising, and any results from actual clinical studies.')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-0TnBGErO7VS"
      },
      "source": [
        "Iterables of namedtuples are handy structures because they are lightweight and do not load all the content into memory. But in case you need that, you can easily convert them into other data structures. Here's an example building a Pandas DataFrame of the queries:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "nFsImZY1PNGa",
        "outputId": "6bf596b8-8412-48e7-be75-eaf5e703eb01"
      },
      "source": [
        "import pandas as pd\n",
        "pd.DataFrame(dataset.queries_iter())"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>query_id</th>\n",
              "      <th>title</th>\n",
              "      <th>description</th>\n",
              "      <th>narrative</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>coronavirus origin</td>\n",
              "      <td>what is the origin of COVID-19</td>\n",
              "      <td>seeking range of information about the SARS-Co...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2</td>\n",
              "      <td>coronavirus response to weather changes</td>\n",
              "      <td>how does the coronavirus respond to changes in...</td>\n",
              "      <td>seeking range of information about the SARS-Co...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3</td>\n",
              "      <td>coronavirus immunity</td>\n",
              "      <td>will SARS-CoV2 infected people develop immunit...</td>\n",
              "      <td>seeking studies of immunity developed due to i...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>4</td>\n",
              "      <td>how do people die from the coronavirus</td>\n",
              "      <td>what causes death from Covid-19?</td>\n",
              "      <td>Studies looking at mechanisms of death from Co...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>5</td>\n",
              "      <td>animal models of COVID-19</td>\n",
              "      <td>what drugs have been active against SARS-CoV o...</td>\n",
              "      <td>Papers that describe the results  of testing d...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>6</td>\n",
              "      <td>coronavirus test rapid testing</td>\n",
              "      <td>what types of rapid testing for Covid-19 have ...</td>\n",
              "      <td>Looking for studies identifying ways to diagno...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>7</td>\n",
              "      <td>serological tests for coronavirus</td>\n",
              "      <td>are there serological tests that detect antibo...</td>\n",
              "      <td>Looking for assays that measure immune respons...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>8</td>\n",
              "      <td>coronavirus under reporting</td>\n",
              "      <td>how has lack of testing availability led to un...</td>\n",
              "      <td>Looking for studies answering questions of imp...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>9</td>\n",
              "      <td>coronavirus in Canada</td>\n",
              "      <td>how has COVID-19 affected Canada</td>\n",
              "      <td>seeking data related to infections (confirm, s...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>10</td>\n",
              "      <td>coronavirus social distancing impact</td>\n",
              "      <td>has social distancing had an impact on slowing...</td>\n",
              "      <td>seeking specific information on studies that h...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10</th>\n",
              "      <td>11</td>\n",
              "      <td>coronavirus hospital rationing</td>\n",
              "      <td>what are the guidelines for triaging patients ...</td>\n",
              "      <td>Seeking information on any guidelines for prio...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>12</td>\n",
              "      <td>coronavirus quarantine</td>\n",
              "      <td>what are best practices in hospitals and at ho...</td>\n",
              "      <td>Seeking information on best practices for acti...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>12</th>\n",
              "      <td>13</td>\n",
              "      <td>how does coronavirus spread</td>\n",
              "      <td>what are the transmission routes of coronavirus?</td>\n",
              "      <td>Looking for information on all possible ways t...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>13</th>\n",
              "      <td>14</td>\n",
              "      <td>coronavirus super spreaders</td>\n",
              "      <td>what evidence is there related to COVID-19 sup...</td>\n",
              "      <td>seeking range of information related to the nu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>14</th>\n",
              "      <td>15</td>\n",
              "      <td>coronavirus outside body</td>\n",
              "      <td>how long can the coronavirus live outside the ...</td>\n",
              "      <td>seeking range of information on the SARS-CoV-2...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>15</th>\n",
              "      <td>16</td>\n",
              "      <td>how long does coronavirus survive on surfaces</td>\n",
              "      <td>how long does coronavirus remain stable  on su...</td>\n",
              "      <td>Studies of time SARS-CoV-2 remains stable afte...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16</th>\n",
              "      <td>17</td>\n",
              "      <td>coronavirus clinical trials</td>\n",
              "      <td>are there any clinical trials available for th...</td>\n",
              "      <td>seeking specific COVID-19 clinical trials rang...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>17</th>\n",
              "      <td>18</td>\n",
              "      <td>masks prevent coronavirus</td>\n",
              "      <td>what are the best masks for preventing infecti...</td>\n",
              "      <td>What types of masks should or should not be us...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>18</th>\n",
              "      <td>19</td>\n",
              "      <td>what alcohol sanitizer kills coronavirus</td>\n",
              "      <td>what type of hand sanitizer is needed to destr...</td>\n",
              "      <td>Studies assessing chemicals and their concentr...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>19</th>\n",
              "      <td>20</td>\n",
              "      <td>coronavirus and ACE inhibitors</td>\n",
              "      <td>are patients taking Angiotensin-converting enz...</td>\n",
              "      <td>Looking for information on interactions betwee...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>20</th>\n",
              "      <td>21</td>\n",
              "      <td>coronavirus mortality</td>\n",
              "      <td>what are the mortality rates overall and in sp...</td>\n",
              "      <td>Seeking  information on COVID-19 fatality rate...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>21</th>\n",
              "      <td>22</td>\n",
              "      <td>coronavirus heart impacts</td>\n",
              "      <td>are cardiac complications likely in patients w...</td>\n",
              "      <td>Seeking information on the types, frequency an...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>22</th>\n",
              "      <td>23</td>\n",
              "      <td>coronavirus hypertension</td>\n",
              "      <td>what kinds of complications related to COVID-1...</td>\n",
              "      <td>seeking specific outcomes that hypertensive  (...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23</th>\n",
              "      <td>24</td>\n",
              "      <td>coronavirus diabetes</td>\n",
              "      <td>what kinds of complications related to COVID-1...</td>\n",
              "      <td>seeking specific outcomes that diabetic (any t...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24</th>\n",
              "      <td>25</td>\n",
              "      <td>coronavirus biomarkers</td>\n",
              "      <td>which biomarkers predict the severe clinical c...</td>\n",
              "      <td>Looking for information on biomarkers that pre...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25</th>\n",
              "      <td>26</td>\n",
              "      <td>coronavirus early symptoms</td>\n",
              "      <td>what are the initial symptoms of Covid-19?</td>\n",
              "      <td>Studies of patients and the first clinical man...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>26</th>\n",
              "      <td>27</td>\n",
              "      <td>coronavirus asymptomatic</td>\n",
              "      <td>what is known about those infected with Covid-...</td>\n",
              "      <td>Studies of people who are known to be infected...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>27</th>\n",
              "      <td>28</td>\n",
              "      <td>coronavirus hydroxychloroquine</td>\n",
              "      <td>what evidence is there for the value of hydrox...</td>\n",
              "      <td>Basic science or clinical studies assessing th...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>28</th>\n",
              "      <td>29</td>\n",
              "      <td>coronavirus drug repurposing</td>\n",
              "      <td>which SARS-CoV-2 proteins-human proteins inter...</td>\n",
              "      <td>Seeking information about protein-protein inte...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>29</th>\n",
              "      <td>30</td>\n",
              "      <td>coronavirus remdesivir</td>\n",
              "      <td>is remdesivir an effective treatment for COVID-19</td>\n",
              "      <td>seeking specific information on clinical outco...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>30</th>\n",
              "      <td>31</td>\n",
              "      <td>difference between coronavirus and flu</td>\n",
              "      <td>How does the coronavirus differ from seasonal ...</td>\n",
              "      <td>Includes studies ranging from those focusing o...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>31</th>\n",
              "      <td>32</td>\n",
              "      <td>coronavirus subtypes</td>\n",
              "      <td>Does SARS-CoV-2 have any subtypes, and if so w...</td>\n",
              "      <td>Papers that discuss subtypes of the virus, fro...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>32</th>\n",
              "      <td>33</td>\n",
              "      <td>coronavirus vaccine candidates</td>\n",
              "      <td>What vaccine candidates are being tested for C...</td>\n",
              "      <td>Seeking studies that discuss possible, but spe...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>33</th>\n",
              "      <td>34</td>\n",
              "      <td>coronavirus recovery</td>\n",
              "      <td>What are the longer-term complications of thos...</td>\n",
              "      <td>Seeking information on the health outcomes for...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>34</th>\n",
              "      <td>35</td>\n",
              "      <td>coronavirus public datasets</td>\n",
              "      <td>What new public datasets are available related...</td>\n",
              "      <td>Seeking articles that specifically release new...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>35</th>\n",
              "      <td>36</td>\n",
              "      <td>SARS-CoV-2 spike structure</td>\n",
              "      <td>What is the protein structure of the SARS-CoV-...</td>\n",
              "      <td>Looking for studies of the structure of the sp...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>36</th>\n",
              "      <td>37</td>\n",
              "      <td>SARS-CoV-2 phylogenetic analysis</td>\n",
              "      <td>What is the result of phylogenetic analysis of...</td>\n",
              "      <td>Looking for a range of studies which provide t...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>37</th>\n",
              "      <td>38</td>\n",
              "      <td>COVID inflammatory response</td>\n",
              "      <td>What is the mechanism of inflammatory response...</td>\n",
              "      <td>Looking for a range of studies which describes...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>38</th>\n",
              "      <td>39</td>\n",
              "      <td>COVID-19 cytokine storm</td>\n",
              "      <td>What is the mechanism of cytokine storm syndro...</td>\n",
              "      <td>Looking for studies that describes mechanism o...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>39</th>\n",
              "      <td>40</td>\n",
              "      <td>coronavirus mutations</td>\n",
              "      <td>What are the observed mutations in the SARS-Co...</td>\n",
              "      <td>Looking for studies that describes the emergen...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>40</th>\n",
              "      <td>41</td>\n",
              "      <td>COVID-19 in African-Americans</td>\n",
              "      <td>What are the impacts of COVID-19 among African...</td>\n",
              "      <td>Looking for studies that analyze burden of ill...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>41</th>\n",
              "      <td>42</td>\n",
              "      <td>Vitamin D and COVID-19</td>\n",
              "      <td>Does Vitamin D impact COVID-19 prevention and ...</td>\n",
              "      <td>This includes studies describing possible role...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>42</th>\n",
              "      <td>43</td>\n",
              "      <td>violence during pandemic</td>\n",
              "      <td>How has the COVID-19 pandemic impacted violenc...</td>\n",
              "      <td>Looking for analyses and data on how the pande...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>43</th>\n",
              "      <td>44</td>\n",
              "      <td>impact of masks on coronavirus transmission</td>\n",
              "      <td>How much impact do masks have on preventing th...</td>\n",
              "      <td>Looking for studies of how masks slow SARS-CoV...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>44</th>\n",
              "      <td>45</td>\n",
              "      <td>coronavirus mental health impact</td>\n",
              "      <td>How has the COVID-19 pandemic impacted mental ...</td>\n",
              "      <td>Includes increasing/decreasing rates of depres...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>45</th>\n",
              "      <td>46</td>\n",
              "      <td>dexamethasone coronavirus</td>\n",
              "      <td>what evidence is there for dexamethasone as a ...</td>\n",
              "      <td>Looking for studies on the impact of dexametha...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>46</th>\n",
              "      <td>47</td>\n",
              "      <td>COVID-19 outcomes in children</td>\n",
              "      <td>what are the health outcomes for children who ...</td>\n",
              "      <td>Looking for studies on health outcomes in chil...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>47</th>\n",
              "      <td>48</td>\n",
              "      <td>school reopening coronavirus</td>\n",
              "      <td>what are the benefits and risks of re-opening ...</td>\n",
              "      <td>With the possibility of schools re-opening whi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>48</th>\n",
              "      <td>49</td>\n",
              "      <td>post-infection COVID-19 immunity</td>\n",
              "      <td>do individuals who recover from COVID-19 show ...</td>\n",
              "      <td>There is concern about re-infection for COVID-...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>49</th>\n",
              "      <td>50</td>\n",
              "      <td>mRNA vaccine coronavirus</td>\n",
              "      <td>what is known about an mRNA vaccine for the SA...</td>\n",
              "      <td>Looking for studies specifically focusing on m...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   query_id  ...                                          narrative\n",
              "0         1  ...  seeking range of information about the SARS-Co...\n",
              "1         2  ...  seeking range of information about the SARS-Co...\n",
              "2         3  ...  seeking studies of immunity developed due to i...\n",
              "3         4  ...  Studies looking at mechanisms of death from Co...\n",
              "4         5  ...  Papers that describe the results  of testing d...\n",
              "5         6  ...  Looking for studies identifying ways to diagno...\n",
              "6         7  ...  Looking for assays that measure immune respons...\n",
              "7         8  ...  Looking for studies answering questions of imp...\n",
              "8         9  ...  seeking data related to infections (confirm, s...\n",
              "9        10  ...  seeking specific information on studies that h...\n",
              "10       11  ...  Seeking information on any guidelines for prio...\n",
              "11       12  ...  Seeking information on best practices for acti...\n",
              "12       13  ...  Looking for information on all possible ways t...\n",
              "13       14  ...  seeking range of information related to the nu...\n",
              "14       15  ...  seeking range of information on the SARS-CoV-2...\n",
              "15       16  ...  Studies of time SARS-CoV-2 remains stable afte...\n",
              "16       17  ...  seeking specific COVID-19 clinical trials rang...\n",
              "17       18  ...  What types of masks should or should not be us...\n",
              "18       19  ...  Studies assessing chemicals and their concentr...\n",
              "19       20  ...  Looking for information on interactions betwee...\n",
              "20       21  ...  Seeking  information on COVID-19 fatality rate...\n",
              "21       22  ...  Seeking information on the types, frequency an...\n",
              "22       23  ...  seeking specific outcomes that hypertensive  (...\n",
              "23       24  ...  seeking specific outcomes that diabetic (any t...\n",
              "24       25  ...  Looking for information on biomarkers that pre...\n",
              "25       26  ...  Studies of patients and the first clinical man...\n",
              "26       27  ...  Studies of people who are known to be infected...\n",
              "27       28  ...  Basic science or clinical studies assessing th...\n",
              "28       29  ...  Seeking information about protein-protein inte...\n",
              "29       30  ...  seeking specific information on clinical outco...\n",
              "30       31  ...  Includes studies ranging from those focusing o...\n",
              "31       32  ...  Papers that discuss subtypes of the virus, fro...\n",
              "32       33  ...  Seeking studies that discuss possible, but spe...\n",
              "33       34  ...  Seeking information on the health outcomes for...\n",
              "34       35  ...  Seeking articles that specifically release new...\n",
              "35       36  ...  Looking for studies of the structure of the sp...\n",
              "36       37  ...  Looking for a range of studies which provide t...\n",
              "37       38  ...  Looking for a range of studies which describes...\n",
              "38       39  ...  Looking for studies that describes mechanism o...\n",
              "39       40  ...  Looking for studies that describes the emergen...\n",
              "40       41  ...  Looking for studies that analyze burden of ill...\n",
              "41       42  ...  This includes studies describing possible role...\n",
              "42       43  ...  Looking for analyses and data on how the pande...\n",
              "43       44  ...  Looking for studies of how masks slow SARS-CoV...\n",
              "44       45  ...  Includes increasing/decreasing rates of depres...\n",
              "45       46  ...  Looking for studies on the impact of dexametha...\n",
              "46       47  ...  Looking for studies on health outcomes in chil...\n",
              "47       48  ...  With the possibility of schools re-opening whi...\n",
              "48       49  ...  There is concern about re-infection for COVID-...\n",
              "49       50  ...  Looking for studies specifically focusing on m...\n",
              "\n",
              "[50 rows x 4 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "x8guincjNQA1"
      },
      "source": [
        "Again, we can [check the documentation](https://ir-datasets.com/cord19.html#cord19/trec-covid) for information about what fields are available. Or we can use `queries_cls()`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lxSvF03NM2Dt",
        "outputId": "d2ca9dd1-5914-4697-d8b4-8b904c85902a"
      },
      "source": [
        "dataset.queries_cls()"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "ir_datasets.formats.trec.TrecQuery"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 15
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BRBNpFHZNK70",
        "outputId": "5142178d-5844-4224-e5b1-c7547d7dec86"
      },
      "source": [
        "dataset.queries_cls()._fields"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('query_id', 'title', 'description', 'narrative')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PycaUfckNM80",
        "outputId": "1a5f74a7-1131-4937-e3e5-1ae06e8015cd"
      },
      "source": [
        "dataset.queries_cls().__annotations__"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "OrderedDict([('query_id', str),\n",
              "             ('title', str),\n",
              "             ('description', str),\n",
              "             ('narrative', str)])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p5IA74C3Nlr0"
      },
      "source": [
        "## Query Relevance Assessments\n",
        "\n",
        "`qrels` (query relevance assessments/judgments) map a `query_id` and `doc_id` to a relevance score.\n",
        "\n",
        "You probably guessed it; we can fetch qrels for a dataset with `qrels_iter()`. There are a lot of them, so we'll just show them in a DataFrame to start with:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "VYldukNANN8I",
        "outputId": "998a629b-89f2-4d23-9de9-4289c92e3287"
      },
      "source": [
        "pd.DataFrame(dataset.qrels_iter())"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>query_id</th>\n",
              "      <th>doc_id</th>\n",
              "      <th>relevance</th>\n",
              "      <th>iteration</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>005b2j4b</td>\n",
              "      <td>2</td>\n",
              "      <td>4.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>00fmeepz</td>\n",
              "      <td>1</td>\n",
              "      <td>4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>010vptx3</td>\n",
              "      <td>2</td>\n",
              "      <td>0.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1</td>\n",
              "      <td>0194oljo</td>\n",
              "      <td>1</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>1</td>\n",
              "      <td>021q9884</td>\n",
              "      <td>1</td>\n",
              "      <td>4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>69313</th>\n",
              "      <td>50</td>\n",
              "      <td>zvop8bxh</td>\n",
              "      <td>2</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>69314</th>\n",
              "      <td>50</td>\n",
              "      <td>zwf26o63</td>\n",
              "      <td>1</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>69315</th>\n",
              "      <td>50</td>\n",
              "      <td>zwsvlnwe</td>\n",
              "      <td>0</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>69316</th>\n",
              "      <td>50</td>\n",
              "      <td>zxr01yln</td>\n",
              "      <td>1</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>69317</th>\n",
              "      <td>50</td>\n",
              "      <td>zz8wvos9</td>\n",
              "      <td>1</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>69318 rows × 4 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "      query_id    doc_id  relevance iteration\n",
              "0            1  005b2j4b          2       4.5\n",
              "1            1  00fmeepz          1         4\n",
              "2            1  010vptx3          2       0.5\n",
              "3            1  0194oljo          1       2.5\n",
              "4            1  021q9884          1         4\n",
              "...        ...       ...        ...       ...\n",
              "69313       50  zvop8bxh          2         5\n",
              "69314       50  zwf26o63          1         5\n",
              "69315       50  zwsvlnwe          0         5\n",
              "69316       50  zxr01yln          1         5\n",
              "69317       50  zz8wvos9          1         5\n",
              "\n",
              "[69318 rows x 4 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "piCD2tZkOGaQ"
      },
      "source": [
        "What do relevance=0, 1, and 2 mean? You can find out with `qrels_defs()`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Fy_xf2vQN-Jx",
        "outputId": "4b6264db-7483-4630-a558-a36c8d7420c8"
      },
      "source": [
        "dataset.qrels_defs()"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{0: 'Not Relevant: everything else.',\n",
              " 1: 'Partially Relevant: the article answers part of the question but would need to be combined with other information to get a complete answer.',\n",
              " 2: 'Relevant: the article is fully responsive to the information need as expressed by the topic, i.e. answers the Question in the topic. The article need not contain all information on the topic, but must, on its own, provide an answer to the question.'}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "B8noJu7xOQ23"
      },
      "source": [
        "Of course we can also get information about the `TrecQrel` type using `qrels_cls()`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0q8oZBIQOEnX",
        "outputId": "64ff29a9-e273-4ead-b94d-f5b5aa8eda48"
      },
      "source": [
        "dataset.qrels_cls()"
      ],
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "ir_datasets.formats.trec.TrecQrel"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 20
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4tJ_aXCBOYBG",
        "outputId": "479d1dc0-5d02-429c-a243-7b24feda1e4e"
      },
      "source": [
        "dataset.qrels_cls()._fields"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('query_id', 'doc_id', 'relevance', 'iteration')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 21
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QAImnTAtOZoz",
        "outputId": "dbf62766-db08-424e-c3fa-a22634500907"
      },
      "source": [
        "dataset.qrels_cls().__annotations__"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "OrderedDict([('query_id', str),\n",
              "             ('doc_id', str),\n",
              "             ('relevance', int),\n",
              "             ('iteration', str)])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 22
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GQb4jO-HOf6q"
      },
      "source": [
        "## Wrapping Up\n",
        "\n",
        "So that's the core functionality. You can find more information in the [documentation](https://ir-datasets.com/)."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EldJW1rhObGy"
      },
      "source": [
        ""
      ],
      "execution_count": 22,
      "outputs": []
    }
  ]
}

================================================
FILE: examples/ir_datasets_cli.ipynb
================================================
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ir_datasets-cli.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SP6ophbQq5I0"
      },
      "source": [
        "# ir_datasets - Tutorial - CLI\n",
        "\n",
        "**NOTE: This tutorial is for the command-line interface. See the other tutorial for Python.**"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cl8KYrJTq-g0"
      },
      "source": [
        "## Getting Started\n",
        "\n",
        "We'll start out by installing the package. The package is available on pypi,\n",
        "so you can install it with your favorite package manager."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vbGhAIREqw1c",
        "outputId": "1d7fcdb3-93a2-4668-fd7d-787d1471f648"
      },
      "source": [
        "!pip install ir_datasets"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: ir_datasets in /usr/local/lib/python3.7/dist-packages (0.3.1)\n",
            "Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.41.1)\n",
            "Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.2.3)\n",
            "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.2.5)\n",
            "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.6.3)\n",
            "Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (2.23.0)\n",
            "Requirement already satisfied: lz4>=3.1.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (3.1.3)\n",
            "Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (3.1.4)\n",
            "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (5.4.1)\n",
            "Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (0.1.3)\n",
            "Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (4.6.2)\n",
            "Requirement already satisfied: trec-car-tools>=2.5.3 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (2.5.3)\n",
            "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.7/dist-packages (from ir_datasets) (1.19.5)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (1.24.3)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (3.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (2020.12.5)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->ir_datasets) (2.10)\n",
            "Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (1.0.0)\n",
            "Requirement already satisfied: typing>=3.6.2 in /usr/local/lib/python3.7/dist-packages (from trec-car-tools>=2.5.3->ir_datasets) (3.7.4.3)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7v_X6XqlrTan"
      },
      "source": [
        "## export\n",
        "\n",
        "The `ir_datasets export` command outputs data to stdout as TSV,\n",
        "JSON, and other formats.\n",
        "\n",
        "The command format is:\n",
        "\n",
        "```\n",
        "ir_datasets export <dataset-id> <entity-type>\n",
        "```\n",
        "\n",
        "with optional other arguments following entity-type.\n",
        "\n",
        "`<dataset-id>` is the dataset's identifier, found [in the catalog](https://ir-datasets.com/). `<entity-type>` is one of: `docs`, `queries`, `qrels`, `scoreddocs`."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "myI4M6OCsJQL"
      },
      "source": [
        "Let's start by getting the top 10 documents from the `cord19/trec-covid` collection. The first time you run the command, it will automatically download the dataset.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wt-QU7q1q-Mn",
        "outputId": "469d6b3f-4a0f-44db-a42b-ee8fef1232fe"
      },
      "source": [
        "!ir_datasets export cord19/trec-covid docs | head -n 10"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[INFO] No fields supplied. Using all fields: ('doc_id', 'title', 'doi', 'date', 'abstract')\n",
            "ug7v899j\tClinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia\t10.1186/1471-2334-1-6\t2001-07-04\tOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. \nCONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.\n",
            "02tnwd4m\tNitric oxide: a pro-inflammatory mediator in lung disease?\t10.1186/rr14\t2000-08-15\tInflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung.\n",
            "ejv2xln0\tSurfactant protein-D and pulmonary host defense\t10.1186/rr19\t2000-08-25\tSurfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.\n",
            "2b73a28n\tRole of endothelin-1 in lung disease\t10.1186/rr44\t2001-02-22\tEndothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.\n",
            "9785vg6d\tGene expression in epithelial cells in response to pneumovirus infection\t10.1186/rr61\t2001-05-11\tRespiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.\n",
            "zjufx4fo\tSequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis\t10.1093/emboj/20.24.7220\t2001-12-17\tNidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused to different sequences (‘bodies’) derived from the 3′ end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader–body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.\n",
            "5yhe786e\tDebate: Transfusing to normal haemoglobin levels will not improve outcome\t10.1186/cc987\t2001-03-08\tRecent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.\n",
            "8zchiykl\tThe 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001\t10.1186/cc1013\t2001-05-02\tThe 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\n",
            "8qnrcgnk\tHeme oxygenase-1 and carbon monoxide in pulmonary medicine\t10.1186/1465-9921-4-7\t2003-08-07\tHeme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.\n",
            "jg13scgo\tTechnical Description of RODS: A Real-time Public Health Surveillance System\t10.1197/jamia.m1345\t2003-09-01\tThis report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states—Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HTQaik0isguS"
      },
      "source": [
        "You can export in other formats too. Here's an example of exporting in JSON-Lines."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XaYh4lwLrTDZ",
        "outputId": "b50827d6-02e6-409c-bdcf-72dfbfdf1529"
      },
      "source": [
        "!ir_datasets export cord19/trec-covid docs --format jsonl | head -n 10"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{\"doc_id\": \"ug7v899j\", \"title\": \"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia\", \"doi\": \"10.1186/1471-2334-1-6\", \"date\": \"2001-07-04\", \"abstract\": \"OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. \\nCONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.\"}\n",
            "{\"doc_id\": \"02tnwd4m\", \"title\": \"Nitric oxide: a pro-inflammatory mediator in lung disease?\", \"doi\": \"10.1186/rr14\", \"date\": \"2000-08-15\", \"abstract\": \"Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO\\u2022) and increased indices of NO\\u2022 -dependent oxidative stress. Although NO\\u2022 is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO\\u2022 to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO\\u2022 -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO\\u2022 generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO\\u2022 to inflammatory diseases of the lung.\"}\n",
            "{\"doc_id\": \"ejv2xln0\", \"title\": \"Surfactant protein-D and pulmonary host defense\", \"doi\": \"10.1186/rr19\", \"date\": \"2000-08-25\", \"abstract\": \"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases.\"}\n",
            "{\"doc_id\": \"2b73a28n\", \"title\": \"Role of endothelin-1 in lung disease\", \"doi\": \"10.1186/rr44\", \"date\": \"2001-02-22\", \"abstract\": \"Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.\"}\n",
            "{\"doc_id\": \"9785vg6d\", \"title\": \"Gene expression in epithelial cells in response to pneumovirus infection\", \"doi\": \"10.1186/rr61\", \"date\": \"2001-05-11\", \"abstract\": \"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner.\"}\n",
            "{\"doc_id\": \"zjufx4fo\", \"title\": \"Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis\", \"doi\": \"10.1093/emboj/20.24.7220\", \"date\": \"2001-12-17\", \"abstract\": \"Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5\\u2032 end of the genome fused to different sequences (\\u2018bodies\\u2019) derived from the 3\\u2032 end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synthesis that resembles copy-choice RNA recombination. During this process, the nascent RNA strand is transferred from one site in the template to another, during either plus or minus strand synthesis, to yield subgenomic RNA molecules. Central to this process are transcription-regulating sequences (TRSs), which are present at both template sites and ensure the fidelity of strand transfer. Here we present results of a comprehensive co-variation mutagenesis study of equine arteritis virus TRSs, demonstrating that discontinuous RNA synthesis depends not only on base pairing between sense leader TRS and antisense body TRS, but also on the primary sequence of the body TRS. While the leader TRS merely plays a targeting role for strand transfer, the body TRS fulfils multiple functions. The sequences of mRNA leader\\u2013body junctions of TRS mutants strongly suggested that the discontinuous step occurs during minus strand synthesis.\"}\n",
            "{\"doc_id\": \"5yhe786e\", \"title\": \"Debate: Transfusing to normal haemoglobin levels will not improve outcome\", \"doi\": \"10.1186/cc987\", \"date\": \"2001-03-08\", \"abstract\": \"Recent evidence suggests that critically ill patients are able to tolerate lower levels of haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100 g/l does not improve mortality and other clinically important outcomes in a critical care setting. Although many questions remain, many laboratory and clinical studies, including a recent randomized controlled trial (RCT), have established that transfusing to normal haemoglobin concentrations does not improve organ failure and mortality in the critically ill patient. In addition, a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more efficient use of red blood cells (RBCs), save blood overall, and decrease health care costs.\"}\n",
            "{\"doc_id\": \"8zchiykl\", \"title\": \"The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001\", \"doi\": \"10.1186/cc1013\", \"date\": \"2001-05-02\", \"abstract\": \"The 21st International Symposium on Intensive Care and Emergency Medicine was dominated by the results of recent clinical trials in sepsis and acute respiratory distress syndrome (ARDS). The promise of extracorporeal liver replacement therapy and noninvasive ventilation were other areas of interest. Ethical issues also received attention. Overall, the 'state of the art' lectures, pro/con debates, seminars and tutorials were of a high standard. The meeting was marked by a sense of renewed enthusiasm that positive progress is occurring in intensive care medicine.\"}\n",
            "{\"doc_id\": \"8qnrcgnk\", \"title\": \"Heme oxygenase-1 and carbon monoxide in pulmonary medicine\", \"doi\": \"10.1186/1465-9921-4-7\", \"date\": \"2003-08-07\", \"abstract\": \"Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IX\\u03b1, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products. Remarkably, administration of CO at low concentrations can substitute for HO-1 with respect to anti-inflammatory and anti-apoptotic effects, suggesting a role for CO as a key mediator of HO-1 function. Chronic, low-level, exogenous exposure to CO from cigarette smoking contributes to the importance of CO in pulmonary medicine. The implications of the HO-1/CO system in pulmonary diseases will be discussed in this review, with an emphasis on inflammatory states.\"}\n",
            "{\"doc_id\": \"jg13scgo\", \"title\": \"Technical Description of RODS: A Real-time Public Health Surveillance System\", \"doi\": \"10.1197/jamia.m1345\", \"date\": \"2003-09-01\", \"abstract\": \"This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface that supports temporal and spatial analyses. RODS processes sales of over-the-counter health care products in a similar manner but receives such data in batch mode on a daily basis. RODS was used during the 2002 Winter Olympics and currently operates in two states\\u2014Pennsylvania and Utah. It has been and continues to be a resource for implementing, evaluating, and applying new methods of public health surveillance.\"}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hhVG2gp6sqdZ"
      },
      "source": [
        "If you do not want all the fields, you can specify which ones with `--fields`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kFI8UHbzq6Cu",
        "outputId": "a06ac0f5-2248-4c09-e5e6-f8b49f5cf29f"
      },
      "source": [
        "!ir_datasets export cord19/trec-covid docs --format jsonl --fields doc_id date | head -n 10"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{\"doc_id\": \"ug7v899j\", \"date\": \"2001-07-04\"}\n",
            "{\"doc_id\": \"02tnwd4m\", \"date\": \"2000-08-15\"}\n",
            "{\"doc_id\": \"ejv2xln0\", \"date\": \"2000-08-25\"}\n",
            "{\"doc_id\": \"2b73a28n\", \"date\": \"2001-02-22\"}\n",
            "{\"doc_id\": \"9785vg6d\", \"date\": \"2001-05-11\"}\n",
            "{\"doc_id\": \"zjufx4fo\", \"date\": \"2001-12-17\"}\n",
            "{\"doc_id\": \"5yhe786e\", \"date\": \"2001-03-08\"}\n",
            "{\"doc_id\": \"8zchiykl\", \"date\": \"2001-05-02\"}\n",
            "{\"doc_id\": \"8qnrcgnk\", \"date\": \"2003-08-07\"}\n",
            "{\"doc_id\": \"jg13scgo\", \"date\": \"2003-09-01\"}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WUjwx7i1s5HD"
      },
      "source": [
        "The export command works the same way for `queries`, `qrels`, and `scoreddocs` (where available). By default, `qrels` and `scoreddocs` output in the TREC format. But you can choose to export as tsv or jsonl as well."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JoeB2aresxAV",
        "outputId": "872f8a51-ceb2-4c29-84ba-eb503f58ce1d"
      },
      "source": [
        "!ir_datasets export cord19/trec-covid queries --fields query_id title | head -n 10"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "1\tcoronavirus origin\n",
            "2\tcoronavirus response to weather changes\n",
            "3\tcoronavirus immunity\n",
            "4\thow do people die from the coronavirus\n",
            "5\tanimal models of COVID-19\n",
            "6\tcoronavirus test rapid testing\n",
            "7\tserological tests for coronavirus\n",
            "8\tcoronavirus under reporting\n",
            "9\tcoronavirus in Canada\n",
            "10\tcoronavirus social distancing impact\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Me_hppfJtRxG",
        "outputId": "b2fdb388-7eea-4e47-f5ea-859e07fe1b74"
      },
      "source": [
        "!ir_datasets export cord19/trec-covid qrels | head -n 10"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "1 4.5 005b2j4b 2\n",
            "1 4 00fmeepz 1\n",
            "1 0.5 010vptx3 2\n",
            "1 2.5 0194oljo 1\n",
            "1 4 021q9884 1\n",
            "1 1 02f0opkr 1\n",
            "1 3.5 047xpt2c 0\n",
            "1 1 04ftw7k9 0\n",
            "1 1 05qglt1f 0\n",
            "1 3 05vx82oo 0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2zocHeB1tgKu"
      },
      "source": [
        "If you're savvy at the command line, piping can let you capture some dataset statistics pretty easily. Here's an example giving the label proportions using `awk`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vqCnPJOVtaWl",
        "outputId": "6b041b9e-9b85-47bc-91c1-1595c9d5968b"
      },
      "source": [
        "!ir_datasets export cord19/trec-covid qrels | awk '{a[$4]+=1; s+=1}END{for (x in a){print x, a[x], a[x]/s}}'"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "-1 2 2.88525e-05\n",
            "0 42652 0.615309\n",
            "1 11055 0.159482\n",
            "2 15609 0.22518\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IgE2qowjuZV8"
      },
      "source": [
        "## lookup\n",
        "\n",
        "You can look up documents by their ID with the `ir_datasets lookup` command. The command format is:\n",
        "\n",
        "```\n",
        "ir_datasets lookup <dataset-id> <doc-ids> ...\n",
        "```\n",
        "\n",
        "These lookups are generally O(1) and memory-efficient."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dBrzCdwbtug4",
        "outputId": "cd3522f7-3acf-450b-ae68-ac72ce9f0877"
      },
      "source": [
        "!ir_datasets lookup cord19/trec-covid 005b2j4b 00fmeepz 010vptx3"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[INFO] No fields supplied. Using all fields: ('doc_id', 'title', 'doi', 'date', 'abstract')\n",
            "005b2j4b\tMonophyletic Relationship between Severe Acute Respiratory Syndrome Coronavirus and Group 2 Coronaviruses\t10.1086/382892\t2004-05-01\tAlthough primary genomic analysis has revealed that severe acute respiratory syndrome coronavirus (SARS CoV) is a new type of coronavirus, the different protein trees published in previous reports have provided no conclusive evidence indicating the phylogenetic position of SARS CoV. To clarify the phylogenetic relationship between SARS CoV and other coronaviruses, we compiled a large data set composed of 7 concatenated protein sequences and performed comprehensive analyses, using the maximum-likelihood, Bayesian-inference, and maximum-parsimony methods. All resulting phylogenetic trees displayed an identical topology and supported the hypothesis that the relationship between SARS CoV and group 2 CoVs is monophyletic. Relationships among all major groups were well resolved and were supported by all statistical analyses.\n",
            "00fmeepz\tComprehensive overview of COVID-19 based on current evidence\t\t2020\tIn December 2019, twenty-seven pneumonia patients with unknown causes originated in South China seafood market in Wuhan. The virus infection spread rapidly and swept through China in less than a month. Subsequently, the virus was proven a novel coronavirus and named SARS-CoV-2. The outbreak of novel coronavirus has been determined as a Public Health Emergency of International Concern (PHEIC) by WHO on January 31, 2020. Similar to other coronaviruses like the Middle East Respiratory Syndrome (MERS) CoV and Severe Acute Respiratory Syndrome (SARS) CoV, the novel coronavirus was reported to spread via respiratory droplets and close contact from human to human, which means the virus is highly infectious and dangerous. Unfortunately, till now the virus has spread to over 200 countries/territories/areas around the world and the Coronavirus Disease 2019 (COVID-19) outbreak is continuing to grow. Currently, information sharing and transparency are essential for risk assessment and epidemic control in all endemic areas. In this article, we compared SARS-CoV-2 with SARS-CoV and influenza virus, discussed current researching progress of COVID-19, including clinical characteristics, pathological changes, treatment measures, and so on.\n",
            "010vptx3\tThe SARS, MERS and novel coronavirus (COVID-19) epidemics, the newest and biggest global health threats: what lessons have we learned?\t10.1093/ije/dyaa033\t2020-02-22\tOBJECTIVES: To provide an overview of the three major deadly coronaviruses and identify areas for improvement of future preparedness plans, as well as provide a critical assessment of the risk factors and actionable items for stopping their spread, utilizing lessons learned from the first two deadly coronavirus outbreaks, as well as initial reports from the current novel coronavirus (COVID-19) epidemic in Wuhan, China. METHODS: Utilizing the Centers for Disease Control and Prevention (CDC, USA) website, and a comprehensive review of PubMed literature, we obtained information regarding clinical signs and symptoms, treatment and diagnosis, transmission methods, protection methods and risk factors for Middle East Respiratory Syndrome (MERS), Severe Acute Respiratory Syndrome (SARS) and COVID-19. Comparisons between the viruses were made. RESULTS: Inadequate risk assessment regarding the urgency of the situation, and limited reporting on the virus within China has, in part, led to the rapid spread of COVID-19 throughout mainland China and into proximal and distant countries. Compared with SARS and MERS, COVID-19 has spread more rapidly, due in part to increased globalization and the focus of the epidemic. Wuhan, China is a large hub connecting the North, South, East and West of China via railways and a major international airport. The availability of connecting flights, the timing of the outbreak during the Chinese (Lunar) New Year, and the massive rail transit hub located in Wuhan has enabled the virus to perforate throughout China, and eventually, globally. CONCLUSIONS: We conclude that we did not learn from the two prior epidemics of coronavirus and were ill-prepared to deal with the challenges the COVID-19 epidemic has posed. 
Future research should attempt to address the uses and implications of internet of things (IoT) technologies for mapping the spread of infection.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "v6leewIGvYKf"
      },
      "source": [
        "You can also specify the fields to return."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GkVpdPsXvFIq",
        "outputId": "106acd40-9d69-495e-e9f8-7191b1c81d78"
      },
      "source": [
        "!ir_datasets lookup cord19/trec-covid 005b2j4b 00fmeepz 010vptx3 --fields doc_id title"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "005b2j4b\tMonophyletic Relationship between Severe Acute Respiratory Syndrome Coronavirus and Group 2 Coronaviruses\n",
            "00fmeepz\tComprehensive overview of COVID-19 based on current evidence\n",
            "010vptx3\tThe SARS, MERS and novel coronavirus (COVID-19) epidemics, the newest and biggest global health threats: what lessons have we learned?\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "
Download .txt
gitextract_2j6ggfs5/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── dataset-addition.md
│   │   ├── documentation.md
│   │   └── feature_request.md
│   └── workflows/
│       ├── deploy.yml
│       └── test.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples/
│   ├── adding_datasets.ipynb
│   ├── clirmatrix_example.py
│   ├── ir_datasets.ipynb
│   └── ir_datasets_cli.ipynb
├── ir_datasets/
│   ├── __init__.py
│   ├── __main__.py
│   ├── commands/
│   │   ├── __init__.py
│   │   ├── build_c4_checkpoints.py
│   │   ├── build_clueweb_warc_indexes.py
│   │   ├── build_download_cache.py
│   │   ├── clean.py
│   │   ├── doc_fifos.py
│   │   ├── export.py
│   │   ├── generate_metadata.py
│   │   ├── list.py
│   │   └── lookup.py
│   ├── datasets/
│   │   ├── __init__.py
│   │   ├── antique.py
│   │   ├── aol_ia.py
│   │   ├── aquaint.py
│   │   ├── argsme.py
│   │   ├── base.py
│   │   ├── beir.py
│   │   ├── c4.py
│   │   ├── car.py
│   │   ├── clinicaltrials.py
│   │   ├── clirmatrix.py
│   │   ├── clueweb09.py
│   │   ├── clueweb12.py
│   │   ├── codec.py
│   │   ├── codesearchnet.py
│   │   ├── cord19.py
│   │   ├── cranfield.py
│   │   ├── csl.py
│   │   ├── disks45.py
│   │   ├── dpr_w100.py
│   │   ├── gov.py
│   │   ├── gov2.py
│   │   ├── hc4.py
│   │   ├── highwire.py
│   │   ├── istella22.py
│   │   ├── kilt.py
│   │   ├── lotte.py
│   │   ├── medline.py
│   │   ├── miracl.py
│   │   ├── mmarco.py
│   │   ├── mr_tydi.py
│   │   ├── msmarco_document.py
│   │   ├── msmarco_document_v2.py
│   │   ├── msmarco_passage.py
│   │   ├── msmarco_passage_v2.py
│   │   ├── msmarco_qna.py
│   │   ├── nano_beir.py
│   │   ├── natural_questions.py
│   │   ├── neuclir.py
│   │   ├── neumarco.py
│   │   ├── nfcorpus.py
│   │   ├── nyt.py
│   │   ├── pmc.py
│   │   ├── sara.py
│   │   ├── touche.py
│   │   ├── touche_image.py
│   │   ├── trec_arabic.py
│   │   ├── trec_cast.py
│   │   ├── trec_fair.py
│   │   ├── trec_mandarin.py
│   │   ├── trec_robust04.py
│   │   ├── trec_spanish.py
│   │   ├── trec_tot.py
│   │   ├── trec_tot_2025.py
│   │   ├── tripclick.py
│   │   ├── tweets2013_ia.py
│   │   ├── vaswani.py
│   │   ├── wapo.py
│   │   ├── wikiclir.py
│   │   └── wikir.py
│   ├── docs/
│   │   ├── antique.yaml
│   │   ├── aol-ia.yaml
│   │   ├── aquaint.yaml
│   │   ├── argsme.yaml
│   │   ├── beir.yaml
│   │   ├── bibliography.bib
│   │   ├── c4.yaml
│   │   ├── car.yaml
│   │   ├── clinicaltrials.yaml
│   │   ├── clirmatrix.yaml
│   │   ├── clueweb09.yaml
│   │   ├── clueweb12.yaml
│   │   ├── codec.yaml
│   │   ├── codesearchnet.yaml
│   │   ├── cord19.yaml
│   │   ├── cranfield.yaml
│   │   ├── csl.yaml
│   │   ├── disks45.yaml
│   │   ├── dpr-w100.yaml
│   │   ├── gov.yaml
│   │   ├── gov2.yaml
│   │   ├── hc4.yaml
│   │   ├── highwire.yaml
│   │   ├── istella22.yaml
│   │   ├── kilt.yaml
│   │   ├── lotte.yaml
│   │   ├── medline.yaml
│   │   ├── miracl.yaml
│   │   ├── mmarco.yaml
│   │   ├── mr-tydi.yaml
│   │   ├── msmarco-document-v2.yaml
│   │   ├── msmarco-document.yaml
│   │   ├── msmarco-passage-v2.yaml
│   │   ├── msmarco-passage.yaml
│   │   ├── msmarco-qna.yaml
│   │   ├── nano-beir.yaml
│   │   ├── natural-questions.yaml
│   │   ├── neuclir.yaml
│   │   ├── neumarco.yaml
│   │   ├── nfcorpus.yaml
│   │   ├── nyt.yaml
│   │   ├── pmc.yaml
│   │   ├── sara.yaml
│   │   ├── touche-image.yaml
│   │   ├── touche.yaml
│   │   ├── trec-arabic.yaml
│   │   ├── trec-cast.yaml
│   │   ├── trec-fair.yaml
│   │   ├── trec-mandarin.yaml
│   │   ├── trec-robust04.yaml
│   │   ├── trec-spanish.yaml
│   │   ├── trec-tot-2025.yaml
│   │   ├── trec-tot.yaml
│   │   ├── tripclick.yaml
│   │   ├── tweets2013-ia.yaml
│   │   ├── vaswani.yaml
│   │   ├── wapo.yaml
│   │   ├── wikiclir.yaml
│   │   └── wikir.yaml
│   ├── etc/
│   │   ├── downloads.json
│   │   └── metadata.json
│   ├── formats/
│   │   ├── __init__.py
│   │   ├── argsme.py
│   │   ├── base.py
│   │   ├── clirmatrix.py
│   │   ├── csv_fmt.py
│   │   ├── extracted_cc.py
│   │   ├── jsonl.py
│   │   ├── ntcir.py
│   │   ├── touche.py
│   │   ├── touche_image.py
│   │   ├── trec.py
│   │   ├── tsv.py
│   │   └── webarc.py
│   ├── indices/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── cache_docstore.py
│   │   ├── clueweb_warc.py
│   │   ├── indexed_tsv_docstore.py
│   │   ├── lz4_pickle.py
│   │   ├── numpy_sorted_index.py
│   │   └── zpickle_docstore.py
│   ├── lazy_libs.py
│   ├── log.py
│   ├── util/
│   │   ├── __init__.py
│   │   ├── docs/
│   │   │   ├── __init__.py
│   │   │   ├── lazy.py
│   │   │   ├── multiple.py
│   │   │   └── subset.py
│   │   ├── download.py
│   │   ├── fileio.py
│   │   ├── hash.py
│   │   ├── html_parsing.py
│   │   ├── metadata.py
│   │   └── registry.py
│   └── wrappers/
│       ├── __init__.py
│       └── html_extractor.py
├── pyproject.toml
├── requirements-test.txt
├── requirements.txt
└── test/
    ├── __init__.py
    ├── downloads.py
    ├── dummy/
    │   ├── docs.tsv
    │   ├── qrels
    │   └── queries.tsv
    ├── formats/
    │   ├── __init__.py
    │   ├── test_trec.py
    │   └── test_tsv.py
    ├── indices/
    │   ├── __init__.py
    │   ├── lz4_pickle.py
    │   └── numpy_sorted.py
    ├── integration/
    │   ├── __init__.py
    │   ├── antique.py
    │   ├── aol_ia.py
    │   ├── aquaint.py
    │   ├── argsme.py
    │   ├── base.py
    │   ├── beir.py
    │   ├── c4.py
    │   ├── car.py
    │   ├── clinicaltrials.py
    │   ├── clirmatrix.py
    │   ├── clueweb09.py
    │   ├── clueweb12.py
    │   ├── codec.py
    │   ├── codesearchnet.py
    │   ├── cord19.py
    │   ├── cranfield.py
    │   ├── csl.py
    │   ├── disks45.py
    │   ├── dpr_w100.py
    │   ├── dummy.py
    │   ├── gov.py
    │   ├── gov2.py
    │   ├── hc4.py
    │   ├── highwire.py
    │   ├── istella22.py
    │   ├── kilt.py
    │   ├── lotte.py
    │   ├── medline.py
    │   ├── miracl.py
    │   ├── mmarco.py
    │   ├── mr_tydi.py
    │   ├── msmarco_document.py
    │   ├── msmarco_document_v2.py
    │   ├── msmarco_passage.py
    │   ├── msmarco_passage_v2.py
    │   ├── msmarco_qna.py
    │   ├── nano_beir.py
    │   ├── natural_questions.py
    │   ├── neuclir.py
    │   ├── neumarco.py
    │   ├── nfcorpus.py
    │   ├── nyt.py
    │   ├── pmc.py
    │   ├── sara.py
    │   ├── touche.py
    │   ├── touche_image.py
    │   ├── trec_arabic.py
    │   ├── trec_cast.py
    │   ├── trec_fair.py
    │   ├── trec_mandarin.py
    │   ├── trec_robust04.py
    │   ├── trec_spanish.py
    │   ├── trec_tot.py
    │   ├── trec_tot_2024.py
    │   ├── trec_tot_2025/
    │   │   ├── test_docs_iter.py
    │   │   ├── test_docs_store.py
    │   │   ├── test_qrel_iter.py
    │   │   └── test_queries_iter.py
    │   ├── tripclick.py
    │   ├── tweets2013_ia.py
    │   ├── vaswani.py
    │   ├── wapo.py
    │   ├── wikiclir.py
    │   └── wikir.py
    ├── metadata.py
    ├── test_defaulttext.py
    ├── util/
    │   └── docs/
    │       ├── __init__.py
    │       ├── data.py
    │       ├── test_multiple.py
    │       └── test_subset.py
    └── util.py
Download .txt
SYMBOL INDEX (2346 symbols across 176 files)

FILE: ir_datasets/__init__.py
  class EntityType (line 2) | class EntityType(Enum):
  function load (line 21) | def load(name):
  function parent_id (line 25) | def parent_id(dataset_id: str, entity_type: EntityType) -> str:
  function docs_parent_id (line 51) | def docs_parent_id(dataset_id: str) -> str:
  function queries_parent_id (line 56) | def queries_parent_id(dataset_id: str) -> str:
  function qrels_parent_id (line 60) | def qrels_parent_id(dataset_id: str) -> str:
  function scoreddocs_parent_id (line 64) | def scoreddocs_parent_id(dataset_id: str) -> str:
  function docpairs_parent_id (line 68) | def docpairs_parent_id(dataset_id: str) -> str:
  function qlogs_parent_id (line 72) | def qlogs_parent_id(dataset_id: str) -> str:
  function create_dataset (line 76) | def create_dataset(docs_tsv=None, queries_tsv=None, qrels_trec=None):
  function main (line 91) | def main(args):
  function main_cli (line 100) | def main_cli():

FILE: ir_datasets/commands/build_c4_checkpoints.py
  function process (line 16) | def process(args):
  function main (line 35) | def main(args):

FILE: ir_datasets/commands/build_clueweb_warc_indexes.py
  function process (line 11) | def process(args):
  function main (line 20) | def main(args):

FILE: ir_datasets/commands/build_download_cache.py
  function tmp_environ (line 15) | def tmp_environ(**kwargs):
  function _build_cache (line 31) | def _build_cache(data, dir, prefix=''):
  function main (line 60) | def main(args):

FILE: ir_datasets/commands/clean.py
  function walk_path (line 16) | def walk_path(start_path='.', skips=[]):
  function clean (line 33) | def clean(dataset, yes=False, list=False, human=True):
  function main (line 79) | def main(args):

FILE: ir_datasets/commands/doc_fifos.py
  function main (line 16) | def main(args):

FILE: ir_datasets/commands/export.py
  function main_docs (line 10) | def main_docs(dataset, args):
  function main_queries (line 19) | def main_queries(dataset, args):
  function main_qrels (line 28) | def main_qrels(dataset, args):
  function main_scoreddocs (line 37) | def main_scoreddocs(dataset, args):
  function main_docpairs (line 48) | def main_docpairs(dataset, args):
  class TsvExporter (line 57) | class TsvExporter:
    method __init__ (line 58) | def __init__(self, data_cls, out, fields=None):
    method next (line 81) | def next(self, record):
    method flush (line 98) | def flush(self):
  class JsonlExporter (line 102) | class JsonlExporter:
    method __init__ (line 103) | def __init__(self, data_cls, out, fields=None):
    method next (line 123) | def next(self, record):
    method encode (line 127) | def encode(self, value):
    method flush (line 134) | def flush(self):
  function is_tuple_elip (line 137) | def is_tuple_elip(annotation):
  class TrecQrelsExporter (line 144) | class TrecQrelsExporter:
    method __init__ (line 145) | def __init__(self, data_cls, out, fields=None):
    method next (line 163) | def next(self, record):
    method flush (line 167) | def flush(self):
  class TrecRunExporter (line 171) | class TrecRunExporter:
    method __init__ (line 172) | def __init__(self, data_cls, out, fields=None):
    method next (line 180) | def next(self, record):
    method flush (line 186) | def flush(self):
  function main (line 201) | def main(args):

FILE: ir_datasets/commands/generate_metadata.py
  function dataset2metadata (line 15) | def dataset2metadata(args):
  function write_metadata_file (line 38) | def write_metadata_file(data, file):
  function main (line 49) | def main(args):

FILE: ir_datasets/commands/list.py
  function main (line 10) | def main(args):

FILE: ir_datasets/commands/lookup.py
  function qid_lookup (line 10) | def qid_lookup(dataset, args):
  function did_lookup (line 23) | def did_lookup(dataset, args):
  function main (line 36) | def main(args):

FILE: ir_datasets/datasets/antique.py
  function _init (line 32) | def _init():

FILE: ir_datasets/datasets/aol_ia.py
  class LogItem (line 27) | class LogItem(NamedTuple):
  class AolQlog (line 33) | class AolQlog(NamedTuple):
  class AolIaDoc (line 41) | class AolIaDoc(NamedTuple):
    method default_text (line 47) | def default_text(self):
  class AolQlogs (line 54) | class AolQlogs(BaseQlogs):
    method __init__ (line 55) | def __init__(self, dlc):
    method qlogs_iter (line 58) | def qlogs_iter(self):
    method qlogs_cls (line 68) | def qlogs_cls(self):
    method qlogs_count (line 71) | def qlogs_count(self):
  class _ManagedDlc (line 75) | class _ManagedDlc:
    method __init__ (line 76) | def __init__(self, manager, path):
    method stream (line 81) | def stream(self):
    method path (line 86) | def path(self, force=True):
  class AolManager (line 92) | class AolManager:
    method __init__ (line 93) | def __init__(self, log_dlcs, id2wb_dlc, base_path):
    method docs_store (line 102) | def docs_store(self, options=DEFAULT_DOCSTORE_OPTIONS):
    method _internal_docs_store (line 106) | def _internal_docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTO...
    method _build_docs (line 111) | def _build_docs(self):
    method build (line 135) | def build(self):
    method file_ref (line 177) | def file_ref(self, path):
  function _init (line 182) | def _init():

FILE: ir_datasets/datasets/aquaint.py
  function _init (line 23) | def _init():

FILE: ir_datasets/datasets/argsme.py
  function _init (line 40) | def _init():

FILE: ir_datasets/datasets/base.py
  class Dataset (line 12) | class Dataset:
    method __init__ (line 13) | def __init__(self, *constituents):
    method __getstate__ (line 17) | def __getstate__(self):
    method __setstate__ (line 20) | def __setstate__(self, state):
    method __getattr__ (line 23) | def __getattr__(self, attr):
    method __repr__ (line 53) | def __repr__(self):
    method __dir__ (line 72) | def __dir__(self):
    method has (line 78) | def has(self, etype: ir_datasets.EntityType) -> bool:
    method has_docs (line 82) | def has_docs(self):
    method has_queries (line 85) | def has_queries(self):
    method has_qrels (line 88) | def has_qrels(self):
    method has_scoreddocs (line 91) | def has_scoreddocs(self):
    method has_docpairs (line 94) | def has_docpairs(self):
    method has_qlogs (line 97) | def has_qlogs(self):
  class _BetaPythonApiDocs (line 101) | class _BetaPythonApiDocs:
    method __init__ (line 102) | def __init__(self, handler):
    method __iter__ (line 108) | def __iter__(self):
    method __len__ (line 111) | def __len__(self):
    method __getitem__ (line 114) | def __getitem__(self, key):
    method __repr__ (line 117) | def __repr__(self):
    method lookup (line 120) | def lookup(self, doc_ids):
    method lookup_iter (line 127) | def lookup_iter(self, doc_ids):
    method metadata (line 136) | def metadata(self):
  class _BetaPythonApiQueries (line 140) | class _BetaPythonApiQueries:
    method __init__ (line 141) | def __init__(self, handler):
    method __iter__ (line 147) | def __iter__(self):
    method __repr__ (line 150) | def __repr__(self):
    method __len__ (line 153) | def __len__(self):
    method lookup (line 163) | def lookup(self, query_ids):
    method lookup_iter (line 170) | def lookup_iter(self, query_ids):
    method metadata (line 181) | def metadata(self):
  class _BetaPythonApiQrels (line 185) | class _BetaPythonApiQrels:
    method __init__ (line 186) | def __init__(self, handler):
    method __iter__ (line 192) | def __iter__(self):
    method __repr__ (line 195) | def __repr__(self):
    method asdict (line 198) | def asdict(self):
    method __len__ (line 203) | def __len__(self):
    method metadata (line 214) | def metadata(self):
  class _BetaPythonApiScoreddocs (line 218) | class _BetaPythonApiScoreddocs:
    method __init__ (line 219) | def __init__(self, handler):
    method __iter__ (line 223) | def __iter__(self):
    method __repr__ (line 226) | def __repr__(self):
    method __len__ (line 229) | def __len__(self):
    method metadata (line 238) | def metadata(self):
  class _BetaPythonApiDocpairs (line 242) | class _BetaPythonApiDocpairs:
    method __init__ (line 243) | def __init__(self, handler):
    method __iter__ (line 247) | def __iter__(self):
    method __repr__ (line 250) | def __repr__(self):
    method __len__ (line 253) | def __len__(self):
    method metadata (line 262) | def metadata(self):
  class _BetaPythonApiQlogs (line 266) | class _BetaPythonApiQlogs:
    method __init__ (line 267) | def __init__(self, handler):
    method __iter__ (line 271) | def __iter__(self):
    method __repr__ (line 274) | def __repr__(self):
    method __len__ (line 277) | def __len__(self):
    method metadata (line 286) | def metadata(self):
  class FilteredQueries (line 290) | class FilteredQueries(BaseQueries):
    method __init__ (line 291) | def __init__(self, queries_handler, lazy_qids, mode='include'):
    method queries_iter (line 296) | def queries_iter(self):
    method queries_cls (line 306) | def queries_cls(self):
    method queries_handler (line 309) | def queries_handler(self):
    method queries_lang (line 312) | def queries_lang(self):
  class FilteredQrels (line 316) | class FilteredQrels(BaseQrels):
    method __init__ (line 317) | def __init__(self, qrels_handler, lazy_qids, mode='include'):
    method qrels_iter (line 322) | def qrels_iter(self):
    method qrels_defs (line 332) | def qrels_defs(self):
    method qrels_handler (line 335) | def qrels_handler(self):
  class FilteredScoredDocs (line 339) | class FilteredScoredDocs(BaseScoredDocs):
    method __init__ (line 340) | def __init__(self, scoreddocs_handler, lazy_qids, mode='include'):
    method scoreddocs_iter (line 345) | def scoreddocs_iter(self):
    method scoreddocs_handler (line 355) | def scoreddocs_handler(self):
  class FilteredDocPairs (line 359) | class FilteredDocPairs(BaseDocPairs):
    method __init__ (line 360) | def __init__(self, docpairs_handler, lazy_qids, mode='include'):
    method docpairs_iter (line 365) | def docpairs_iter(self):
    method docpairs_handler (line 375) | def docpairs_handler(self):
  class YamlDocumentation (line 379) | class YamlDocumentation:
    method __init__ (line 380) | def __init__(self, file):
    method __call__ (line 384) | def __call__(self, key):
    method get_key (line 387) | def get_key(self, key):
  class YamlDocumentationProvider (line 395) | class YamlDocumentationProvider:
    method __init__ (line 396) | def __init__(self, documentation, key):
    method documentation (line 400) | def documentation(self):
  class Deprecated (line 407) | class Deprecated:
    method __init__ (line 408) | def __init__(self, message):
    method deprecated (line 411) | def deprecated(self):
  class ExpectedFile (line 415) | class ExpectedFile:
    method __init__ (line 416) | def __init__(self, path, expected_md5=None, instructions=None):
    method path (line 421) | def path(self, force=True):
    method stream (line 429) | def stream(self):
  class Concat (line 436) | class Concat(Dataset):
    method __getattr__ (line 437) | def __getattr__(self, attr):

FILE: ir_datasets/datasets/beir.py
  class BeirDoc (line 16) | class BeirDoc(NamedTuple):
    method default_text (line 21) | def default_text(self):
  class BeirTitleDoc (line 28) | class BeirTitleDoc(NamedTuple):
    method default_text (line 32) | def default_text(self):
  class BeirTitleUrlDoc (line 38) | class BeirTitleUrlDoc(NamedTuple):
    method default_text (line 43) | def default_text(self):
  class BeirSciDoc (line 49) | class BeirSciDoc(NamedTuple):
    method default_text (line 57) | def default_text(self):
  class BeirCordDoc (line 63) | class BeirCordDoc(NamedTuple):
    method default_text (line 69) | def default_text(self):
  class BeirToucheDoc (line 75) | class BeirToucheDoc(NamedTuple):
    method default_text (line 81) | def default_text(self):
  class BeirCqaDoc (line 87) | class BeirCqaDoc(NamedTuple):
    method default_text (line 92) | def default_text(self):
  class BeirUrlQuery (line 98) | class BeirUrlQuery(NamedTuple):
    method default_text (line 102) | def default_text(self):
  class BeirSciQuery (line 108) | class BeirSciQuery(NamedTuple):
    method default_text (line 115) | def default_text(self):
  class BeirToucheQuery (line 121) | class BeirToucheQuery(NamedTuple):
    method default_text (line 126) | def default_text(self):
  class BeirCovidQuery (line 132) | class BeirCovidQuery(NamedTuple):
    method default_text (line 137) | def default_text(self):
  class BeirCqaQuery (line 143) | class BeirCqaQuery(NamedTuple):
    method default_text (line 147) | def default_text(self):
  function _map_field (line 153) | def _map_field(field, data):
  class BeirDocs (line 163) | class BeirDocs(BaseDocs):
    method __init__ (line 164) | def __init__(self, name, dlc, doc_type):
    method docs_iter (line 170) | def docs_iter(self):
    method _docs_iter (line 173) | def _docs_iter(self):
    method docs_cls (line 179) | def docs_cls(self):
    method docs_store (line 182) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 193) | def docs_count(self):
    method docs_namespace (line 197) | def docs_namespace(self):
    method docs_lang (line 200) | def docs_lang(self):
  class BeirQueries (line 204) | class BeirQueries(BaseQueries):
    method __init__ (line 205) | def __init__(self, name, dlc, query_type):
    method queries_iter (line 211) | def queries_iter(self):
    method queries_cls (line 217) | def queries_cls(self):
    method queries_namespace (line 220) | def queries_namespace(self):
    method queries_lang (line 223) | def queries_lang(self):
  class BeirQrels (line 227) | class BeirQrels(BaseQrels):
    method __init__ (line 228) | def __init__(self, qrels_dlc, qrels_defs):
    method qrels_path (line 232) | def qrels_path(self):
    method qrels_iter (line 235) | def qrels_iter(self):
    method qrels_cls (line 249) | def qrels_cls(self):
    method qrels_defs (line 252) | def qrels_defs(self):
  function _init (line 256) | def _init():
  function qid_filter (line 333) | def qid_filter(subset_qrels):

FILE: ir_datasets/datasets/c4.py
  class C4Doc (line 20) | class C4Doc(NamedTuple):
    method default_text (line 25) | def default_text(self):
  class MisinfoQuery (line 32) | class MisinfoQuery(NamedTuple):
    method default_text (line 40) | def default_text(self):
  class C4Source (line 47) | class C4Source(DocSource):
    method __init__ (line 48) | def __init__(self, name, dlc, checkpoint_dlc, doc_count, checkpoint_fr...
    method __len__ (line 58) | def __len__(self):
    method __iter__ (line 61) | def __iter__(self):
    method checkpoints (line 64) | def checkpoints(self):
  class C4SourceIter (line 72) | class C4SourceIter(DocSourceSeekableIter):
    method __init__ (line 73) | def __init__(self, source):
    method close (line 78) | def close(self):
    method __next__ (line 83) | def __next__(self):
    method seek (line 92) | def seek(self, idx):
  class C4Docstore (line 114) | class C4Docstore(Docstore):
    method __init__ (line 115) | def __init__(self, docs, options=DEFAULT_DOCSTORE_OPTIONS):
    method get_many_iter (line 119) | def get_many_iter(self, doc_ids):
  class C4Docs (line 144) | class C4Docs(BaseDocs):
    method __init__ (line 145) | def __init__(self, sources_dlc, checkpoint_dlc, base_path, source_name...
    method docs_iter (line 154) | def docs_iter(self):
    method docs_cls (line 157) | def docs_cls(self):
    method docs_store (line 160) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 164) | def docs_count(self, force=False):
    method docs_namespace (line 168) | def docs_namespace(self):
    method docs_lang (line 171) | def docs_lang(self):
    method docs_source_iter (line 174) | def docs_source_iter(self):
    method _docs_sources (line 177) | def _docs_sources(self):
  function _init (line 208) | def _init():

FILE: ir_datasets/datasets/car.py
  class CarQuery (line 27) | class CarQuery(NamedTuple):
    method default_text (line 32) | def default_text(self):
  class CarDocs (line 39) | class CarDocs(BaseDocs):
    method __init__ (line 40) | def __init__(self, streamer, count_hint=None):
    method docs_iter (line 46) | def docs_iter(self):
    method docs_cls (line 53) | def docs_cls(self):
    method docs_store (line 56) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 67) | def docs_count(self):
    method docs_namespace (line 71) | def docs_namespace(self):
    method docs_lang (line 74) | def docs_lang(self):
  class CarQueries (line 77) | class CarQueries(BaseQueries):
    method __init__ (line 78) | def __init__(self, streamer):
    method queries_iter (line 82) | def queries_iter(self):
    method queries_namespace (line 93) | def queries_namespace(self):
    method queries_cls (line 96) | def queries_cls(self):
    method queries_lang (line 99) | def queries_lang(self):
  function _init (line 102) | def _init():

FILE: ir_datasets/datasets/clinicaltrials.py
  class ClinicalTrialsDoc (line 37) | class ClinicalTrialsDoc(NamedTuple):
  class ClinicalTrialsDocs (line 46) | class ClinicalTrialsDocs(BaseDocs):
    method __init__ (line 47) | def __init__(self, name, dlcs, compress_format='tgz', count_hint=None):
    method docs_iter (line 53) | def docs_iter(self):
    method _docs_iter (line 56) | def _docs_iter(self):
    method _parse_doc (line 76) | def _parse_doc(self, xml):
    method docs_path (line 93) | def docs_path(self, force=True):
    method docs_store (line 96) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_cls (line 107) | def docs_cls(self):
    method docs_namespace (line 110) | def docs_namespace(self):
    method docs_count (line 113) | def docs_count(self):
    method docs_lang (line 117) | def docs_lang(self):
  function _init (line 121) | def _init():

FILE: ir_datasets/datasets/clirmatrix.py
  function _init (line 24) | def _init():

FILE: ir_datasets/datasets/clueweb09.py
  class TrecWebTrackQuery (line 37) | class TrecWebTrackQuery(NamedTuple):
    method default_text (line 43) | def default_text(self):
  class ClueWeb09Docs (line 50) | class ClueWeb09Docs(WarcDocs):
    method __init__ (line 51) | def __init__(self, docs_dlc, chk_dlc, dirs=None, lang=None):
    method docs_path (line 59) | def docs_path(self, force=True):
    method _docs_iter_source_files (line 62) | def _docs_iter_source_files(self):
    method _docs_id_to_source_file (line 70) | def _docs_id_to_source_file(self, doc_id):
    method _docs_source_file_to_checkpoint (line 85) | def _docs_source_file_to_checkpoint(self, source_file):
    method _docs_warc_file_counts (line 94) | def _docs_warc_file_counts(self):
    method docs_namespace (line 111) | def docs_namespace(self):
  class CatBQrelFilter (line 115) | class CatBQrelFilter(BaseQrels):
    method __init__ (line 116) | def __init__(self, qrels_handler):
    method qrels_iter (line 119) | def qrels_iter(self):
    method qrels_defs (line 126) | def qrels_defs(self):
    method qrels_cls (line 129) | def qrels_cls(self):
    method qrels_path (line 132) | def qrels_path(self):
  function _init (line 136) | def _init():

FILE: ir_datasets/datasets/clueweb12.py
  class TrecWebTrackQuery (line 57) | class TrecWebTrackQuery(NamedTuple):
    method default_text (line 63) | def default_text(self):
  class NtcirQuery (line 70) | class NtcirQuery(NamedTuple):
    method default_text (line 74) | def default_text(self):
  class MisinfoQuery (line 81) | class MisinfoQuery(NamedTuple):
    method default_text (line 87) | def default_text(self):
  class MisinfoQrel (line 94) | class MisinfoQrel(NamedTuple):
  class EhealthQrel (line 102) | class EhealthQrel(NamedTuple):
  class MsinfoQrels (line 111) | class MsinfoQrels(TrecQrels):
    method qrels_iter (line 112) | def qrels_iter(self):
    method qrels_cls (line 124) | def qrels_cls(self):
  class EhealthQrels (line 128) | class EhealthQrels(TrecQrels):
    method __init__ (line 129) | def __init__(self, qrels_dlcs, qtrust_dlcs, qunder_dlcs, qrels_defs, q...
    method qrels_iter (line 136) | def qrels_iter(self):
    method qrels_cls (line 154) | def qrels_cls(self):
  class FixAmp (line 158) | class FixAmp:
    method __init__ (line 159) | def __init__(self, streamer):
    method stream (line 162) | def stream(self):
    method __iter__ (line 165) | def __iter__(self):
  class ClueWeb12Docs (line 171) | class ClueWeb12Docs(WarcDocs):
    method __init__ (line 172) | def __init__(self, docs_dlc, chk_dlc=None):
    method docs_path (line 178) | def docs_path(self, force=True):
    method _docs_iter_source_files (line 181) | def _docs_iter_source_files(self):
    method _docs_id_to_source_file (line 186) | def _docs_id_to_source_file(self, doc_id):
    method _docs_source_file_to_checkpoint (line 195) | def _docs_source_file_to_checkpoint(self, source_file):
    method _docs_warc_file_counts (line 206) | def _docs_warc_file_counts(self):
    method docs_namespace (line 219) | def docs_namespace(self):
  class ClueWeb12b13Extractor (line 223) | class ClueWeb12b13Extractor:
    method __init__ (line 224) | def __init__(self, docs_dlc, extract_jar_dlc):
    method path (line 228) | def path(self, force=True):
    method _create_record_counts_if_needed (line 245) | def _create_record_counts_if_needed(self, path):
    method stream (line 265) | def stream(self):
  function _init (line 269) | def _init():

FILE: ir_datasets/datasets/codec.py
  class CodecDoc (line 25) | class CodecDoc(NamedTuple):
    method default_text (line 30) | def default_text(self):
  class CodecQuery (line 33) | class CodecQuery(NamedTuple):
    method default_text (line 38) | def default_text(self):
  class CodecQueries (line 45) | class CodecQueries(BaseQueries):
    method __init__ (line 46) | def __init__(self, streamer, qid_filter=None):
    method queries_iter (line 51) | def queries_iter(self):
    method queries_cls (line 58) | def queries_cls(self):
    method queries_namespace (line 61) | def queries_namespace(self):
    method queries_lang (line 64) | def queries_lang(self):
  function filter_qids (line 68) | def filter_qids(domain, queries_handler):
  function _init (line 72) | def _init():

FILE: ir_datasets/datasets/codesearchnet.py
  class CodeSearchNetDoc (line 32) | class CodeSearchNetDoc(NamedTuple):
  class CodeSearchNetChallengeQrel (line 41) | class CodeSearchNetChallengeQrel(NamedTuple):
  class CodeSearchNetDocs (line 48) | class CodeSearchNetDocs(BaseDocs):
    method __init__ (line 49) | def __init__(self, docs_dlcs):
    method docs_iter (line 54) | def docs_iter(self):
    method docs_cls (line 70) | def docs_cls(self):
    method docs_store (line 73) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 84) | def docs_count(self):
    method docs_namespace (line 88) | def docs_namespace(self):
    method docs_lang (line 91) | def docs_lang(self):
  class CodeSearchNetQueries (line 95) | class CodeSearchNetQueries(BaseQueries):
    method __init__ (line 96) | def __init__(self, queries_dlcs, split):
    method queries_iter (line 101) | def queries_iter(self):
    method queries_cls (line 113) | def queries_cls(self):
    method queries_namespace (line 116) | def queries_namespace(self):
    method queries_lang (line 119) | def queries_lang(self):
  class CodeSearchNetQrels (line 123) | class CodeSearchNetQrels(BaseQrels):
    method __init__ (line 124) | def __init__(self, qrels_dlcs, split):
    method qrels_iter (line 129) | def qrels_iter(self):
    method qrels_cls (line 143) | def qrels_cls(self):
    method qrels_defs (line 146) | def qrels_defs(self):
    method queries_lang (line 149) | def queries_lang(self):
  class CodeSearchNetChallengeQueries (line 153) | class CodeSearchNetChallengeQueries(BaseQueries):
    method __init__ (line 154) | def __init__(self, queries_dlc):
    method queries_path (line 158) | def queries_path(self):
    method queries_iter (line 161) | def queries_iter(self):
    method queries_cls (line 169) | def queries_cls(self):
    method queries_namespace (line 172) | def queries_namespace(self):
  class CodeSearchNetChallengeQrels (line 176) | class CodeSearchNetChallengeQrels(BaseQrels):
    method __init__ (line 177) | def __init__(self, qrels_dlc, queries_handler):
    method qrels_path (line 182) | def qrels_path(self):
    method qrels_iter (line 185) | def qrels_iter(self):
    method qrels_cls (line 196) | def qrels_cls(self):
    method qrels_defs (line 199) | def qrels_defs(self):
  function _init (line 203) | def _init():

FILE: ir_datasets/datasets/cord19.py
  class Cord19Doc (line 24) | class Cord19Doc(NamedTuple):
    method default_text (line 30) | def default_text(self):
  class Cord19FullTextSection (line 37) | class Cord19FullTextSection(NamedTuple):
  class Cord19FullTextDoc (line 42) | class Cord19FullTextDoc(NamedTuple):
    method default_text (line 49) | def default_text(self):
  class Cord19Docs (line 71) | class Cord19Docs(BaseDocs):
    method __init__ (line 72) | def __init__(self, streamer, extr_path, date, include_fulltext=False, ...
    method docs_path (line 79) | def docs_path(self, force=True):
    method docs_cls (line 85) | def docs_cls(self):
    method docs_iter (line 88) | def docs_iter(self):
    method _docs_iter (line 91) | def _docs_iter(self):
    method docs_store (line 161) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 172) | def docs_count(self):
    method docs_namespace (line 176) | def docs_namespace(self):
    method docs_lang (line 179) | def docs_lang(self):
  function _init (line 183) | def _init():

FILE: ir_datasets/datasets/cranfield.py
  class CranfieldDoc (line 25) | class CranfieldDoc(NamedTuple):
    method default_text (line 31) | def default_text(self):
  function prefix_sentinel_splitter (line 38) | def prefix_sentinel_splitter(it, sentinel):
  class CranfieldDocs (line 48) | class CranfieldDocs(BaseDocs):
    method __init__ (line 49) | def __init__(self, docs_dlc):
    method docs_path (line 53) | def docs_path(self, force=True):
    method docs_iter (line 57) | def docs_iter(self):
    method docs_cls (line 77) | def docs_cls(self):
    method docs_store (line 80) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 91) | def docs_count(self):
    method docs_namespace (line 95) | def docs_namespace():
    method docs_lang (line 98) | def docs_lang(self):
  class CranfieldQueries (line 102) | class CranfieldQueries(BaseQueries):
    method __init__ (line 103) | def __init__(self, queries_dlc):
    method queries_path (line 107) | def queries_path(self):
    method queries_iter (line 110) | def queries_iter(self):
    method queries_cls (line 127) | def queries_cls(self):
    method queries_namespace (line 130) | def queries_namespace(self):
    method queries_lang (line 133) | def queries_lang(self):
  class CranfieldQrels (line 137) | class CranfieldQrels(BaseQrels):
    method __init__ (line 138) | def __init__(self, qrels_dlc):
    method qrels_path (line 141) | def qrels_path(self):
    method qrels_iter (line 144) | def qrels_iter(self):
    method qrels_cls (line 154) | def qrels_cls(self):
    method qrels_defs (line 157) | def qrels_defs(self):
  function _init (line 161) | def _init():

FILE: ir_datasets/datasets/csl.py
  class CslDoc (line 15) | class CslDoc(NamedTuple):
    method default_text (line 24) | def default_text(self):
  function _init (line 34) | def _init():

FILE: ir_datasets/datasets/disks45.py
  function _init (line 35) | def _init():
  function make_filter (line 98) | def make_filter(fold):

FILE: ir_datasets/datasets/dpr_w100.py
  class DprW100Doc (line 23) | class DprW100Doc(NamedTuple):
    method default_text (line 27) | def default_text(self):
  class DprW100Query (line 34) | class DprW100Query(NamedTuple):
    method default_text (line 38) | def default_text(self):
  class DprW100Manager (line 45) | class DprW100Manager:
    method __init__ (line 46) | def __init__(self, dlc, base_path, passage_id_key='passage_id'):
    method build (line 52) | def build(self):
    method file_ref (line 85) | def file_ref(self, path):
  class _ManagedDlc (line 89) | class _ManagedDlc:
    method __init__ (line 90) | def __init__(self, manager, path):
    method stream (line 95) | def stream(self):
    method path (line 100) | def path(self, force=True):
  class DprW100Queries (line 106) | class DprW100Queries(BaseQueries):
    method __init__ (line 107) | def __init__(self, dlc):
    method queries_iter (line 110) | def queries_iter(self):
    method queries_cls (line 116) | def queries_cls(self):
    method queries_namespace (line 119) | def queries_namespace(self):
    method queries_lang (line 122) | def queries_lang(self):
  function _init (line 126) | def _init():

FILE: ir_datasets/datasets/gov.py
  class GovWeb02Query (line 49) | class GovWeb02Query(NamedTuple):
    method default_text (line 53) | def default_text(self):
  class GovDoc (line 60) | class GovDoc(NamedTuple):
    method default_text (line 66) | def default_text(self):
  class GovDocs (line 70) | class GovDocs(BaseDocs):
    method __init__ (line 71) | def __init__(self, docs_dlc):
    method docs_path (line 75) | def docs_path(self, force=True):
    method docs_iter (line 78) | def docs_iter(self):
    method _docs_iter (line 81) | def _docs_iter(self):
    method docs_cls (line 87) | def docs_cls(self):
    method _docs_ctxt_iter_gov (line 90) | def _docs_ctxt_iter_gov(self, gov2f):
    method _process_gov_doc (line 106) | def _process_gov_doc(self, raw_doc):
    method _extract_next_block (line 129) | def _extract_next_block(self, inp, START, END):
    method docs_store (line 138) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 149) | def docs_count(self):
    method docs_namespace (line 153) | def docs_namespace(self):
    method docs_lang (line 156) | def docs_lang(self):
  function _init (line 160) | def _init():

FILE: ir_datasets/datasets/gov2.py
  class Gov2Doc (line 43) | class Gov2Doc(NamedTuple):
    method default_text (line 49) | def default_text(self):
  class Gov2DocIter (line 54) | class Gov2DocIter:
    method __init__ (line 55) | def __init__(self, gov2_docs, slice):
    method __next__ (line 64) | def __next__(self):
    method close (line 93) | def close(self):
    method __iter__ (line 96) | def __iter__(self):
    method __del__ (line 99) | def __del__(self):
    method __getitem__ (line 102) | def __getitem__(self, key):
  class Gov2Docs (line 118) | class Gov2Docs(BaseDocs):
    method __init__ (line 119) | def __init__(self, docs_dlc, doccount_dlc):
    method docs_path (line 125) | def docs_path(self, force=True):
    method _docs_iter_source_files (line 128) | def _docs_iter_source_files(self):
    method docs_iter (line 134) | def docs_iter(self):
    method docs_cls (line 137) | def docs_cls(self):
    method _docs_ctxt_iter_gov2 (line 140) | def _docs_ctxt_iter_gov2(self, gov2f):
    method _process_gov2_doc (line 156) | def _process_gov2_doc(self, raw_doc):
    method _extract_next_block (line 179) | def _extract_next_block(self, inp, START, END):
    method _docs_id_to_source_file (line 188) | def _docs_id_to_source_file(self, doc_id):
    method _docs_file_counts (line 196) | def _docs_file_counts(self):
    method docs_store (line 208) | def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTI...
    method docs_count (line 212) | def docs_count(self):
    method docs_namespace (line 215) | def docs_namespace(self):
    method docs_lang (line 218) | def docs_lang(self):
  class Gov2Docstore (line 222) | class Gov2Docstore(Docstore):
    method __init__ (line 223) | def __init__(self, gov2_docs, options=ir_datasets.indices.DEFAULT_DOCS...
    method get_many_iter (line 227) | def get_many_iter(self, doc_ids):
  class RewriteQids (line 246) | class RewriteQids(BaseQrels):
    method __init__ (line 247) | def __init__(self, base_qrels, qid_map):
    method qrels_iter (line 251) | def qrels_iter(self):
    method qrels_defs (line 258) | def qrels_defs(self):
    method qrels_path (line 261) | def qrels_path(self):
    method qrels_cls (line 264) | def qrels_cls(self):
  class Gov2DocCountFile (line 268) | class Gov2DocCountFile:
    method __init__ (line 269) | def __init__(self, path, docs_dlc):
    method path (line 273) | def path(self, force=True):
    method stream (line 291) | def stream(self):
  function _init (line 295) | def _init():

FILE: ir_datasets/datasets/hc4.py
  function _init (line 22) | def _init():

FILE: ir_datasets/datasets/highwire.py
  class HighwireSpan (line 28) | class HighwireSpan(NamedTuple):
  class HighwireDoc (line 34) | class HighwireDoc(NamedTuple):
    method default_text (line 39) | def default_text(self):
  class TrecGenomicsQrel (line 46) | class TrecGenomicsQrel(NamedTuple):
  class HighwireQrel (line 54) | class HighwireQrel(NamedTuple):
  class HighwireDocs (line 62) | class HighwireDocs(BaseDocs):
    method __init__ (line 63) | def __init__(self, dlcs, legalspans_dlc):
    method docs_iter (line 67) | def docs_iter(self):
    method _docs_iter (line 70) | def _docs_iter(self):
    method docs_path (line 101) | def docs_path(self, force=True):
    method docs_store (line 104) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_cls (line 115) | def docs_cls(self):
    method docs_namespace (line 118) | def docs_namespace(self):
    method docs_count (line 121) | def docs_count(self):
    method docs_lang (line 125) | def docs_lang(self):
  class TrecGenomicsQueries (line 129) | class TrecGenomicsQueries(BaseQueries):
    method __init__ (line 130) | def __init__(self, queries_dlc):
    method queries_iter (line 133) | def queries_iter(self):
    method queries_cls (line 142) | def queries_cls(self):
    method queries_namespace (line 145) | def queries_namespace(self):
    method queries_lang (line 148) | def queries_lang(self):
  class HighwireQrels (line 152) | class HighwireQrels(BaseQrels):
    method __init__ (line 153) | def __init__(self, qrels_dlc, qrel_defs):
    method qrels_iter (line 157) | def qrels_iter(self):
    method qrels_defs (line 172) | def qrels_defs(self):
    method qrels_path (line 175) | def qrels_path(self):
    method qrels_cls (line 178) | def qrels_cls(self):
  function _init (line 182) | def _init():

FILE: ir_datasets/datasets/istella22.py
  class Istella22Doc (line 13) | class Istella22Doc(NamedTuple):
    method default_text (line 21) | def default_text(self):
  function _init (line 34) | def _init():
  function fold_qids_factory (line 71) | def fold_qids_factory(fold, base_dlc):

FILE: ir_datasets/datasets/kilt.py
  class KiltDocAnchor (line 25) | class KiltDocAnchor(NamedTuple):
  class KiltDoc (line 33) | class KiltDoc(NamedTuple):
    method default_text (line 46) | def default_text(self):
  function strip_markup (line 53) | def strip_markup(text):
  class KiltDocs (line 61) | class KiltDocs(BaseDocs):
    method __init__ (line 62) | def __init__(self, streamer, count_hint=None):
    method docs_iter (line 68) | def docs_iter(self):
    method docs_cls (line 90) | def docs_cls(self):
    method docs_store (line 93) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 104) | def docs_count(self):
    method docs_namespace (line 108) | def docs_namespace(self):
    method docs_lang (line 111) | def docs_lang(self):
    method docs_kilt_raw_iter (line 114) | def docs_kilt_raw_iter(self):
  function _init (line 120) | def _init():

FILE: ir_datasets/datasets/lotte.py
  class LotteQrels (line 18) | class LotteQrels(BaseQrels):
    method __init__ (line 19) | def __init__(self, qrels_dlc):
    method qrels_path (line 22) | def qrels_path(self):
    method qrels_iter (line 25) | def qrels_iter(self):
    method qrels_cls (line 32) | def qrels_cls(self):
    method qrels_defs (line 35) | def qrels_defs(self):
  function _init (line 39) | def _init():

FILE: ir_datasets/datasets/medline.py
  class MedlineDoc (line 35) | class MedlineDoc(NamedTuple):
    method default_text (line 39) | def default_text(self):
  class TrecGenomicsQuery (line 46) | class TrecGenomicsQuery(NamedTuple):
    method default_text (line 51) | def default_text(self):
  class TrecPm2017Query (line 58) | class TrecPm2017Query(NamedTuple):
    method default_text (line 64) | def default_text(self):
  class TrecPmQuery (line 71) | class TrecPmQuery(NamedTuple):
    method default_text (line 76) | def default_text(self):
  class ConcatFile (line 83) | class ConcatFile:
    method __init__ (line 88) | def __init__(self, files):
    method read (line 91) | def read(self, count=None):
  class MedlineDocs (line 100) | class MedlineDocs(BaseDocs):
    method __init__ (line 101) | def __init__(self, name, dlcs, count_hint=None):
    method docs_iter (line 107) | def docs_iter(self):
    method docs_path (line 141) | def docs_path(self, force=True):
    method docs_store (line 144) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_cls (line 156) | def docs_cls(self):
    method docs_namespace (line 159) | def docs_namespace(self):
    method docs_count (line 162) | def docs_count(self):
    method docs_lang (line 166) | def docs_lang(self):
  class AacrAscoDocs (line 170) | class AacrAscoDocs(BaseDocs):
    method __init__ (line 171) | def __init__(self, dlc):
    method docs_iter (line 175) | def docs_iter(self):
    method docs_path (line 194) | def docs_path(self, force=True):
    method docs_store (line 197) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_cls (line 207) | def docs_cls(self):
    method docs_namespace (line 210) | def docs_namespace(self):
    method docs_count (line 213) | def docs_count(self):
    method docs_lang (line 217) | def docs_lang(self):
  class ConcatDocs (line 221) | class ConcatDocs(BaseDocs):
    method __init__ (line 222) | def __init__(self, docs, count_hint=None):
    method docs_iter (line 226) | def docs_iter(self):
    method docs_iter (line 230) | def docs_iter(self):
    method docs_path (line 234) | def docs_path(self, force=True):
    method docs_store (line 237) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_cls (line 248) | def docs_cls(self):
    method docs_namespace (line 251) | def docs_namespace(self):
    method docs_lang (line 254) | def docs_lang(self):
    method docs_count (line 257) | def docs_count(self):
  function _init (line 262) | def _init():

FILE: ir_datasets/datasets/miracl.py
  class MiraclDoc (line 16) | class MiraclDoc(NamedTuple):
    method default_text (line 20) | def default_text(self):
  function _init (line 24) | def _init():

FILE: ir_datasets/datasets/mmarco.py
  function _init (line 19) | def _init():

FILE: ir_datasets/datasets/mr_tydi.py
  class MrTydiDocs (line 20) | class MrTydiDocs(BaseDocs):
    method __init__ (line 21) | def __init__(self, dlc, lang, count_hint=None):
    method docs_iter (line 28) | def docs_iter(self):
    method docs_cls (line 34) | def docs_cls(self):
    method docs_store (line 37) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 48) | def docs_count(self):
    method docs_namespace (line 52) | def docs_namespace(self):
    method docs_lang (line 55) | def docs_lang(self):
  function _init (line 59) | def _init():

FILE: ir_datasets/datasets/msmarco_document.py
  class MsMarcoDocument (line 30) | class MsMarcoDocument(NamedTuple):
    method default_text (line 35) | def default_text(self):
  class MsMarcoTrecDocs (line 43) | class MsMarcoTrecDocs(TrecDocs):
    method __init__ (line 44) | def __init__(self, docs_dlc):
    method docs_iter (line 48) | def docs_iter(self):
    method docs_cls (line 60) | def docs_cls(self):
    method docs_namespace (line 63) | def docs_namespace(self):
  class MsMarcoAnchorTextDocument (line 67) | class MsMarcoAnchorTextDocument(NamedTuple):
    method default_text (line 71) | def default_text(self):
  class MsMarcoAnchorTextDocs (line 78) | class MsMarcoAnchorTextDocs(BaseDocs):
    method __init__ (line 79) | def __init__(self, dlc, count_hint):
    method docs_iter (line 85) | def docs_iter(self):
    method docs_cls (line 91) | def docs_cls(self):
    method docs_store (line 94) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 105) | def docs_count(self):
    method docs_namespace (line 109) | def docs_namespace(self):
    method docs_lang (line 112) | def docs_lang(self):
  function _init (line 116) | def _init():

FILE: ir_datasets/datasets/msmarco_document_v2.py
  class MsMarcoV2Document (line 25) | class MsMarcoV2Document(NamedTuple):
    method default_text (line 31) | def default_text(self):
  class MsMarcoV2Docs (line 38) | class MsMarcoV2Docs(BaseDocs):
    method __init__ (line 39) | def __init__(self, dlc):
    method docs_iter (line 44) | def docs_iter(self):
    method docs_cls (line 61) | def docs_cls(self):
    method docs_store (line 64) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 94) | def docs_count(self):
    method docs_namespace (line 98) | def docs_namespace(self):
    method docs_lang (line 101) | def docs_lang(self):
  class MsMarcoV2AnchorTextDocument (line 106) | class MsMarcoV2AnchorTextDocument(NamedTuple):
    method default_text (line 110) | def default_text(self):
  class MsMarcoV2AnchorTextDocs (line 117) | class MsMarcoV2AnchorTextDocs(BaseDocs):
    method __init__ (line 118) | def __init__(self, dlc, count_hint):
    method docs_iter (line 124) | def docs_iter(self):
    method docs_cls (line 130) | def docs_cls(self):
    method docs_store (line 133) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 144) | def docs_count(self):
    method docs_namespace (line 148) | def docs_namespace(self):
    method docs_lang (line 151) | def docs_lang(self):
  function _init (line 155) | def _init():

FILE: ir_datasets/datasets/msmarco_passage.py
  class ExtractQidPid (line 44) | class ExtractQidPid:
    method __init__ (line 45) | def __init__(self, streamer):
    method stream (line 48) | def stream(self):
    method __iter__ (line 51) | def __iter__(self):
  class FixEncoding (line 63) | class FixEncoding:
    method __init__ (line 64) | def __init__(self, streamer):
    method stream (line 67) | def stream(self):
    method __iter__ (line 70) | def __iter__(self):
  class MapSmallTriplesQidPid (line 104) | class MapSmallTriplesQidPid:
    method __init__ (line 105) | def __init__(self, streamer, corpus_stream, queries_handler):
    method stream (line 110) | def stream(self):
    method __iter__ (line 113) | def __iter__(self):
  function _init (line 157) | def _init():

FILE: ir_datasets/datasets/msmarco_passage_v2.py
  class MsMarcoV2Passage (line 26) | class MsMarcoV2Passage(NamedTuple):
    method default_text (line 31) | def default_text(self):
  function parse_msmarco_passage (line 38) | def parse_msmarco_passage(line):
  class MsMarcoV2Passages (line 49) | class MsMarcoV2Passages(BaseDocs):
    method __init__ (line 50) | def __init__(self, dlc, pos_dlc=None):
    method docs_iter (line 56) | def docs_iter(self):
    method docs_cls (line 72) | def docs_cls(self):
    method docs_store (line 75) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 80) | def docs_count(self):
    method docs_namespace (line 84) | def docs_namespace(self):
    method docs_lang (line 87) | def docs_lang(self):
    method docs_path (line 90) | def docs_path(self, force=True):
  class MsMarcoV2DocStore (line 94) | class MsMarcoV2DocStore(ir_datasets.indices.Docstore):
    method __init__ (line 95) | def __init__(self, docs_handler, options=DEFAULT_DOCSTORE_OPTIONS):
    method get_many_iter (line 109) | def get_many_iter(self, keys):
    method build (line 142) | def build(self):
    method built (line 167) | def built(self):
    method __iter__ (line 170) | def __iter__(self):
    method _iter_source_files (line 174) | def _iter_source_files(self):
    method count (line 178) | def count(self):
  class MsMarcoV2PassageIter (line 185) | class MsMarcoV2PassageIter:
    method __init__ (line 186) | def __init__(self, docstore, slice):
    method __next__ (line 197) | def __next__(self):
    method close (line 229) | def close(self):
    method __iter__ (line 236) | def __iter__(self):
    method __del__ (line 239) | def __del__(self):
    method __getitem__ (line 242) | def __getitem__(self, key):
  function _init (line 258) | def _init():

FILE: ir_datasets/datasets/msmarco_qna.py
  class MsMarcoQnAQuery (line 29) | class MsMarcoQnAQuery(NamedTuple):
    method default_text (line 34) | def default_text(self):
  class MsMarcoQnAEvalQuery (line 41) | class MsMarcoQnAEvalQuery(NamedTuple):
    method default_text (line 45) | def default_text(self):
  class MsMarcoQnADoc (line 52) | class MsMarcoQnADoc(NamedTuple):
    method default_text (line 58) | def default_text(self):
  class MsMarcoQnAManager (line 96) | class MsMarcoQnAManager:
    method __init__ (line 97) | def __init__(self, train_dlc, dev_dlc, eval_dlc, base_path):
    method docs_store (line 104) | def docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTORE_OPTIONS):
    method _internal_docs_store (line 108) | def _internal_docs_store(self, options: DocstoreOptions=DEFAULT_DOCSTO...
    method build (line 113) | def build(self):
    method file_ref (line 281) | def file_ref(self, path):
  class _ManagedDlc (line 285) | class _ManagedDlc:
    method __init__ (line 286) | def __init__(self, manager, path):
    method stream (line 291) | def stream(self):
    method path (line 296) | def path(self, force=True):
  function _init (line 302) | def _init():

FILE: ir_datasets/datasets/nano_beir.py
  function _map_field (line 19) | def _map_field(field, data):
  function parquet_iter (line 27) | def parquet_iter(path):
  class NanoBeirDocs (line 37) | class NanoBeirDocs(BaseDocs):
    method __init__ (line 38) | def __init__(self, name, dlc, doc_type):
    method docs_iter (line 44) | def docs_iter(self):
    method _docs_iter (line 47) | def _docs_iter(self):
    method docs_cls (line 51) | def docs_cls(self):
    method docs_store (line 54) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 65) | def docs_count(self):
    method docs_namespace (line 69) | def docs_namespace(self):
    method docs_lang (line 72) | def docs_lang(self):
  class NanoBeirQueries (line 76) | class NanoBeirQueries(BaseQueries):
    method __init__ (line 77) | def __init__(self, name, dlc, query_type):
    method queries_iter (line 83) | def queries_iter(self):
    method queries_cls (line 87) | def queries_cls(self):
    method queries_namespace (line 90) | def queries_namespace(self):
    method queries_lang (line 93) | def queries_lang(self):
  class NanoBeirQrels (line 97) | class NanoBeirQrels(BaseQrels):
    method __init__ (line 98) | def __init__(self, qrels_dlc, qrels_defs):
    method qrels_path (line 102) | def qrels_path(self):
    method qrels_iter (line 105) | def qrels_iter(self):
    method qrels_cls (line 109) | def qrels_cls(self):
    method qrels_defs (line 112) | def qrels_defs(self):
  function _init (line 116) | def _init():

FILE: ir_datasets/datasets/natural_questions.py
  class NqPassageDoc (line 13) | class NqPassageDoc(NamedTuple):
    method default_text (line 24) | def default_text(self):
  class NqQrel (line 32) | class NqQrel(NamedTuple):
  class NqManager (line 40) | class NqManager:
    method __init__ (line 41) | def __init__(self, dlcs, base_path):
    method docs_store (line 46) | def docs_store(self, options: DocstoreOptions = DEFAULT_DOCSTORE_OPTIO...
    method _internal_docs_store (line 50) | def _internal_docs_store(self, options: DocstoreOptions = DEFAULT_DOCS...
    method build (line 55) | def build(self):
    method file_ref (line 138) | def file_ref(self, path):
  class _ManagedDlc (line 142) | class _ManagedDlc:
    method __init__ (line 143) | def __init__(self, manager, path):
    method stream (line 148) | def stream(self):
    method path (line 153) | def path(self, force=True):
  class NqQrels (line 159) | class NqQrels(BaseQrels):
    method __init__ (line 160) | def __init__(self, dlc):
    method qrels_iter (line 164) | def qrels_iter(self):
    method qrels_cls (line 170) | def qrels_cls(self):
    method qrels_defs (line 173) | def qrels_defs(self):
  class NqScoredDocs (line 177) | class NqScoredDocs(BaseScoredDocs):
    method __init__ (line 178) | def __init__(self, dlc):
    method scoreddocs_iter (line 182) | def scoreddocs_iter(self):
    method scoreddocs_cls (line 189) | def scoreddocs_cls(self):
  function _init (line 194) | def _init():

FILE: ir_datasets/datasets/neuclir.py
  function get_ids (line 24) | def get_ids(dlcs):
  class FilteredExctractedCCDocs (line 32) | class FilteredExctractedCCDocs(ExctractedCCDocs):
    method __init__ (line 33) | def __init__(self, docs_dlc, subset_lang, include_doc_id_dlc, filter_n...
    method _doc_store_path (line 38) | def _doc_store_path(self):
    method _internal_docs_iter (line 41) | def _internal_docs_iter(self):
  class FilteredTrecQrels (line 48) | class FilteredTrecQrels(TrecQrels):
    method __init__ (line 49) | def __init__(self, qrels_dlc, qrels_defs, include_doc_id_dlc, format_3...
    method qrels_iter (line 53) | def qrels_iter(self):
  class LangFilteredTrecQrels (line 60) | class LangFilteredTrecQrels(TrecQrels):
    method __init__ (line 61) | def __init__(self, qrels_dlc, qrels_defs, lang, format_3col=False):
    method qrels_iter (line 65) | def qrels_iter(self):
  function _init (line 77) | def _init():
  function _lazy_qids_set (line 138) | def _lazy_qids_set(qrels):

FILE: ir_datasets/datasets/neumarco.py
  function _init (line 13) | def _init():

FILE: ir_datasets/datasets/nfcorpus.py
  class NfCorpusDoc (line 20) | class NfCorpusDoc(NamedTuple):
    method default_text (line 25) | def default_text(self):
  class NfCorpusQuery (line 31) | class NfCorpusQuery(NamedTuple):
    method default_text (line 35) | def default_text(self):
  class NfCorpusVideoQuery (line 41) | class NfCorpusVideoQuery(NamedTuple):
    method default_text (line 45) | def default_text(self):
  class ZipQueries (line 51) | class ZipQueries(BaseQueries):
    method __init__ (line 52) | def __init__(self, queries, idxs, qtype):
    method queries_iter (line 57) | def queries_iter(self):
    method queries_cls (line 62) | def queries_cls(self):
    method queries_path (line 65) | def queries_path(self):
    method queries_namespace (line 68) | def queries_namespace(self):
    method queries_lang (line 71) | def queries_lang(self):
  function _init (line 75) | def _init():

FILE: ir_datasets/datasets/nyt.py
  class NytDoc (line 28) | class NytDoc(NamedTuple):
    method default_text (line 33) | def default_text(self):
  class NytDocs (line 41) | class NytDocs(BaseDocs):
    method __init__ (line 42) | def __init__(self, dlc):
    method docs_path (line 45) | def docs_path(self, force=True):
    method docs_cls (line 48) | def docs_cls(self):
    method docs_iter (line 51) | def docs_iter(self):
    method _docs_iter (line 54) | def _docs_iter(self):
    method docs_store (line 76) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 87) | def docs_count(self):
    method docs_namespace (line 91) | def docs_namespace(self):
    method docs_lang (line 94) | def docs_lang(self):
  class NytQueries (line 97) | class NytQueries(BaseQueries):
    method __init__ (line 98) | def __init__(self, collection):
    method queries_iter (line 101) | def queries_iter(self):
    method queries_namespace (line 105) | def queries_namespace(self):
    method queries_lang (line 108) | def queries_lang(self):
  class NytQrels (line 112) | class NytQrels(BaseQrels):
    method __init__ (line 113) | def __init__(self, collection):
    method qrels_iter (line 116) | def qrels_iter(self):
    method qrels_defs (line 120) | def qrels_defs(self):
  function _init (line 124) | def _init():

FILE: ir_datasets/datasets/pmc.py
  class PmcDoc (line 33) | class PmcDoc(NamedTuple):
    method default_text (line 39) | def default_text(self):
  class TrecCdsQuery (line 46) | class TrecCdsQuery(NamedTuple):
    method default_text (line 51) | def default_text(self):
  class TrecCds2016Query (line 58) | class TrecCds2016Query(NamedTuple):
    method default_text (line 64) | def default_text(self):
  class PmcDocs (line 71) | class PmcDocs(BaseDocs):
    method __init__ (line 72) | def __init__(self, dlcs, path, duplicate_dlcs=[], count_hint=None):
    method docs_iter (line 78) | def docs_iter(self):
    method _docs_iter (line 81) | def _docs_iter(self):
    method docs_path (line 116) | def docs_path(self, force=True):
    method docs_store (line 119) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_cls (line 130) | def docs_cls(self):
    method docs_namespace (line 133) | def docs_namespace(self):
    method docs_count (line 136) | def docs_count(self):
    method docs_lang (line 140) | def docs_lang(self):
  function _init (line 144) | def _init():

FILE: ir_datasets/datasets/sara.py
  class SaraDoc (line 23) | class SaraDoc(NamedTuple):
    method default_text (line 27) | def default_text(self):
  class SaraDocs (line 30) | class SaraDocs(BaseDocs):
    method __init__ (line 31) | def __init__(self,dlc):
    method docs_iter (line 35) | def docs_iter(self):
    method _docs_iter (line 38) | def _docs_iter(self):
    method docs_store (line 59) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 70) | def docs_count(self):
    method docs_namespace (line 74) | def docs_namespace(self):
    method docs_lang (line 77) | def docs_lang(self):
    method docs_cls (line 80) | def docs_cls(self):
  function _init (line 84) | def _init():

FILE: ir_datasets/datasets/touche.py
  function _init (line 53) | def _init():

FILE: ir_datasets/datasets/touche_image.py
  function _init (line 9) | def _init():

FILE: ir_datasets/datasets/trec_arabic.py
  function _init (line 22) | def _init():

FILE: ir_datasets/datasets/trec_cast.py
  class CastPassage (line 59) | class CastPassage(NamedTuple):
  class CastDoc (line 65) | class CastDoc(NamedTuple):
    method default_text (line 71) | def default_text(self):
  class CastPassageDoc (line 78) | class CastPassageDoc(NamedTuple):
    method default_text (line 84) | def default_text(self):
  class Cast2019Query (line 91) | class Cast2019Query(NamedTuple):
    method default_text (line 99) | def default_text(self):
  class Cast2020Query (line 106) | class Cast2020Query(NamedTuple):
    method default_text (line 115) | def default_text(self):
  class Cast2021Query (line 122) | class Cast2021Query(NamedTuple):
    method default_text (line 131) | def default_text(self):
  class Cast2022Query (line 138) | class Cast2022Query(NamedTuple):
    method default_text (line 149) | def default_text(self):
  class CastPassageIter (line 156) | class CastPassageIter:
    method __init__ (line 157) | def __init__(self, docstore, doc_psg_offsets, slice):
    method __next__ (line 172) | def __next__(self):
    method __iter__ (line 195) | def __iter__(self):
    method __getitem__ (line 198) | def __getitem__(self, key):
  class CastPassageDocstore (line 214) | class CastPassageDocstore(ir_datasets.indices.Docstore):
    method __init__ (line 215) | def __init__(self, docs_docstore, options=DEFAULT_DOCSTORE_OPTIONS):
    method get_many_iter (line 219) | def get_many_iter(self, doc_ids):
  class LazyCastPassageIter (line 236) | class LazyCastPassageIter:
    method __init__ (line 237) | def __init__(self, docs: "CastPassageDocs"):
    method __iter__ (line 243) | def __iter__(self):
    method __next__ (line 246) | def __next__(self):
    method __getitem__ (line 260) | def __getitem__(self, key):
  class CastPassageDocs (line 293) | class CastPassageDocs(BaseDocs):
    method __init__ (line 294) | def __init__(self, docs, count):
    method docs_iter (line 299) | def docs_iter(self):
    method docs_cls (line 302) | def docs_cls(self):
    method docs_store (line 305) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 308) | def docs_count(self):
    method docs_namespace (line 311) | def docs_namespace(self):
    method docs_lang (line 314) | def docs_lang(self):
  class SegmentedDocs (line 318) | class SegmentedDocs(BaseTransformedDocs):
    method __init__ (line 327) | def __init__(self, docs, segments_dl: BaseDownload, store_name: str):
    method docs_iter (line 331) | def docs_iter(self):
  class CastQueries (line 358) | class CastQueries(BaseQueries):
    method __init__ (line 359) | def __init__(self, dlc, query_type):
    method queries_iter (line 364) | def queries_iter(self):
    method queries_cls (line 416) | def queries_cls(self):
    method queries_namespace (line 419) | def queries_namespace(self):
    method queries_lang (line 422) | def queries_lang(self):
  class WapoV4Docs (line 426) | class WapoV4Docs(IRDSDocuments):
    method __init__ (line 427) | def __init__(self, dsid: str):
    method docs_cls (line 430) | def docs_cls(self):
    method docs_iter (line 433) | def docs_iter(self):
  class KiltCastDocs (line 465) | class KiltCastDocs(TransformedDocs):
    method __init__ (line 466) | def __init__(self, dsid: str):
    method docs_iter (line 469) | def docs_iter(self):
    method transform (line 476) | def transform(self, doc):
  class WapoDupes (line 483) | class WapoDupes(Dupes):
    method doc_ids (line 485) | def doc_ids(self):
  function transform_msmarco_v1 (line 496) | def transform_msmarco_v1(doc):
  function transform_msmarco_v2 (line 502) | def transform_msmarco_v2(doc):
  function _init (line 507) | def _init():

FILE: ir_datasets/datasets/trec_fair.py
  class FairTrecDoc (line 22) | class FairTrecDoc(NamedTuple):
    method default_text (line 31) | def default_text(self):
  class FairTrec2022Doc (line 38) | class FairTrec2022Doc(NamedTuple):
    method default_text (line 62) | def default_text(self):
  class FairTrecQuery (line 69) | class FairTrecQuery(NamedTuple):
    method default_text (line 75) | def default_text(self):
  class FairTrec2022TrainQuery (line 81) | class FairTrec2022TrainQuery(NamedTuple):
    method default_text (line 85) | def default_text(self):
  class FairTrecEvalQuery (line 92) | class FairTrecEvalQuery(NamedTuple):
    method default_text (line 97) | def default_text(self):
  class FairTrecDocs (line 104) | class FairTrecDocs(BaseDocs):
    method __init__ (line 105) | def __init__(self, dlc, mlc):
    method docs_iter (line 110) | def docs_iter(self):
    method _docs_iter (line 113) | def _docs_iter(self):
    method docs_cls (line 147) | def docs_cls(self):
    method docs_store (line 150) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 162) | def docs_count(self):
    method docs_namespace (line 166) | def docs_namespace(self):
    method docs_lang (line 169) | def docs_lang(self):
  class FairTrecQueries (line 173) | class FairTrecQueries(BaseQueries):
    method __init__ (line 174) | def __init__(self, dlc, qtype):
    method queries_iter (line 179) | def queries_iter(self):
    method queries_cls (line 190) | def queries_cls(self):
    method queries_lang (line 193) | def queries_lang(self):
  class FairTrecQrels (line 196) | class FairTrecQrels(BaseQrels):
    method __init__ (line 197) | def __init__(self, qrels_dlc):
    method qrels_path (line 200) | def qrels_path(self):
    method qrels_iter (line 203) | def qrels_iter(self):
    method qrels_cls (line 210) | def qrels_cls(self):
    method qrels_defs (line 213) | def qrels_defs(self):
  class JsonlDocs (line 217) | class JsonlDocs(BaseDocs):
    method __init__ (line 218) | def __init__(self, dlc, metadata_dlc, doc_type, field_map, count_hint):
    method docs_iter (line 226) | def docs_iter(self):
    method _docs_iter_first (line 229) | def _docs_iter_first(self):
    method docs_cls (line 242) | def docs_cls(self):
    method docs_store (line 245) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 256) | def docs_count(self):
    method docs_namespace (line 260) | def docs_namespace(self):
    method docs_lang (line 263) | def docs_lang(self):
  function _init (line 266) | def _init():

FILE: ir_datasets/datasets/trec_mandarin.py
  class TrecMandarinQuery (line 11) | class TrecMandarinQuery(NamedTuple):
    method default_text (line 19) | def default_text(self):
  function _init (line 42) | def _init():

FILE: ir_datasets/datasets/trec_robust04.py
  function _init (line 33) | def _init():
  function make_filter (line 67) | def make_filter(fold):

FILE: ir_datasets/datasets/trec_spanish.py
  class TrecDescOnlyQuery (line 10) | class TrecDescOnlyQuery(NamedTuple):
    method default_text (line 13) | def default_text(self):
  class TrecSpanish3Query (line 19) | class TrecSpanish3Query(NamedTuple):
    method default_text (line 27) | def default_text(self):
  class TrecSpanish4Query (line 33) | class TrecSpanish4Query(NamedTuple):
    method default_text (line 39) | def default_text(self):
  class TrecSpanishTranslateQueries (line 65) | class TrecSpanishTranslateQueries:
    method __init__ (line 66) | def __init__(self, parent, query_cls):
    method __getattr__ (line 70) | def __getattr__(self, attr):
    method queries_iter (line 73) | def queries_iter(self):
    method queries_cls (line 92) | def queries_cls(self):
  function _init (line 96) | def _init():

FILE: ir_datasets/datasets/trec_tot.py
  class TipOfTheTongueDoc (line 10) | class TipOfTheTongueDoc(NamedTuple):
    method default_text (line 19) | def default_text(self):
  class TipOfTheTongueDoc2024 (line 25) | class TipOfTheTongueDoc2024(NamedTuple):
    method default_text (line 32) | def default_text(self):
  class TipOfTheTongueQuery2024 (line 38) | class TipOfTheTongueQuery2024(NamedTuple):
    method default_text (line 42) | def default_text(self):
  class TipOfTheTongueQuery (line 46) | class TipOfTheTongueQuery(NamedTuple):
    method default_text (line 54) | def default_text(self):
  function _init (line 61) | def _init():

FILE: ir_datasets/datasets/trec_tot_2025.py
  class JsonlDocumentOffset (line 18) | class JsonlDocumentOffset(NamedTuple):
  class TrecToT2025Doc (line 24) | class TrecToT2025Doc(NamedTuple):
    method _from_json (line 31) | def _from_json(json_doc):
    method default_text (line 34) | def default_text(self):
  class JsonlWithOffsetsDocsStore (line 38) | class JsonlWithOffsetsDocsStore(Docstore):
    method __init__ (line 39) | def __init__(self, docs, offsets, options=DEFAULT_DOCSTORE_OPTIONS):
    method offsets_iter (line 46) | def offsets_iter(self):
    method docs_dict (line 52) | def docs_dict(self):
    method get_many_iter (line 62) | def get_many_iter(self, doc_ids):
  class TrecToT2025DocsStore (line 73) | class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore):
    method get_many_iter (line 74) | def get_many_iter(self, doc_ids):
  class JsonlDocumentsWithOffsets (line 79) | class JsonlDocumentsWithOffsets(BaseDocs):
    method __init__ (line 80) | def __init__(self, docs, offsets):
    method docs_iter (line 84) | def docs_iter(self):
    method docs_cls (line 89) | def docs_cls(self):
    method docs_store (line 92) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_namespace (line 95) | def docs_namespace(self):
    method docs_count (line 98) | def docs_count(self):
    method docs_lang (line 101) | def docs_lang(self):
  class TrecToT2025Dataset (line 105) | class TrecToT2025Dataset(Dataset):
    method __init__ (line 106) | def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, q...
  function register_dataset (line 117) | def register_dataset():

FILE: ir_datasets/datasets/tripclick.py
  class ConcatQueries (line 41) | class ConcatQueries(BaseQueries):
    method __init__ (line 42) | def __init__(self, queries):
    method queries_iter (line 45) | def queries_iter(self):
    method queries_path (line 49) | def queries_path(self):
    method queries_cls (line 52) | def queries_cls(self):
    method queries_namespace (line 55) | def queries_namespace(self):
    method queries_lang (line 58) | def queries_lang(self):
  class ConcatQrels (line 62) | class ConcatQrels(BaseQrels):
    method __init__ (line 63) | def __init__(self, qrels):
    method qrels_iter (line 66) | def qrels_iter(self):
    method qrels_path (line 70) | def qrels_path(self):
    method qrels_cls (line 73) | def qrels_cls(self):
    method qrels_defs (line 76) | def qrels_defs(self):
  class ConcatScoreddocs (line 80) | class ConcatScoreddocs(BaseScoredDocs):
    method __init__ (line 81) | def __init__(self, scoreddocs):
    method scoreddocs_iter (line 84) | def scoreddocs_iter(self):
    method scoreddocs_path (line 88) | def scoreddocs_path(self, force=True):
    method scoreddocs_cls (line 91) | def scoreddocs_cls(self):
  class LogItem (line 95) | class LogItem(NamedTuple):
  class TripClickQlog (line 100) | class TripClickQlog(NamedTuple):
  class TripClickPartialDoc (line 108) | class TripClickPartialDoc(NamedTuple):
    method default_text (line 112) | def default_text(self):
  class TripClickQlogs (line 119) | class TripClickQlogs(BaseQlogs):
    method __init__ (line 120) | def __init__(self, dlc):
    method qlogs_iter (line 123) | def qlogs_iter(self):
    method qlogs_cls (line 143) | def qlogs_cls(self):
    method qlogs_count (line 146) | def qlogs_count(self):
  class DocPairGenerator (line 150) | class DocPairGenerator:
    method __init__ (line 151) | def __init__(self, docpair_dlc, collection, queries, cache_path):
    method path (line 157) | def path(self, force=True):
    method stream (line 191) | def stream(self):
  class FixAllarticles (line 200) | class FixAllarticles:
    method __init__ (line 201) | def __init__(self, streamer):
    method stream (line 204) | def stream(self):
    method __iter__ (line 207) | def __iter__(self):
  function _init (line 240) | def _init():

FILE: ir_datasets/datasets/tweets2013_ia.py
  class TweetDoc (line 49) | class TweetDoc(NamedTuple):
    method default_text (line 59) | def default_text(self):
  class TrecMb13Query (line 66) | class TrecMb13Query(NamedTuple):
    method default_text (line 71) | def default_text(self):
  class TrecMb14Query (line 78) | class TrecMb14Query(NamedTuple):
    method default_text (line 84) | def default_text(self):
  class TweetWriter (line 143) | class TweetWriter:
    method __init__ (line 144) | def __init__(self, base_path, max_tweets_per_block=100):
    method add (line 150) | def add(self, file_name, tweet_id, tweet_data):
    method flush (line 156) | def flush(self):
  class Tweets2013IaDocIter (line 171) | class Tweets2013IaDocIter:
    method __init__ (line 172) | def __init__(self, tweets_docs, slice):
    method __next__ (line 181) | def __next__(self):
    method close (line 210) | def close(self):
    method __iter__ (line 214) | def __iter__(self):
    method __del__ (line 217) | def __del__(self):
    method __getitem__ (line 220) | def __getitem__(self, key):
  class TweetsDocstore (line 236) | class TweetsDocstore(Docstore):
    method __init__ (line 237) | def __init__(self, tweets_docs, options=DEFAULT_DOCSTORE_OPTIONS):
    method get_many_iter (line 241) | def get_many_iter(self, doc_ids):
  class Tweets2013IaDocs (line 288) | class Tweets2013IaDocs(BaseDocs):
    method __init__ (line 289) | def __init__(self, docs_dlcs, base_path):
    method _id2file (line 294) | def _id2file(self, snowflake_id):
    method _docs_build (line 300) | def _docs_build(self):
    method docs_iter (line 360) | def docs_iter(self):
    method docs_cls (line 363) | def docs_cls(self):
    method docs_store (line 366) | def docs_store(self, options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_path (line 369) | def docs_path(self, force=False):
    method docs_count (line 372) | def docs_count(self, force=False):
    method docs_namespace (line 377) | def docs_namespace(self):
    method docs_lang (line 380) | def docs_lang(self):
    method _docs_file_counts (line 383) | def _docs_file_counts(self):
    method _docs_iter_source_files (line 394) | def _docs_iter_source_files(self):
    method _docs_ctxt_iter_tweets (line 397) | def _docs_ctxt_iter_tweets(self, source_file):
    method _docs_source_to_doc (line 405) | def _docs_source_to_doc(self, source, data):
  function _init (line 410) | def _init():

FILE: ir_datasets/datasets/vaswani.py
  function sentinel_splitter (line 19) | def sentinel_splitter(it, sentinel):
  class VaswaniDocs (line 25) | class VaswaniDocs(BaseDocs):
    method __init__ (line 26) | def __init__(self, docs_dlc):
    method docs_path (line 30) | def docs_path(self, force=True):
    method docs_iter (line 34) | def docs_iter(self):
    method docs_cls (line 42) | def docs_cls(self):
    method docs_store (line 45) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 56) | def docs_count(self):
    method docs_namespace (line 60) | def docs_namespace():
    method docs_lang (line 63) | def docs_lang(self):
  class VaswaniQueries (line 67) | class VaswaniQueries(BaseQueries):
    method __init__ (line 68) | def __init__(self, queries_dlc):
    method queries_path (line 72) | def queries_path(self):
    method queries_iter (line 75) | def queries_iter(self):
    method queries_cls (line 83) | def queries_cls(self):
    method queries_namespace (line 86) | def queries_namespace(self):
    method queries_lang (line 89) | def queries_lang(self):
  class VaswaniQrels (line 93) | class VaswaniQrels(BaseQrels):
    method __init__ (line 94) | def __init__(self, qrels_dlc):
    method qrels_path (line 97) | def qrels_path(self):
    method qrels_iter (line 100) | def qrels_iter(self):
    method qrels_cls (line 109) | def qrels_cls(self):
    method qrels_defs (line 112) | def qrels_defs(self):
  function _init (line 116) | def _init():

FILE: ir_datasets/datasets/wapo.py
  class WapoDocMedia (line 38) | class WapoDocMedia(NamedTuple):
  class WapoDoc (line 44) | class WapoDoc(NamedTuple):
    method default_text (line 54) | def default_text(self):
  class TrecBackgroundLinkingQuery (line 61) | class TrecBackgroundLinkingQuery(NamedTuple):
  class WapoDocs (line 67) | class WapoDocs(BaseDocs):
    method __init__ (line 68) | def __init__(self, dlc, file_name):
    method docs_path (line 72) | def docs_path(self, force=True):
    method docs_cls (line 75) | def docs_cls(self):
    method docs_iter (line 78) | def docs_iter(self):
    method _docs_iter (line 81) | def _docs_iter(self):
    method docs_wapo_raw_iter (line 129) | def docs_wapo_raw_iter(self):
    method docs_store (line 140) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 150) | def docs_count(self):
    method docs_namespace (line 154) | def docs_namespace(self):
    method docs_lang (line 157) | def docs_lang(self):
  function _init (line 161) | def _init():

FILE: ir_datasets/datasets/wikiclir.py
  class WikiClirQuery (line 19) | class WikiClirQuery(NamedTuple):
    method default_text (line 23) | def default_text(self):
  class WikiClirDoc (line 30) | class WikiClirDoc(NamedTuple):
    method default_text (line 34) | def default_text(self):
  function _init (line 41) | def _init():
  function _qid_filter (line 99) | def _qid_filter(qrels):

FILE: ir_datasets/datasets/wikir.py
  function _init (line 20) | def _init():

FILE: ir_datasets/formats/argsme.py
  class ArgsMeStance (line 17) | class ArgsMeStance(Enum):
    method from_json (line 26) | def from_json(json: str) -> "ArgsMeStance":
  class ArgsMeMode (line 35) | class ArgsMeMode(Enum):
    method from_json (line 40) | def from_json(json: str) -> "ArgsMeMode":
  class ArgsMeSourceDomain (line 49) | class ArgsMeSourceDomain(Enum):
    method from_json (line 57) | def from_json(json: str) -> "ArgsMeSourceDomain":
  class ArgsMePremiseAnnotation (line 72) | class ArgsMePremiseAnnotation(NamedTuple):
    method from_json (line 82) | def from_json(json: dict) -> "ArgsMePremiseAnnotation":
  class ArgsMePremise (line 89) | class ArgsMePremise(NamedTuple):
    method from_json (line 99) | def from_json(json: dict) -> "ArgsMePremise":
  class ArgsMeAspect (line 110) | class ArgsMeAspect(NamedTuple):
    method from_json (line 117) | def from_json(json: dict) -> "ArgsMeAspect":
  class ArgsMeSentence (line 125) | class ArgsMeSentence(NamedTuple):
    method from_json (line 130) | def from_json(json: dict) -> "ArgsMeSentence":
  class ArgsMeDoc (line 137) | class ArgsMeDoc(NamedTuple):
    method default_text (line 169) | def default_text(self):
    method from_json (line 176) | def from_json(json: dict) -> "ArgsMeDoc":
  class ArgsMeProcessedDoc (line 315) | class ArgsMeProcessedDoc(NamedTuple):
    method from_csv (line 349) | def from_csv(csv: dict) -> "ArgsMeProcessedDoc":
  class ArgsMeDocs (line 363) | class ArgsMeDocs(BaseDocs):
    method __init__ (line 369) | def __init__(
    method docs_path (line 381) | def docs_path(self):
    method docs_iter (line 385) | def docs_iter(self):
    method docs_store (line 393) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 404) | def docs_count(self):
    method docs_cls (line 407) | def docs_cls(self):
    method docs_namespace (line 410) | def docs_namespace(self):
    method docs_lang (line 413) | def docs_lang(self):
  class ArgsMeProcessedDocs (line 417) | class ArgsMeProcessedDocs(BaseDocs):
    method __init__ (line 423) | def __init__(
    method docs_path (line 435) | def docs_path(self):
    method docs_iter (line 439) | def docs_iter(self):
    method docs_store (line 448) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 459) | def docs_count(self):
    method docs_cls (line 462) | def docs_cls(self):
    method docs_namespace (line 465) | def docs_namespace(self):
    method docs_lang (line 468) | def docs_lang(self):
  class ArgsMeCombinedDocs (line 472) | class ArgsMeCombinedDocs(BaseDocs):
    method __init__ (line 479) | def __init__(
    method docs_path (line 493) | def docs_path(self):
    method docs_iter (line 497) | def docs_iter(self):
    method docs_store (line 502) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 513) | def docs_count(self):
    method docs_cls (line 520) | def docs_cls(self):
    method docs_namespace (line 527) | def docs_namespace(self):
    method docs_lang (line 534) | def docs_lang(self):

FILE: ir_datasets/formats/base.py
  class GenericDoc (line 10) | class GenericDoc(NamedTuple):
    method default_text (line 13) | def default_text(self):
  class GenericQuery (line 16) | class GenericQuery(NamedTuple):
    method default_text (line 19) | def default_text(self):
  class GenericQrel (line 22) | class GenericQrel(NamedTuple):
  class GenericScoredDoc (line 27) | class GenericScoredDoc(NamedTuple):
  class GenericDocPair (line 32) | class GenericDocPair(NamedTuple):
  class BaseDocs (line 38) | class BaseDocs:
    method __getattr__ (line 42) | def __getattr__(self, attr):
    method docs_iter (line 48) | def docs_iter(self):
    method docs_count (line 51) | def docs_count(self):
    method docs_handler (line 54) | def docs_handler(self):
    method docs_cls (line 57) | def docs_cls(self):
    method docs_namespace (line 60) | def docs_namespace(self):
    method docs_lang (line 63) | def docs_lang(self):
  class BaseQueries (line 67) | class BaseQueries:
    method __getattr__ (line 71) | def __getattr__(self, attr):
    method queries_iter (line 77) | def queries_iter(self):
    method queries_handler (line 80) | def queries_handler(self):
    method queries_cls (line 83) | def queries_cls(self):
    method queries_namespace (line 86) | def queries_namespace(self):
    method queries_lang (line 89) | def queries_lang(self):
  class BaseQrels (line 93) | class BaseQrels:
    method __getattr__ (line 97) | def __getattr__(self, attr):
    method qrels_iter (line 103) | def qrels_iter(self):
    method qrels_defs (line 106) | def qrels_defs(self):
    method qrels_path (line 109) | def qrels_path(self):
    method qrels_cls (line 112) | def qrels_cls(self):
    method qrels_handler (line 115) | def qrels_handler(self):
  class BaseScoredDocs (line 119) | class BaseScoredDocs:
    method __getattr__ (line 123) | def __getattr__(self, attr):
    method scoreddocs_path (line 129) | def scoreddocs_path(self):
    method scoreddocs_iter (line 132) | def scoreddocs_iter(self):
    method scoreddocs_cls (line 135) | def scoreddocs_cls(self):
    method scoreddocs_handler (line 138) | def scoreddocs_handler(self):
  class BaseDocPairs (line 142) | class BaseDocPairs:
    method __getattr__ (line 146) | def __getattr__(self, attr):
    method docpairs_path (line 152) | def docpairs_path(self):
    method docpairs_iter (line 155) | def docpairs_iter(self):
    method docpairs_cls (line 158) | def docpairs_cls(self):
    method docpairs_handler (line 161) | def docpairs_handler(self):
  class BaseQlogs (line 165) | class BaseQlogs:
    method __getattr__ (line 169) | def __getattr__(self, attr):
    method qlogs_iter (line 175) | def qlogs_iter(self):
    method qlogs_cls (line 178) | def qlogs_cls(self):
    method qlogs_count (line 181) | def qlogs_count(self):
    method qlogs_handler (line 184) | def qlogs_handler(self):
  function qrels_dict (line 191) | def qrels_dict(qrels_handler):
  function hasher (line 201) | def hasher(iter_fn, hashfn=hashlib.md5):
  function _calc_metadata (line 218) | def _calc_metadata(iter_fn, metadata_fields=(), count_by_value_field=None):
  class DocstoreBackedDocs (line 263) | class DocstoreBackedDocs(BaseDocs):
    method __init__ (line 267) | def __init__(self, docstore_lazy, docs_cls=GenericDoc, namespace=None,...
    method docs_iter (line 274) | def docs_iter(self):
    method docs_count (line 277) | def docs_count(self):
    method docs_cls (line 281) | def docs_cls(self):
    method docs_namespace (line 284) | def docs_namespace(self):
    method docs_lang (line 287) | def docs_lang(self):
    method docs_store (line 290) | def docs_store(self):
  class DocSourceSeekableIter (line 296) | class DocSourceSeekableIter:
    method __next__ (line 297) | def __next__(self) -> NamedTuple:
    method seek (line 303) | def seek(self, pos):
    method close (line 309) | def close(self):
    method __enter__ (line 315) | def __enter__(self):
    method __exit__ (line 318) | def __exit__(self, exc_type, exc_val, exc_tb):
    method __iter__ (line 321) | def __iter__(self):
  class DocSource (line 325) | class DocSource:
    method __len__ (line 326) | def __len__(self) -> int:
    method __iter__ (line 332) | def __iter__(self) -> DocSourceSeekableIter:
  class SourceDocIter (line 339) | class SourceDocIter:
    method __init__ (line 340) | def __init__(self, docs, slice):
    method __next__ (line 349) | def __next__(self):
    method close (line 373) | def close(self):
    method __iter__ (line 378) | def __iter__(self):
    method __del__ (line 381) | def __del__(self):
    method __getitem__ (line 384) | def __getitem__(self, key):

FILE: ir_datasets/formats/clirmatrix.py
  class CLIRMatrixQueries (line 7) | class CLIRMatrixQueries(BaseQueries):
    method __init__ (line 8) | def __init__(self, streamer, query_lang):
    method queries_iter (line 13) | def queries_iter(self):
    method queries_namespace (line 25) | def queries_namespace(self):
    method queries_cls (line 28) | def queries_cls(self):
    method queries_lang (line 31) | def queries_lang(self):
  class CLIRMatrixQrels (line 35) | class CLIRMatrixQrels(TrecQrels):
    method qrels_iter (line 36) | def qrels_iter(self):

FILE: ir_datasets/formats/csv_fmt.py
  class _CsvBase (line 12) | class _CsvBase:
    method __init__ (line 13) | def __init__(self, dlc, cls, datatype):
    method _path (line 19) | def _path(self, force=True):
    method _iter (line 22) | def _iter(self):
  class CsvDocs (line 34) | class CsvDocs(_CsvBase, BaseDocs):
    method __init__ (line 35) | def __init__(self, docs_dlc, doc_cls=GenericDoc, doc_store_index_field...
    method docs_path (line 43) | def docs_path(self, force=True):
    method docs_iter (line 47) | def docs_iter(self):
    method docs_cls (line 50) | def docs_cls(self):
    method docs_store (line 53) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_namespace (line 65) | def docs_namespace(self):
    method docs_count (line 68) | def docs_count(self):
    method docs_lang (line 73) | def docs_lang(self):
  class CsvQueries (line 77) | class CsvQueries(_CsvBase, BaseQueries):
    method __init__ (line 78) | def __init__(self, queries_dlc, query_cls=GenericQuery, namespace=None...
    method queries_path (line 83) | def queries_path(self):
    method queries_iter (line 86) | def queries_iter(self):
    method queries_cls (line 89) | def queries_cls(self):
    method queries_namespace (line 92) | def queries_namespace(self):
    method queries_lang (line 95) | def queries_lang(self):
  class CsvDocPairs (line 99) | class CsvDocPairs(_CsvBase, BaseDocPairs):
    method __init__ (line 100) | def __init__(self, docpairs_dlc, docpair_cls=GenericDocPair):
    method docpairs_path (line 103) | def docpairs_path(self):
    method docpairs_iter (line 106) | def docpairs_iter(self):
    method docpairs_cls (line 109) | def docpairs_cls(self):

FILE: ir_datasets/formats/extracted_cc.py
  class ExctractedCCDoc (line 14) | class ExctractedCCDoc(NamedTuple):
    method default_text (line 21) | def default_text(self):
  class ExctractedCCDocs (line 28) | class ExctractedCCDocs(BaseDocs):
    method __init__ (line 30) | def __init__(self, docs_dlc, subset_lang=None, namespace=None, count=N...
    method docs_path (line 37) | def docs_path(self, force=True):
    method docs_iter (line 43) | def docs_iter(self):
    method _doc_store_path (line 46) | def _doc_store_path(self):
    method docs_store (line 49) | def docs_store(self):
    method _internal_docs_iter (line 59) | def _internal_docs_iter(self):
    method docs_cls (line 72) | def docs_cls(self):
    method docs_namespace (line 75) | def docs_namespace(self):
    method docs_count (line 78) | def docs_count(self):
    method docs_lang (line 81) | def docs_lang(self):
  class ExctractedCCQuery (line 85) | class ExctractedCCQuery(NamedTuple):
    method default_text (line 98) | def default_text(self):
  class ExctractedCCNoReportQuery (line 104) | class ExctractedCCNoReportQuery(NamedTuple):
    method default_text (line 116) | def default_text(self):
  class ExctractedCCNoReportNoHtNarQuery (line 122) | class ExctractedCCNoReportNoHtNarQuery(NamedTuple):
    method default_text (line 133) | def default_text(self):
  class ExctractedCCMultiMtQuery (line 140) | class ExctractedCCMultiMtQuery(NamedTuple):
    method default_text (line 154) | def default_text(self):
  class ExctractedCCQueries (line 161) | class ExctractedCCQueries(BaseQueries):
    method __init__ (line 162) | def __init__(self, queries_dlc, subset_lang=None, filter_lwq=True, cls...
    method queries_path (line 171) | def queries_path(self):
    method queries_cls (line 174) | def queries_cls(self):
    method queries_namespace (line 177) | def queries_namespace(self):
    method queries_iter (line 180) | def queries_iter(self):
    method _internal_queries_iter (line 184) | def _internal_queries_iter(self, dlc):
    method _produce_query (line 191) | def _produce_query(self, line):

FILE: ir_datasets/formats/jsonl.py
  class _JsonlBase (line 12) | class _JsonlBase:
    method __init__ (line 13) | def __init__(self, dlcs, cls, datatype, mapping=None):
    method _path (line 23) | def _path(self, force=True):
    method _iter (line 26) | def _iter(self):
  class JsonlDocs (line 34) | class JsonlDocs(_JsonlBase, BaseDocs):
    method __init__ (line 35) | def __init__(self, docs_dlcs, doc_cls=GenericDoc, mapping=None, doc_st...
    method docs_path (line 43) | def docs_path(self, force=True):
    method docs_iter (line 47) | def docs_iter(self):
    method docs_cls (line 50) | def docs_cls(self):
    method docs_store (line 53) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_namespace (line 65) | def docs_namespace(self):
    method docs_count (line 68) | def docs_count(self):
    method docs_lang (line 73) | def docs_lang(self):
  class JsonlQueries (line 77) | class JsonlQueries(_JsonlBase, BaseQueries):
    method __init__ (line 78) | def __init__(self, query_dlcs, query_cls=GenericQuery, mapping=None, l...
    method queries_path (line 83) | def queries_path(self, force=True):
    method queries_iter (line 86) | def queries_iter(self):
    method queries_cls (line 89) | def queries_cls(self):
    method queries_namespace (line 92) | def queries_namespace(self):
    method queries_lang (line 95) | def queries_lang(self):

FILE: ir_datasets/formats/ntcir.py
  class NtcirQrels (line 5) | class NtcirQrels(TrecQrels):
    method qrels_iter (line 6) | def qrels_iter(self):

FILE: ir_datasets/formats/touche.py
  class ToucheQuery (line 12) | class ToucheQuery(NamedTuple):
    method default_text (line 17) | def default_text(self):
  class ToucheTitleQuery (line 24) | class ToucheTitleQuery(NamedTuple):
    method default_text (line 27) | def default_text(self):
  class ToucheComparativeQuery (line 34) | class ToucheComparativeQuery(NamedTuple):
    method default_text (line 40) | def default_text(self):
  class ToucheQualityQrel (line 47) | class ToucheQualityQrel(NamedTuple):
  class ToucheQualityCoherenceQrel (line 55) | class ToucheQualityCoherenceQrel(NamedTuple):
  class ToucheComparativeStance (line 64) | class ToucheComparativeStance(Enum):
  class ToucheQualityComparativeStanceQrel (line 71) | class ToucheQualityComparativeStanceQrel(NamedTuple):
  class ToucheControversialStance (line 80) | class ToucheControversialStance(Enum):
  class ToucheControversialStanceQrel (line 86) | class ToucheControversialStanceQrel(NamedTuple):
  class TouchePassageDoc (line 93) | class TouchePassageDoc(NamedTuple):
    method default_text (line 97) | def default_text(self):
  class ToucheQueries (line 104) | class ToucheQueries(BaseQueries):
    method __init__ (line 109) | def __init__(
    method queries_path (line 119) | def queries_path(self):
    method queries_iter (line 122) | def queries_iter(self):
    method queries_cls (line 142) | def queries_cls(self):
    method queries_namespace (line 145) | def queries_namespace(self):
    method queries_lang (line 148) | def queries_lang(self):
  class ToucheTitleQueries (line 152) | class ToucheTitleQueries(BaseQueries):
    method __init__ (line 157) | def __init__(
    method queries_path (line 167) | def queries_path(self):
    method queries_iter (line 170) | def queries_iter(self):
    method queries_cls (line 186) | def queries_cls(self):
    method queries_namespace (line 189) | def queries_namespace(self):
    method queries_lang (line 192) | def queries_lang(self):
  class ToucheComparativeQueries (line 196) | class ToucheComparativeQueries(BaseQueries):
    method __init__ (line 201) | def __init__(
    method queries_path (line 211) | def queries_path(self):
    method queries_iter (line 214) | def queries_iter(self):
    method queries_cls (line 238) | def queries_cls(self):
    method queries_namespace (line 241) | def queries_namespace(self):
    method queries_lang (line 244) | def queries_lang(self):
  class ToucheQrels (line 248) | class ToucheQrels(BaseQrels):
    method __init__ (line 253) | def __init__(
    method qrels_path (line 263) | def qrels_path(self):
    method qrels_iter (line 266) | def qrels_iter(self):
    method qrels_cls (line 299) | def qrels_cls(self):
    method qrels_defs (line 302) | def qrels_defs(self):
  class ToucheQualityQrels (line 306) | class ToucheQualityQrels(BaseQrels):
    method __init__ (line 311) | def __init__(
    method qrels_path (line 321) | def qrels_path(self):
    method qrels_iter (line 324) | def qrels_iter(self):
    method qrels_cls (line 384) | def qrels_cls(self):
    method qrels_defs (line 387) | def qrels_defs(self):
  class ToucheQualityCoherenceQrels (line 391) | class ToucheQualityCoherenceQrels(BaseQrels):
    method __init__ (line 397) | def __init__(
    method qrels_path (line 409) | def qrels_path(self):
    method qrels_iter (line 412) | def qrels_iter(self):
    method qrels_cls (line 506) | def qrels_cls(self):
    method qrels_defs (line 509) | def qrels_defs(self):
  class ToucheQualityComparativeStanceQrels (line 513) | class ToucheQualityComparativeStanceQrels(BaseQrels):
    method __init__ (line 519) | def __init__(
    method qrels_path (line 531) | def qrels_path(self):
    method qrels_iter (line 534) | def qrels_iter(self):
    method qrels_cls (line 626) | def qrels_cls(self):
    method qrels_defs (line 629) | def qrels_defs(self):
  class ToucheControversialStanceQrels (line 633) | class ToucheControversialStanceQrels(BaseQrels):
    method __init__ (line 637) | def __init__(self, source: Any, definitions: Dict[int, str]):
    method qrels_path (line 641) | def qrels_path(self):
    method qrels_iter (line 644) | def qrels_iter(self):
    method qrels_cls (line 669) | def qrels_cls(self):
    method qrels_defs (line 672) | def qrels_defs(self):
  class TouchePassageDocs (line 676) | class TouchePassageDocs(BaseDocs):
    method __init__ (line 682) | def __init__(
    method docs_path (line 694) | def docs_path(self):
    method docs_iter (line 698) | def docs_iter(self):
    method docs_store (line 709) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 720) | def docs_count(self):
    method docs_cls (line 723) | def docs_cls(self):
    method docs_namespace (line 726) | def docs_namespace(self):
    method docs_lang (line 729) | def docs_lang(self):

FILE: ir_datasets/formats/touche_image.py
  class ToucheImageRanking (line 13) | class ToucheImageRanking(NamedTuple):
  class ToucheImageNode (line 19) | class ToucheImageNode(NamedTuple):
  class ToucheImagePage (line 29) | class ToucheImagePage(NamedTuple):
  class ToucheImageDoc (line 41) | class ToucheImageDoc(NamedTuple):
  class ToucheImageDocs (line 56) | class ToucheImageDocs(BaseDocs):
    method __init__ (line 64) | def __init__(
    method docs_path (line 80) | def docs_path(self):
    method docs_iter (line 84) | def docs_iter(self):
    method docs_store (line 222) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 233) | def docs_count(self):
    method docs_cls (line 236) | def docs_cls(self):
    method docs_namespace (line 239) | def docs_namespace(self):
    method docs_lang (line 242) | def docs_lang(self):

FILE: ir_datasets/formats/trec.py
  class TrecDoc (line 16) | class TrecDoc(NamedTuple):
    method default_text (line 20) | def default_text(self):
  class TitleUrlTextDoc (line 26) | class TitleUrlTextDoc(NamedTuple):
    method default_text (line 31) | def default_text(self):
  class TrecParsedDoc (line 37) | class TrecParsedDoc(NamedTuple):
    method default_text (line 42) | def default_text(self):
  class TrecQuery (line 48) | class TrecQuery(NamedTuple):
    method default_text (line 53) | def default_text(self):
  class TrecSubtopic (line 59) | class TrecSubtopic(NamedTuple):
  class TrecQrel (line 64) | class TrecQrel(NamedTuple):
  class TrecSubQrel (line 70) | class TrecSubQrel(NamedTuple):
  class TrecPrel (line 76) | class TrecPrel(NamedTuple):
  class TrecDocs (line 86) | class TrecDocs(BaseDocs):
    method __init__ (line 87) | def __init__(self, docs_dlc, encoding=None, path_globs=None, content_t...
    method docs_path (line 113) | def docs_path(self, force=True):
    method docs_iter (line 117) | def docs_iter(self):
    method _docs_iter (line 153) | def _docs_iter(self, path):
    method _parser_bs (line 171) | def _parser_bs(self, stream):
    method _parser_text (line 196) | def _parser_text(self, stream):
    method _parser_tut (line 216) | def _parser_tut(self, stream):
    method _parser_sax (line 238) | def _parser_sax(self, stream):
    method docs_cls (line 257) | def docs_cls(self):
    method docs_store (line 260) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_count (line 276) | def docs_count(self):
    method docs_namespace (line 280) | def docs_namespace(self):
    method docs_lang (line 283) | def docs_lang(self):
  class TrecQueries (line 293) | class TrecQueries(BaseQueries):
    method __init__ (line 294) | def __init__(self, queries_dlc, qtype=TrecQuery, qtype_map=None, encod...
    method queries_path (line 303) | def queries_path(self):
    method queries_iter (line 306) | def queries_iter(self):
    method queries_cls (line 328) | def queries_cls(self):
    method queries_namespace (line 331) | def queries_namespace(self):
    method queries_lang (line 334) | def queries_lang(self):
  class TrecXmlQueries (line 338) | class TrecXmlQueries(BaseQueries):
    method __init__ (line 339) | def __init__(self, queries_dlc, qtype=TrecQuery, qtype_map=None, encod...
    method queries_path (line 348) | def queries_path(self):
    method queries_iter (line 351) | def queries_iter(self):
    method queries_cls (line 382) | def queries_cls(self):
    method queries_namespace (line 385) | def queries_namespace(self):
    method queries_lang (line 388) | def queries_lang(self):
  class TrecColonQueries (line 392) | class TrecColonQueries(BaseQueries):
    method __init__ (line 393) | def __init__(self, queries_dlc, encoding=None, namespace=None, lang=No...
    method queries_iter (line 399) | def queries_iter(self):
    method queries_path (line 407) | def queries_path(self):
    method queries_cls (line 410) | def queries_cls(self):
    method queries_namespace (line 413) | def queries_namespace(self):
    method queries_lang (line 416) | def queries_lang(self):
  class TrecQrels (line 420) | class TrecQrels(BaseQrels):
    method __init__ (line 421) | def __init__(self, qrels_dlc, qrels_defs, format_3col=False):
    method qrels_path (line 426) | def qrels_path(self):
    method qrels_iter (line 429) | def qrels_iter(self):
    method _qrels_internal_iter (line 436) | def _qrels_internal_iter(self, dlc):
    method qrels_cls (line 454) | def qrels_cls(self):
    method qrels_defs (line 457) | def qrels_defs(self):
  class TrecPrels (line 461) | class TrecPrels(TrecQrels):
    method qrels_iter (line 462) | def qrels_iter(self):
    method qrels_cls (line 474) | def qrels_cls(self):
  class TrecSubQrels (line 478) | class TrecSubQrels(BaseQrels):
    method __init__ (line 479) | def __init__(self, qrels_dlc, qrels_defs):
    method qrels_path (line 483) | def qrels_path(self):
    method qrels_iter (line 486) | def qrels_iter(self):
    method _qrels_internal_iter (line 493) | def _qrels_internal_iter(self, dlc):
    method qrels_cls (line 506) | def qrels_cls(self):
    method qrels_defs (line 509) | def qrels_defs(self):
  class TrecScoredDocs (line 513) | class TrecScoredDocs(BaseScoredDocs):
    method __init__ (line 514) | def __init__(self, scoreddocs_dlc, negate_score=False):
    method scoreddocs_path (line 518) | def scoreddocs_path(self):
    method scoreddocs_iter (line 521) | def scoreddocs_iter(self):

FILE: ir_datasets/formats/tsv.py
  class FileLineIter (line 9) | class FileLineIter:
    method __init__ (line 10) | def __init__(self, dlc, start=None, stop=None, step=1):
    method __next__ (line 20) | def __next__(self):
    method __iter__ (line 47) | def __iter__(self):
    method __del__ (line 50) | def __del__(self):
    method __getitem__ (line 53) | def __getitem__(self, key):
  class TsvIter (line 84) | class TsvIter:
    method __init__ (line 85) | def __init__(self, cls, line_iter):
    method __iter__ (line 89) | def __iter__(self):
    method __next__ (line 92) | def __next__(self):
    method __getitem__ (line 110) | def __getitem__(self, key):
  class _TsvBase (line 114) | class _TsvBase:
    method __init__ (line 115) | def __init__(self, dlc, cls, datatype, skip_first_line=False):
    method _path (line 122) | def _path(self, force=True):
    method _iter (line 125) | def _iter(self):
  class TsvDocs (line 133) | class TsvDocs(_TsvBase, BaseDocs):
    method __init__ (line 134) | def __init__(self, docs_dlc, doc_cls=GenericDoc, doc_store_index_field...
    method docs_path (line 142) | def docs_path(self, force=True):
    method docs_iter (line 146) | def docs_iter(self):
    method docs_cls (line 149) | def docs_cls(self):
    method docs_store (line 152) | def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_namespace (line 165) | def docs_namespace(self):
    method docs_count (line 168) | def docs_count(self):
    method docs_lang (line 173) | def docs_lang(self):
  class TsvQueries (line 177) | class TsvQueries(_TsvBase, BaseQueries):
    method __init__ (line 178) | def __init__(self, queries_dlc, query_cls=GenericQuery, namespace=None...
    method queries_path (line 183) | def queries_path(self):
    method queries_iter (line 186) | def queries_iter(self):
    method queries_cls (line 189) | def queries_cls(self):
    method queries_namespace (line 192) | def queries_namespace(self):
    method queries_lang (line 195) | def queries_lang(self):
  class TsvDocPairs (line 199) | class TsvDocPairs(_TsvBase, BaseDocPairs):
    method __init__ (line 200) | def __init__(self, docpairs_dlc, docpair_cls=GenericDocPair):
    method docpairs_path (line 203) | def docpairs_path(self):
    method docpairs_iter (line 206) | def docpairs_iter(self):
    method docpairs_cls (line 209) | def docpairs_cls(self):

FILE: ir_datasets/formats/webarc.py
  class WarcDoc (line 10) | class WarcDoc(NamedTuple):
    method default_text (line 17) | def default_text(self):
  class WarcDocs (line 25) | class WarcDocs(BaseDocs):
    method __init__ (line 26) | def __init__(self, id_header='WARC-TREC-ID', warc_cw09=False, lang=None):
    method docs_iter (line 32) | def docs_iter(self):
    method _docs_warc_lib (line 35) | def _docs_warc_lib(self):
    method _docs_ctxt_iter_warc (line 40) | def _docs_ctxt_iter_warc(self, warcf):
    method docs_path (line 67) | def docs_path(self, force=True):
    method _docs_iter_source_files (line 70) | def _docs_iter_source_files(self):
    method _docs_id_to_source_file (line 73) | def _docs_id_to_source_file(self, doc_id):
    method _docs_warc_file_counts (line 77) | def _docs_warc_file_counts(self):
    method _docs_source_file_to_checkpoint (line 80) | def _docs_source_file_to_checkpoint(self, source_file):
    method docs_store (line 84) | def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTI...
    method docs_cls (line 88) | def docs_cls(self):
    method docs_count (line 91) | def docs_count(self):
    method docs_lang (line 94) | def docs_lang(self):

FILE: ir_datasets/indices/base.py
  class FileAccess (line 5) | class FileAccess(Enum):
  class DocstoreOptions (line 12) | class DocstoreOptions:
  class Docstore (line 19) | class Docstore:
    method __init__ (line 20) | def __init__(self, doc_cls, id_field='doc_id', options: DocstoreOption...
    method get (line 26) | def get(self, doc_id, field=None):
    method get_many (line 32) | def get_many(self, doc_ids, field=None):
    method get_many_iter (line 42) | def get_many_iter(self, doc_ids):
    method clear_cache (line 45) | def clear_cache(self):

FILE: ir_datasets/indices/cache_docstore.py
  class CacheDocstore (line 7) | class CacheDocstore(Docstore):
    method __init__ (line 8) | def __init__(self, full_store, path, cache_cls=Lz4PickleLookup, option...
    method get_many_iter (line 14) | def get_many_iter(self, doc_ids):
    method clear_cache (line 26) | def clear_cache(self):

FILE: ir_datasets/indices/clueweb_warc.py
  class WarcIndexFile (line 8) | class WarcIndexFile:
    method __init__ (line 9) | def __init__(self, fileobj, mode, doc_id_size=25):
    method write (line 15) | def write(self, doc_id, doc_idx, state, pos, out_offset):
    method read (line 28) | def read(self):
    method peek_doc_id (line 45) | def peek_doc_id(self):
    method peek_doc_idx (line 48) | def peek_doc_idx(self):
    method __bool__ (line 54) | def __bool__(self):
    method __enter__ (line 58) | def __enter__(self):
    method __exit__ (line 61) | def __exit__(self, exc_type, exc_val, exc_tb):
    method close (line 64) | def close(self):
  class ClueWebWarcIndex (line 68) | class ClueWebWarcIndex:
    method __init__ (line 69) | def __init__(self, source_path, index_path, id_field='WARC-TREC-ID', w...
    method build (line 76) | def build(self, checkpoint_freq=8*1024*1024):
    method built (line 100) | def built(self):
    method get_many_iter (line 103) | def get_many_iter(self, doc_ids, docs_obj):
  class ClueWebWarcDocstore (line 132) | class ClueWebWarcDocstore(Docstore):
    method __init__ (line 133) | def __init__(self, warc_docs, options=None):
    method get_many_iter (line 139) | def get_many_iter(self, doc_ids):
  class WarcIter (line 162) | class WarcIter:
    method __init__ (line 163) | def __init__(self, warc_docs, slice):
    method __next__ (line 174) | def __next__(self):
    method close (line 221) | def close(self):
    method __iter__ (line 233) | def __iter__(self):
    method __del__ (line 236) | def __del__(self):
    method __getitem__ (line 239) | def __getitem__(self, key):

FILE: ir_datasets/indices/indexed_tsv_docstore.py
  class ZPickleKeyValueStore (line 14) | class ZPickleKeyValueStore:
    method __init__ (line 15) | def __init__(self, path, value_encoder=None):
    method built (line 20) | def built(self):
    method idx (line 23) | def idx(self):
    method bin (line 28) | def bin(self):
    method purge (line 33) | def purge(self):
    method transaction (line 42) | def transaction(self):
    method __getitem__ (line 47) | def __getitem__(self, value):
    method path (line 68) | def path(self, force=True):
    method __iter__ (line 71) | def __iter__(self):
    method __len__ (line 83) | def __len__(self):
  class IndexedTsvKeyValueStore (line 89) | class IndexedTsvKeyValueStore:
    method __init__ (line 90) | def __init__(self, path, value_encoder=None):
    method built (line 96) | def built(self):
    method idx (line 99) | def idx(self):
    method tsv (line 104) | def tsv(self):
    method purge (line 109) | def purge(self):
    method transaction (line 118) | def transaction(self):
    method __getitem__ (line 123) | def __getitem__(self, value):
    method path (line 156) | def path(self, force=True):
    method __iter__ (line 159) | def __iter__(self):
    method __len__ (line 179) | def __len__(self):
  class IndexedTsvDocStoreTransaction (line 184) | class IndexedTsvDocStoreTransaction:
    method __init__ (line 185) | def __init__(self, docstore):
    method __enter__ (line 191) | def __enter__(self):
    method __exit__ (line 194) | def __exit__(self, exc_type, exc_val, exc_tb):
    method commit (line 200) | def commit(self):
    method discard (line 207) | def discard(self):
    method add (line 210) | def add(self, key, fields):
  class ZPickleDocStoreTransaction (line 221) | class ZPickleDocStoreTransaction:
    method __init__ (line 222) | def __init__(self, docstore):
    method __enter__ (line 228) | def __enter__(self):
    method __exit__ (line 231) | def __exit__(self, exc_type, exc_val, exc_tb):
    method commit (line 237) | def commit(self):
    method discard (line 242) | def discard(self):
    method add (line 245) | def add(self, key, fields):
  class NumpyPosIndex (line 255) | class NumpyPosIndex:
    method __init__ (line 256) | def __init__(self, path):
    method add (line 265) | def add(self, did, idx):
    method commit (line 270) | def commit(self):
    method _lazy_load (line 287) | def _lazy_load(self):
    method get (line 295) | def get(self, did):
    method close (line 303) | def close(self):
    method __iter__ (line 312) | def __iter__(self):
    method __len__ (line 318) | def __len__(self):
  function dir_size (line 324) | def dir_size(path):
  class IndexedTsvDocstore (line 336) | class IndexedTsvDocstore:
    method __init__ (line 339) | def __init__(self, path, doc_cls, value_encoder='json', id_field='doc_...
    method built (line 346) | def built(self):
    method purge (line 349) | def purge(self):
    method build (line 352) | def build(self, documents):
    method get (line 357) | def get(self, did, field=None):
    method get_many (line 363) | def get_many(self, dids, field=None):
    method num_docs (line 372) | def num_docs(self):
    method docids (line 375) | def docids(self):
    method iter_docs (line 378) | def iter_docs(self):
    method path (line 382) | def path(self, force=True):
    method file_size (line 385) | def file_size(self):

FILE: ir_datasets/indices/lz4_pickle.py
  function _read_next (line 20) | def _read_next(f, data_cls):
  function _skip_next (line 29) | def _skip_next(f):
  function _write_next (line 34) | def _write_next(f, record):
  function safe_str (line 44) | def safe_str(s):
  class Lz4PickleIter (line 48) | class Lz4PickleIter:
    method __init__ (line 49) | def __init__(self, lookup, slice):
    method __next__ (line 56) | def __next__(self):
    method __iter__ (line 83) | def __iter__(self):
    method __del__ (line 86) | def __del__(self):
    method __getitem__ (line 95) | def __getitem__(self, key):
  class Lz4PickleLookup (line 111) | class Lz4PickleLookup:
    method __init__ (line 112) | def __init__(
    method bin (line 145) | def bin(self):
    method pos (line 166) | def pos(self):
    method idx (line 171) | def idx(self):
    method close (line 176) | def close(self):
    method clear (line 187) | def clear(self):
    method __del__ (line 195) | def __del__(self):
    method transaction (line 199) | def transaction(self):
    method __getitem__ (line 210) | def __getitem__(self, values):
    method path (line 231) | def path(self, force=True):
    method __iter__ (line 234) | def __iter__(self):
    method __len__ (line 237) | def __len__(self):
  class Lz4PickleTransaction (line 242) | class Lz4PickleTransaction:
    method __init__ (line 243) | def __init__(self, lookup):
    method __enter__ (line 251) | def __enter__(self):
    method __exit__ (line 263) | def __exit__(self, exc_type, exc_val, exc_tb):
    method commit (line 270) | def commit(self):
    method rollback (line 282) | def rollback(self):
    method add (line 294) | def add(self, record):
  class PickleLz4FullStore (line 307) | class PickleLz4FullStore(Docstore):
    method __init__ (line 308) | def __init__(
    method get_many_iter (line 334) | def get_many_iter(self, keys):
    method build (line 338) | def build(self):
    method built (line 355) | def built(self):
    method clear_cache (line 358) | def clear_cache(self):
    method __iter__ (line 361) | def __iter__(self):
    method count (line 365) | def count(self):

FILE: ir_datasets/indices/numpy_sorted_index.py
  class NumpySortedIndex (line 5) | class NumpySortedIndex:
    method __init__ (line 6) | def __init__(self, path, file_access=FileAccess.MMAP):
    method add (line 16) | def add(self, key, idx):
    method commit (line 21) | def commit(self):
    method _exists (line 46) | def _exists(self):
    method _lazy_load (line 49) | def _lazy_load(self):
    method __getitem__ (line 64) | def __getitem__(self, keys):
    method close (line 76) | def close(self):
    method clear (line 85) | def clear(self):
    method __del__ (line 92) | def __del__(self):
    method __iter__ (line 95) | def __iter__(self):
    method __len__ (line 102) | def __len__(self):
  class NumpyPosIndex (line 110) | class NumpyPosIndex:
    method __init__ (line 111) | def __init__(self, path, file_access=FileAccess.MMAP):
    method add (line 118) | def add(self, idx):
    method commit (line 123) | def commit(self):
    method _exists (line 139) | def _exists(self):
    method _lazy_load (line 142) | def _lazy_load(self):
    method __getitem__ (line 153) | def __getitem__(self, idxs):
    method close (line 163) | def close(self):
    method clear (line 168) | def clear(self):
    method __del__ (line 173) | def __del__(self):
    method __iter__ (line 176) | def __iter__(self):
    method __len__ (line 183) | def __len__(self):

FILE: ir_datasets/indices/zpickle_docstore.py
  class ZPickleKeyValueStore (line 15) | class ZPickleKeyValueStore:
    method __init__ (line 16) | def __init__(self, path, id_idx, doc_cls):
    method built (line 23) | def built(self):
    method idx (line 26) | def idx(self):
    method bin (line 31) | def bin(self):
    method purge (line 36) | def purge(self):
    method transaction (line 45) | def transaction(self):
    method __getitem__ (line 50) | def __getitem__(self, value):
    method path (line 72) | def path(self, force=True):
    method __iter__ (line 75) | def __iter__(self):
    method __len__ (line 88) | def __len__(self):
  class ZPickleDocStoreTransaction (line 93) | class ZPickleDocStoreTransaction:
    method __init__ (line 94) | def __init__(self, docstore):
    method __enter__ (line 100) | def __enter__(self):
    method __exit__ (line 103) | def __exit__(self, exc_type, exc_val, exc_tb):
    method commit (line 109) | def commit(self):
    method discard (line 114) | def discard(self):
    method add (line 117) | def add(self, key, fields):
  class ZPickleDocStore (line 127) | class ZPickleDocStore:
    method __init__ (line 130) | def __init__(self, path, doc_cls, id_field='doc_id'):
    method built (line 137) | def built(self):
    method purge (line 140) | def purge(self):
    method build (line 143) | def build(self, documents):
    method get (line 148) | def get(self, did, field=None):
    method get_many (line 153) | def get_many(self, dids, field=None):
    method num_docs (line 162) | def num_docs(self):
    method docids (line 165) | def docids(self):
    method __iter__ (line 168) | def __iter__(self):
    method path (line 171) | def path(self, force=True):

FILE: ir_datasets/lazy_libs.py
  function numpy (line 5) | def numpy():
  function tqdm (line 12) | def tqdm():
  function requests (line 19) | def requests():
  function bs4 (line 26) | def bs4():
  function inscriptis (line 36) | def inscriptis():
  function yaml (line 46) | def yaml():
  function json (line 53) | def json():
  function trec_car (line 60) | def trec_car():
  function warc (line 69) | def warc():
  function warc_clueweb09 (line 78) | def warc_clueweb09():
  function lz4_block (line 87) | def lz4_block():
  function lz4_frame (line 93) | def lz4_frame():
  function zlib_state (line 99) | def zlib_state():
  function xml_etree (line 108) | def xml_etree():
  function lxml_html (line 114) | def lxml_html():
  function ijson (line 120) | def ijson():
  function pyautocorpus (line 129) | def pyautocorpus():
  function unlzw3 (line 138) | def unlzw3():
  function pyarrow_parquet (line 147) | def pyarrow_parquet():

FILE: ir_datasets/log.py
  class TqdmHandler (line 9) | class TqdmHandler(logging.StreamHandler):
    method __init__ (line 10) | def __init__(self):
    method emit (line 13) | def emit(self, record):
  class Logger (line 36) | class Logger:
    method __init__ (line 37) | def __init__(self, name):
    method logger (line 41) | def logger(self):
    method debug (line 55) | def debug(self, text, **kwargs):
    method info (line 58) | def info(self, text, **kwargs):
    method warn (line 61) | def warn(self, text, **kwargs):
    method error (line 64) | def error(self, text, **kwargs):
    method critical (line 67) | def critical(self, text, **kwargs):
    method log (line 70) | def log(self, level, text, **kwargs):
    method pbar (line 73) | def pbar(self, it, *args, **kwargs):
    method pbar_raw (line 103) | def pbar_raw(self, *args, **kwargs):
    method duration (line 137) | def duration(self, message, level='INFO'):
  function easy (line 151) | def easy(name=None):
  function format_interval (line 158) | def format_interval(t):

FILE: ir_datasets/util/__init__.py
  function tmp_path (line 23) | def tmp_path():
  function home_path (line 30) | def home_path():
  function finialized_file (line 38) | def finialized_file(path, mode):
  class Lazy (line 55) | class Lazy:
    method __init__ (line 56) | def __init__(self, fn):
    method __call__ (line 62) | def __call__(self):
    method is_loaded (line 71) | def is_loaded(self):
  function apply_sub_slice (line 75) | def apply_sub_slice(orig_slice: slice, new_slice: slice):
  function slice_idx (line 130) | def slice_idx(orig_slice: slice, index: int):
  class DocstoreSplitter (line 138) | class DocstoreSplitter:
    method __init__ (line 139) | def __init__(self, it, docs_store):
    method __iter__ (line 143) | def __iter__(self):
    method __next__ (line 146) | def __next__(self):
    method __getitem__ (line 149) | def __getitem__(self, key):
  function use_docstore (line 153) | def use_docstore(fn):
  class Migrator (line 166) | class Migrator:
    method __init__ (line 167) | def __init__(self, version_file, version, affected_files, message=None...
    method __getattr__ (line 175) | def __getattr__(self, attr):
    method __call__ (line 181) | def __call__(self, wrapped):
    method _migrate (line 184) | def _migrate(self, fn):
    method _read_version (line 209) | def _read_version(self):
  function check_disk_free (line 214) | def check_disk_free(target_path, required_size, message='Insufficient di...
  function format_file_size (line 244) | def format_file_size(size):
  function ws_tok (line 253) | def ws_tok(s):

FILE: ir_datasets/util/docs/lazy.py
  class IRDSDocuments (line 13) | class IRDSDocuments(BaseDocs):
    method __init__ (line 16) | def __init__(self, ds_id: str):
    method docs (line 24) | def docs(self):
    method docs_cls (line 27) | def docs_cls(self):
    method docs_lang (line 30) | def docs_lang(self):
    method docs_count (line 33) | def docs_count(self):
    method docs_iter (line 36) | def docs_iter(self):
  class LazyDocs (line 40) | class LazyDocs(IRDSDocuments):
    method docs_store (line 42) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
  class DirectAccessDocs (line 46) | class DirectAccessDocs(Protocol):
    method __call__ (line 47) | def __call__(self) -> Sequence:
  class DocsListView (line 52) | class DocsListView:
    method __init__ (line 55) | def __init__(self, docs: "DocsList", slice: slice):
    method __getitem__ (line 59) | def __getitem__(self, slice: Union[int, slice]):
  class DocsList (line 66) | class DocsList(ABC):
    method get (line 70) | def get(self, ix: int):
    method __len__ (line 74) | def __len__(self):
    method __getitem__ (line 77) | def __getitem__(self, slice: Union[int, slice]):
  class LazyDocsIter (line 84) | class LazyDocsIter:
    method __init__ (line 88) | def __init__(self, _get_list_fn: DirectAccessDocs, iter):
    method _list (line 93) | def _list(self):
    method __getitem__ (line 96) | def __getitem__(self, slice: Union[int, slice]):
    method __iter__ (line 99) | def __iter__(self):
    method __next__ (line 102) | def __next__(self):
  class BaseTransformedDocs (line 106) | class BaseTransformedDocs(BaseDocs):
    method __init__ (line 107) | def __init__(self, docs: BaseDocs, cls, store_name, count=None):
    method docs_cls (line 118) | def docs_cls(self):
    method docs_lang (line 121) | def docs_lang(self):
    method docs_count (line 124) | def docs_count(self):
    method docs_store (line 128) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
  class TransformedDocs (line 140) | class TransformedDocs(BaseTransformedDocs):
    method __init__ (line 141) | def __init__(
    method docs_store (line 155) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_iter (line 160) | def docs_iter(self):
  class TransformedDocstore (line 166) | class TransformedDocstore(Docstore):
    method __init__ (line 169) | def __init__(self, store, transform):
    method get_many (line 173) | def get_many(self, doc_ids, field=None):
  class IterDocs (line 180) | class IterDocs(BaseDocs):
    method __init__ (line 183) | def __init__(
    method docs_count (line 200) | def docs_count(self):
    method docs_iter (line 205) | def docs_iter(self):
    method docs_cls (line 211) | def docs_cls(self):
    method docs_store (line 215) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):
    method docs_namespace (line 226) | def docs_namespace(self):
    method docs_lang (line 229) | def docs_lang(self):

FILE: ir_datasets/util/docs/multiple.py
  class PrefixedDocsSpec (line 15) | class PrefixedDocsSpec:
    method length (line 26) | def length(self):
  class PrefixedDocstore (line 30) | class PrefixedDocstore(Docstore):
    method __init__ (line 31) | def __init__(self, docs_mapping: List[PrefixedDocsSpec], field="doc_id...
    method get_many (line 38) | def get_many(self, doc_ids: Sequence[str], field=None):
  class PrefixedDocs (line 68) | class PrefixedDocs(BaseDocs):
    method __init__ (line 71) | def __init__(self, store_name: Optional[str], *docs_mapping: PrefixedD...
    method lazy_self (line 83) | def lazy_self(self):
    method docs_cls (line 111) | def docs_cls(self):
    method docs_namespace (line 114) | def docs_namespace(self):
    method docs_lang (line 117) | def docs_lang(self):
    method __iter__ (line 120) | def __iter__(self):
    method _iter (line 123) | def _iter(self):
    method docs_iter (line 130) | def docs_iter(self):
    method docs_count (line 135) | def docs_count(self):
    method docs_store (line 142) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):

FILE: ir_datasets/util/docs/subset.py
  class DocsSubsetList (line 15) | class DocsSubsetList(DocsList):
    method __init__ (line 18) | def __init__(self, main: "DocsSubset", indices: array.array):
    method get (line 22) | def get(self, ix: int):
    method __len__ (line 31) | def __len__(self):
  class Dupes (line 35) | class Dupes:
    method __init__ (line 36) | def __init__(self, base: BaseDownload, prefix: Optional[str] = None):
    method remove_prefix (line 42) | def remove_prefix(self, doc_id: str):
    method doc_ids (line 47) | def doc_ids(self):
    method has (line 56) | def has(self, doc_id: str):
    method __len__ (line 59) | def __len__(self):
  class ColonCommaDupes (line 63) | class ColonCommaDupes(Dupes):
    method doc_ids (line 70) | def doc_ids(self):
  class DocsSubset (line 83) | class DocsSubset(BaseDocs):
    method __init__ (line 86) | def __init__(self, store_name: str, docs: BaseDocs, removed_ids: "Dupe...
    method docs_list (line 92) | def docs_list(self):
    method docs_cls (line 118) | def docs_cls(self):
    method docs_lang (line 121) | def docs_lang(self):
    method docs_count (line 124) | def docs_count(self):
    method docs_iter (line 129) | def docs_iter(self):
    method docs_namespace (line 139) | def docs_namespace(self):
    method docs_store (line 142) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS):

FILE: ir_datasets/util/download.py
  class BaseDownload (line 20) | class BaseDownload:
    method stream (line 21) | def stream(self):
  class GoogleCloudBucketStream (line 24) | class GoogleCloudBucketStream(BaseDownload):
    method __init__ (line 25) | def __init__(self, url, tries=None):
    method __repr__ (line 29) | def __repr__(self):
    method stream (line 33) | def stream(self):
  class GoogleDriveDownload (line 39) | class GoogleDriveDownload(BaseDownload):
    method __init__ (line 40) | def __init__(self, url, tries=None):
    method stream (line 44) | def stream(self):
  class RequestsDownload (line 64) | class RequestsDownload(BaseDownload):
    method __init__ (line 65) | def __init__(self, url, tries=None, cookies=None, headers=None, auth=N...
    method stream (line 73) | def stream(self):
    method __iter__ (line 77) | def __iter__(self):
    method _iter_response_data (line 145) | def _iter_response_data(self, response, http_args, skip):
    method __repr__ (line 168) | def __repr__(self):
    method _handle_auth (line 171) | def _handle_auth(self, http_args):
  class LocalDownload (line 192) | class LocalDownload(BaseDownload):
    method __init__ (line 193) | def __init__(self, path, message=None, mkdir=True):
    method path (line 199) | def path(self, force=True):
    method stream (line 207) | def stream(self):
  function _cleanup_tmp (line 215) | def _cleanup_tmp(file):
  class Download (line 222) | class Download:
    method __init__ (line 225) | def __init__(self, mirrors, cache_path=None, expected_md5=None, dua=No...
    method path (line 234) | def path(self, force=True):
    method stream (line 283) | def stream(self):
    method dua_ctxt (line 295) | def dua_ctxt(cls, dua):
  class _DownloadConfig (line 301) | class _DownloadConfig:
    method __init__ (line 302) | def __init__(self, file=None, base_path=None, contents=None, dua=None,...
    method contents (line 311) | def contents(self):
    method context (line 317) | def context(self, key, base_path=None, dua=None):
    method get_home_path (line 321) | def get_home_path(self):
    method get_download_path (line 326) | def get_download_path(self):
    method __getitem__ (line 332) | def __getitem__(self, key):

FILE: ir_datasets/util/fileio.py
  class IterStream (line 21) | class IterStream(io.RawIOBase):
    method __init__ (line 22) | def __init__(self, it):
    method readable (line 27) | def readable(self):
    method readinto (line 30) | def readinto(self, b):
  class Cache (line 45) | class Cache:
    method __init__ (line 46) | def __init__(self, streamer, path):
    method verify (line 50) | def verify(self):
    method stream (line 77) | def stream(self):
    method path (line 82) | def path(self, force=True):
  class TarExtract (line 88) | class TarExtract:
    method __init__ (line 89) | def __init__(self, streamer, tar_path, compression='gz'):
    method stream (line 95) | def stream(self):
  class TarExtractAll (line 109) | class TarExtractAll:
    method __init__ (line 110) | def __init__(self, streamer, extract_path, compression='gz', path_glob...
    method path (line 116) | def path(self, force=True):
    method stream (line 134) | def stream(self):
  class RelativePath (line 138) | class RelativePath:
    method __init__ (line 139) | def __init__(self, streamer, path):
    method path (line 143) | def path(self, force=True):
    method stream (line 147) | def stream(self):
  class ReTar (line 152) | class ReTar:
    method __init__ (line 153) | def __init__(self, streamer, output_file, keep_globs, compression='gz'):
    method stream (line 160) | def stream(self):
  class GzipExtract (line 177) | class GzipExtract:
    method __init__ (line 178) | def __init__(self, streamer):
    method __getattr__ (line 181) | def __getattr__(self, attr):
    method stream (line 185) | def stream(self):
  class Bz2Extract (line 190) | class Bz2Extract:
    method __init__ (line 191) | def __init__(self, streamer):
    method __getattr__ (line 194) | def __getattr__(self, attr):
    method stream (line 198) | def stream(self):
  class Lz4Extract (line 203) | class Lz4Extract:
    method __init__ (line 204) | def __init__(self, streamer):
    method __getattr__ (line 207) | def __getattr__(self, attr):
    method stream (line 211) | def stream(self):
  class ZipExtract (line 217) | class ZipExtract:
    method __init__ (line 218) | def __init__(self, dlc, zip_path):
    method path (line 222) | def path(self, force=True):
    method stream (line 226) | def stream(self):
  class ZipExtractCache (line 234) | class ZipExtractCache:
    method __init__ (line 235) | def __init__(self, dlc, extract_path):
    method path (line 239) | def path(self, force=True):
    method stream (line 250) | def stream(self):
  class StringFile (line 254) | class StringFile:
    method __init__ (line 255) | def __init__(self, contents, path='MOCK'):
    method path (line 261) | def path(self, force=True):
    method stream (line 265) | def stream(self):
  class PackageDataFile (line 269) | class PackageDataFile:
    method __init__ (line 270) | def __init__(self, path, package='ir_datasets'):
    method path (line 274) | def path(self, force=True):
    method stream (line 278) | def stream(self):

FILE: ir_datasets/util/hash.py
  class HashVerificationError (line 10) | class HashVerificationError(IOError):
  class HashVerifier (line 14) | class HashVerifier:
    method __init__ (line 15) | def __init__(self, expected, algo='md5'):
    method update (line 20) | def update(self, b):
    method __enter__ (line 23) | def __enter__(self):
    method __exit__ (line 27) | def __exit__(self, exc_type, exc_val, exc_tb):
  class HashStream (line 37) | class HashStream(io.RawIOBase):
    method __init__ (line 38) | def __init__(self, stream, expected, algo='md5'):
    method readable (line 44) | def readable(self):
    method readinto (line 47) | def readinto(self, b):

FILE: ir_datasets/util/html_parsing.py
  function find_charset (line 7) | def find_charset(text):
  function decode_html (line 22) | def decode_html(body, headers=None):
  function sax_html_parser (line 31) | def sax_html_parser(body, headers=None, force_encoding=None, fields=None):
  class SaxExtractor (line 47) | class SaxExtractor:
    method __init__ (line 49) | def __init__(self, fields):
    method get_values (line 55) | def get_values(self):
    method _join_text (line 58) | def _join_text(self, text):
    method data (line 68) | def data(self, data):
    method start (line 76) | def start(self, tag, attrs):
    method end (line 84) | def end(self, tag):
    method close (line 92) | def close(self):
    method comment (line 95) | def comment(self, data):
    method doctype (line 98) | def doctype(self, *args):
    method pi (line 101) | def pi(self, *args):

FILE: ir_datasets/util/metadata.py
  class MetadataComponent (line 8) | class MetadataComponent:
    method __init__ (line 9) | def __init__(self, dataset_id, dataset, provider=None):
    method dataset_id (line 18) | def dataset_id(self):
    method metadata (line 21) | def metadata(self):
    method _metadata (line 28) | def _metadata(self, etype: ir_datasets.EntityType):
    method _count (line 31) | def _count(self, etype):
  class MetadataProvider (line 47) | class MetadataProvider:
    method __init__ (line 48) | def __init__(self, metadata_loader: Callable[[], Dict[str, Any]]):
    method get_metadata (line 52) | def get_metadata(self, dsid: str, entity_type: ir_datasets.EntityType)...
    method json_loader (line 62) | def json_loader(dlc):
  function count_hint (line 72) | def count_hint(

FILE: ir_datasets/util/registry.py
  class Registry (line 11) | class Registry:
    method __init__ (line 12) | def __init__(self, allow_overwrite=False):
    method __getitem__ (line 17) | def __getitem__(self, key):
    method __iter__ (line 31) | def __iter__(self):
    method register (line 34) | def register(self, name, obj):
    method register_pattern (line 45) | def register_pattern(self, pattern, initializer):

FILE: ir_datasets/wrappers/html_extractor.py
  function bs4_extract (line 11) | def bs4_extract(html):
  function inscriptis_extract (line 21) | def inscriptis_extract(html):
  class HtmlDocIter (line 26) | class HtmlDocIter:
    method __init__ (line 27) | def __init__(self, it, extractor):
    method __next__ (line 32) | def __next__(self):
    method __iter__ (line 35) | def __iter__(self):
    method __getitem__ (line 38) | def __getitem__(self, key):
  class HtmlDocExtractor (line 45) | class HtmlDocExtractor:
    method __init__ (line 46) | def __init__(self, dataset, extractor='bs4', parallel=0.8):
    method __getattr__ (line 68) | def __getattr__(self, attr):
    method docs_iter (line 71) | def docs_iter(self):
    method docs_store (line 74) | def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTI...
  class HtmlDocExtractorDocStoreWrapper (line 78) | class HtmlDocExtractorDocStoreWrapper(ir_datasets.indices.Docstore):
    method __init__ (line 79) | def __init__(self, docstore, extractor):
    method get_many_iter (line 84) | def get_many_iter(self, doc_ids):
    method clear_cache (line 87) | def clear_cache(self):
  function _doc_map_it (line 92) | def _doc_map_it(it, extractor):
  function _doc_map (line 123) | def _doc_map(args):

FILE: test/downloads.py
  function tmp_environ (line 22) | def tmp_environ(**kwargs):
  class TestDownloads (line 38) | class TestDownloads(unittest.TestCase):
    method test_downloads (line 45) | def test_downloads(self):
    method _test_download_iter (line 68) | def _test_download_iter(self, data, prefix=''):
    method _test_download (line 81) | def _test_download(self, data, download_id):

FILE: test/formats/test_trec.py
  class TestTrec (line 8) | class TestTrec(unittest.TestCase):
    method test_qrels (line 10) | def test_qrels(self):
    method test_qrels_bad_line (line 33) | def test_qrels_bad_line(self):
    method test_queries (line 50) | def test_queries(self):
    method test_docs (line 89) | def test_docs(self):
    method tearDown (line 130) | def tearDown(self):

FILE: test/formats/test_tsv.py
  class TestTsv (line 9) | class TestTsv(unittest.TestCase):
    method test_core (line 11) | def test_core(self):
    method test_too_many_columns (line 41) | def test_too_many_columns(self):
    method test_too_few_columns (line 66) | def test_too_few_columns(self):
    method test_flex_columns (line 90) | def test_flex_columns(self):
    method tearDown (line 122) | def tearDown(self):

FILE: test/indices/lz4_pickle.py
  class TestLz4PickleLookup (line 8) | class TestLz4PickleLookup(unittest.TestCase):
    method test_lz4_pickle_lookup (line 9) | def test_lz4_pickle_lookup(self):

FILE: test/indices/numpy_sorted.py
  class TestNumpySortedIndex (line 7) | class TestNumpySortedIndex(unittest.TestCase):
    method test_numpy_sorted_index (line 8) | def test_numpy_sorted_index(self):

FILE: test/integration/antique.py
  class TestAntique (line 6) | class TestAntique(DatasetIntegrationTest):
    method test_antique (line 7) | def test_antique(self):
    method test_antique_train (line 14) | def test_antique_train(self):
    method test_antique_train_split200train (line 26) | def test_antique_train_split200train(self):
    method test_antique_train_split200valid (line 38) | def test_antique_train_split200valid(self):
    method test_antique_test (line 50) | def test_antique_test(self):
    method test_antique_test_nonoffensive (line 62) | def test_antique_test_nonoffensive(self):

FILE: test/integration/aol_ia.py
  class TestAolIa (line 10) | class TestAolIa(DatasetIntegrationTest):
    method test_docs (line 11) | def test_docs(self):
    method test_queries (line 18) | def test_queries(self):
    method test_qrels (line 25) | def test_qrels(self):
    method test_qlog (line 32) | def test_qlog(self):

FILE: test/integration/aquaint.py
  class TestAquaint (line 7) | class TestAquaint(DatasetIntegrationTest):
    method test_docs (line 8) | def test_docs(self):
    method test_queries (line 15) | def test_queries(self):
    method test_qrels (line 22) | def test_qrels(self):

FILE: test/integration/argsme.py
  class TestArgsMe (line 11) | class TestArgsMe(DatasetIntegrationTest):
    method test_docs (line 13) | def test_docs(self):

FILE: test/integration/base.py
  class DatasetIntegrationTest (line 9) | class DatasetIntegrationTest(unittest.TestCase):
    method _test_docs (line 10) | def _test_docs(self, dataset_name, count=None, items=None, test_docsto...
    method _test_queries (line 49) | def _test_queries(self, dataset_name, count=None, items=None):
    method _test_qrels (line 71) | def _test_qrels(self, dataset_name, count=None, items=None):
    method _test_qlogs (line 93) | def _test_qlogs(self, dataset_name, count=None, items=None):
    method _test_docpairs (line 115) | def _test_docpairs(self, dataset_name, count=None, items=None):
    method _build_test_docs (line 137) | def _build_test_docs(self, dataset_name, include_count=True, include_i...
    method _build_test_queries (line 157) | def _build_test_queries(self, dataset_name):
    method _build_test_qrels (line 173) | def _build_test_qrels(self, dataset_name):
    method _build_test_scoreddocs (line 189) | def _build_test_scoreddocs(self, dataset_name):
    method _build_test_docpairs (line 205) | def _build_test_docpairs(self, dataset_name):
    method _test_scoreddocs (line 217) | def _test_scoreddocs(self, dataset_name, count=None, items=None):
    method _build_test_qlogs (line 237) | def _build_test_qlogs(self, dataset_name):
    method _assert_namedtuple (line 249) | def _assert_namedtuple(self, a, b):
    method _replace_regex_namedtuple (line 268) | def _replace_regex_namedtuple(self, tup, maxlen=200):
    method _repr_namedtuples (line 287) | def _repr_namedtuples(self, items):
    method _repr_namedtuple (line 294) | def _repr_namedtuple(self, value):

FILE: test/integration/beir.py
  class TestBeir (line 8) | class TestBeir(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 157) | def test_queries(self):
    method test_qrels (line 399) | def test_qrels(self):

FILE: test/integration/c4.py
  class TestCar (line 7) | class TestCar(DatasetIntegrationTest):
    method test_docs (line 8) | def test_docs(self):
    method test_queries (line 19) | def test_queries(self):

FILE: test/integration/car.py
  class TestCar (line 8) | class TestCar(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 21) | def test_queries(self):
    method test_qrels (line 58) | def test_qrels(self):

FILE: test/integration/clinicaltrials.py
  class TestClinicalTrials (line 13) | class TestClinicalTrials(DatasetIntegrationTest):
    method test_docs (line 14) | def test_docs(self):
    method test_queries (line 31) | def test_queries(self):
    method test_qrels (line 58) | def test_qrels(self):

FILE: test/integration/clirmatrix.py
  class TestCLIRMatrix (line 12) | class TestCLIRMatrix(DatasetIntegrationTest):
    method test_docs (line 13) | def test_docs(self):
    method test_queries (line 35) | def test_queries(self):
    method test_qrels (line 82) | def test_qrels(self):

FILE: test/integration/clueweb09.py
  class TestClueWeb09 (line 12) | class TestClueWeb09(DatasetIntegrationTest):
    method test_clueweb09_docs (line 13) | def test_clueweb09_docs(self):
    method test_clueweb09_docstore (line 76) | def test_clueweb09_docstore(self):
    method test_clueweb09_queries (line 97) | def test_clueweb09_queries(self):
    method test_clueweb09_qrels (line 144) | def test_clueweb09_qrels(self):

FILE: test/integration/clueweb12.py
  class TestClueWeb12 (line 12) | class TestClueWeb12(DatasetIntegrationTest):
    method test_clueweb12_docs (line 13) | def test_clueweb12_docs(self):
    method test_clueweb12_docs_html (line 25) | def test_clueweb12_docs_html(self):
    method test_clueweb12_docstore (line 44) | def test_clueweb12_docstore(self):
    method test_clueweb12_queries (line 67) | def test_clueweb12_queries(self):
    method test_clueweb12_qrels (line 135) | def test_clueweb12_qrels(self):

FILE: test/integration/codec.py
  class TestCodec (line 8) | class TestCodec(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 38) | def test_qrels(self):

FILE: test/integration/codesearchnet.py
  class TestCodeSearchNet (line 8) | class TestCodeSearchNet(DatasetIntegrationTest):
    method test_codesearchnet_docs (line 9) | def test_codesearchnet_docs(self):
    method test_codesearchnet_queries (line 16) | def test_codesearchnet_queries(self):
    method test_codesearchnet_qrels (line 38) | def test_codesearchnet_qrels(self):

FILE: test/integration/cord19.py
  class TestCord19 (line 8) | class TestCord19(DatasetIntegrationTest):
    method test_cord19_docs (line 9) | def test_cord19_docs(self):
    method test_cord19_queries (line 46) | def test_cord19_queries(self):
    method test_cord19_qrels (line 83) | def test_cord19_qrels(self):

FILE: test/integration/cranfield.py
  class TestCranfield (line 8) | class TestCranfield(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 23) | def test_qrels(self):

FILE: test/integration/csl.py
  class TestCsl (line 8) | class TestCsl(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 23) | def test_qrels(self):

FILE: test/integration/disks45.py
  class TestDisks45 (line 8) | class TestDisks45(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 58) | def test_qrels(self):

FILE: test/integration/dpr_w100.py
  class TestDprW100 (line 8) | class TestDprW100(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 38) | def test_qrels(self):

FILE: test/integration/dummy.py
  class TestDummy (line 10) | class TestDummy(DatasetIntegrationTest):
    method test_dummy_docs (line 11) | def test_dummy_docs(self):
    method test_dummy_queries (line 23) | def test_dummy_queries(self):
    method test_dummy_qrels (line 35) | def test_dummy_qrels(self):
    method tearDown (line 46) | def tearDown(self):

FILE: test/integration/gov.py
  class TestGov (line 12) | class TestGov(DatasetIntegrationTest):
    method test_docs (line 13) | def test_docs(self):
    method test_queries (line 20) | def test_queries(self):
    method test_gov2_qrels (line 47) | def test_gov2_qrels(self):

FILE: test/integration/gov2.py
  class TestGov2 (line 12) | class TestGov2(DatasetIntegrationTest):
    method test_gov2_docs (line 13) | def test_gov2_docs(self):
    method test_gov2_docstore (line 20) | def test_gov2_docstore(self):
    method test_gov2_queries (line 35) | def test_gov2_queries(self):
    method test_gov2_qrels (line 107) | def test_gov2_qrels(self):

FILE: test/integration/hc4.py
  class TestHC4 (line 6) | class TestHC4(DatasetIntegrationTest):
    method test_hc4_zh_docs (line 7) | def test_hc4_zh_docs(self):
    method test_hc4_fa_docs (line 14) | def test_hc4_fa_docs(self):
    method test_hc4_ru_docs (line 21) | def test_hc4_ru_docs(self):
    method test_hc4_zh_query (line 28) | def test_hc4_zh_query(self):
    method test_hc4_fa_query (line 40) | def test_hc4_fa_query(self):
    method test_hc4_ru_query (line 53) | def test_hc4_ru_query(self):
    method test_hc4_zh_qrels (line 64) | def test_hc4_zh_qrels(self):
    method test_hc4_fa_qrels (line 78) | def test_hc4_fa_qrels(self):
    method test_hc4_ru_qrels (line 92) | def test_hc4_ru_qrels(self):

FILE: test/integration/highwire.py
  class TestHighwire (line 12) | class TestHighwire(DatasetIntegrationTest):
    method test_highwire_docs (line 13) | def test_highwire_docs(self):
    method test_highwire_queries (line 21) | def test_highwire_queries(self):
    method test_highwire_qrels (line 34) | def test_highwire_qrels(self):

FILE: test/integration/istella22.py
  class TestIstella22 (line 9) | class TestIstella22(DatasetIntegrationTest):
    method test_docs (line 10) | def test_docs(self):
    method test_queries (line 17) | def test_queries(self):
    method test_qrels (line 49) | def test_qrels(self):

FILE: test/integration/kilt.py
  class TestKilt (line 9) | class TestKilt(DatasetIntegrationTest):
    method test_docs (line 10) | def test_docs(self):
    method test_queries (line 17) | def test_queries(self):
    method test_qrels (line 39) | def test_qrels(self):

FILE: test/integration/lotte.py
  class TestLotte (line 8) | class TestLotte(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 71) | def test_queries(self):
    method test_qrels (line 193) | def test_qrels(self):

FILE: test/integration/medline.py
  class TestMedline (line 12) | class TestMedline(DatasetIntegrationTest):
    method test_medline_docs (line 13) | def test_medline_docs(self):
    method test_medline_queries (line 25) | def test_medline_queries(self):
    method test_medline_qrels (line 47) | def test_medline_qrels(self):

FILE: test/integration/miracl.py
  class TestMiracl (line 12) | class TestMiracl(DatasetIntegrationTest):
    method test_docs (line 13) | def test_docs(self):
    method test_queries (line 105) | def test_queries(self):
    method test_qrels (line 422) | def test_qrels(self):

FILE: test/integration/mmarco.py
  class TestMMarco (line 11) | class TestMMarco(DatasetIntegrationTest):
    method test_docs (line 12) | def test_docs(self):
    method test_queries (line 120) | def test_queries(self):
    method test_qrels (line 471) | def test_qrels(self):
    method test_scoreddocs (line 634) | def test_scoreddocs(self):
    method test_docpairs (line 742) | def test_docpairs(self):

FILE: test/integration/mr_tydi.py
  class TestMrTydi (line 8) | class TestMrTydi(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 66) | def test_queries(self):
    method test_qrels (line 288) | def test_qrels(self):

FILE: test/integration/msmarco_document.py
  class TestMsMarcoDocument (line 8) | class TestMsMarcoDocument(DatasetIntegrationTest):
    method test_msmarco_document_docs (line 9) | def test_msmarco_document_docs(self):
    method test_msmarco_document_queries (line 16) | def test_msmarco_document_queries(self):
    method test_msmarco_document_qrels (line 89) | def test_msmarco_document_qrels(self):
    method test_msmarco_document_scoreddocs (line 156) | def test_msmarco_document_scoreddocs(self):
    method test_anchor_text (line 189) | def test_anchor_text(self):

FILE: test/integration/msmarco_document_v2.py
  class TestMsMarcoDocumentV2 (line 8) | class TestMsMarcoDocumentV2(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 83) | def test_qrels(self):
    method test_scoreddocs (line 150) | def test_scoreddocs(self):
    method test_anchor_text (line 192) | def test_anchor_text(self):

FILE: test/integration/msmarco_passage.py
  class TestMsMarcoPassage (line 6) | class TestMsMarcoPassage(DatasetIntegrationTest):
    method test_msmarco_passage_docs (line 7) | def test_msmarco_passage_docs(self):
    method test_msmarco_passage_queries (line 24) | def test_msmarco_passage_queries(self):
    method test_msmarco_passage_qrels (line 131) | def test_msmarco_passage_qrels(self):
    method test_msmarco_passage_docpairs (line 229) | def test_msmarco_passage_docpairs(self):
    method test_msmarco_passage_scoreddocs (line 251) | def test_msmarco_passage_scoreddocs(self):

FILE: test/integration/msmarco_passage_v2.py
  class TestMsMarcoPassageV2 (line 9) | class TestMsMarcoPassageV2(DatasetIntegrationTest):
    method test_docs (line 10) | def test_docs(self):
    method test_queries (line 27) | def test_queries(self):
    method test_qrels (line 74) | def test_qrels(self):
    method test_scoreddocs (line 121) | def test_scoreddocs(self):

FILE: test/integration/msmarco_qna.py
  class TestMsMarcoQnA (line 8) | class TestMsMarcoQnA(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 33) | def test_qrels(self):
    method test_scoreddocs (line 45) | def test_scoreddocs(self):

FILE: test/integration/nano_beir.py
  class TestBeir (line 8) | class TestBeir(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 77) | def test_queries(self):
    method test_qrels (line 144) | def test_qrels(self):

FILE: test/integration/natural_questions.py
  class TestNq (line 8) | class TestNq(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 28) | def test_qrels(self):
    method test_scoreddocs (line 40) | def test_scoreddocs(self):

FILE: test/integration/neuclir.py
  class TestNeuCLIR22 (line 7) | class TestNeuCLIR22(DatasetIntegrationTest):
    method test_docs (line 8) | def test_docs(self):
    method test_queries (line 45) | def test_queries(self):
    method test_qrels (line 97) | def test_qrels(self):

FILE: test/integration/neumarco.py
  class TestNeuMarco (line 7) | class TestNeuMarco(DatasetIntegrationTest):
    method test_docs (line 8) | def test_docs(self):
    method test_queries (line 25) | def test_queries(self):
    method test_qrels (line 53) | def test_qrels(self):
    method test_docpairs (line 81) | def test_docpairs(self):

FILE: test/integration/nfcorpus.py
  class TestNf (line 12) | class TestNf(DatasetIntegrationTest):
    method test_nf_docs (line 13) | def test_nf_docs(self):
    method test_nf_queries (line 20) | def test_nf_queries(self):
    method test_gov2_qrels (line 67) | def test_gov2_qrels(self):

FILE: test/integration/nyt.py
  class TestNyt (line 8) | class TestNyt(DatasetIntegrationTest):
    method test_nyt_docs (line 9) | def test_nyt_docs(self):
    method test_nyt_queries (line 16) | def test_nyt_queries(self):
    method test_nyt_qrels (line 38) | def test_nyt_qrels(self):

FILE: test/integration/pmc.py
  class TestPmc (line 12) | class TestPmc(DatasetIntegrationTest):
    method test_pmc_docs (line 13) | def test_pmc_docs(self):
    method test_pmc_queries (line 25) | def test_pmc_queries(self):
    method test_pmc_qrels (line 42) | def test_pmc_qrels(self):

FILE: test/integration/sara.py
  class TestSara (line 8) | class TestSara(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 24) | def test_qrels(self):

FILE: test/integration/touche.py
  class TestTouche (line 11) | class TestTouche(DatasetIntegrationTest):
    method test_queries (line 14) | def test_queries(self):
    method test_qrels (line 177) | def test_qrels(self):

FILE: test/integration/touche_image.py
  class TestToucheImage (line 8) | class TestToucheImage(DatasetIntegrationTest):
    method test_docs (line 11) | def test_docs(self):

FILE: test/integration/trec_arabic.py
  class TestTrecArabic (line 7) | class TestTrecArabic(DatasetIntegrationTest):
    method test_trec_arabic_docs (line 8) | def test_trec_arabic_docs(self):
    method test_trec_arabic_queries (line 15) | def test_trec_arabic_queries(self):
    method test_trec_arabic_qrels (line 27) | def test_trec_arabic_qrels(self):

FILE: test/integration/trec_cast.py
  class TestTrecCast (line 8) | class TestTrecCast(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 29) | def test_queries(self):
    method test_qrels (line 72) | def test_qrels(self):
    method test_scoreddocs (line 114) | def test_scoreddocs(self):

FILE: test/integration/trec_fair.py
  class TestFairTrec (line 8) | class TestFairTrec(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 21) | def test_queries(self):
    method test_qrels (line 38) | def test_qrels(self):

FILE: test/integration/trec_mandarin.py
  class TestTrecMandarin (line 8) | class TestTrecMandarin(DatasetIntegrationTest):
    method test_trec_mandarin_docs (line 9) | def test_trec_mandarin_docs(self):
    method test_trec_mandarin_queries (line 16) | def test_trec_mandarin_queries(self):
    method test_trec_mandarin_qrels (line 28) | def test_trec_mandarin_qrels(self):

FILE: test/integration/trec_robust04.py
  class TestTrecRobust04 (line 7) | class TestTrecRobust04(DatasetIntegrationTest):
    method test_trec_robust04_docs (line 8) | def test_trec_robust04_docs(self):
    method test_trec_robust04_queries (line 16) | def test_trec_robust04_queries(self):
    method test_trec_robust04_qrels (line 48) | def test_trec_robust04_qrels(self):

FILE: test/integration/trec_spanish.py
  class TestTrecSpanish (line 8) | class TestTrecSpanish(DatasetIntegrationTest):
    method test_trec_spanish_docs (line 9) | def test_trec_spanish_docs(self):
    method test_trec_spanish_queries (line 16) | def test_trec_spanish_queries(self):
    method test_trec_spanish_qrels (line 28) | def test_trec_spanish_qrels(self):

FILE: test/integration/trec_tot.py
  class TestTipOfTheTongue (line 9) | class TestTipOfTheTongue(DatasetIntegrationTest):
    method test_tip_of_the_tongue_docs (line 10) | def test_tip_of_the_tongue_docs(self):
    method test_test_tip_of_the_tongue_queries_train (line 17) | def test_test_tip_of_the_tongue_queries_train(self):
    method test_test_tip_of_the_tongue_queries_dev (line 24) | def test_test_tip_of_the_tongue_queries_dev(self):
    method test_test_tip_of_the_tongue_qrels_train (line 31) | def test_test_tip_of_the_tongue_qrels_train(self):
    method test_test_tip_of_the_tongue_qrels_dev (line 39) | def test_test_tip_of_the_tongue_qrels_dev(self):

FILE: test/integration/trec_tot_2024.py
  class TestTipOfTheTongue (line 9) | class TestTipOfTheTongue(DatasetIntegrationTest):
    method test_tip_of_the_tongue_docs (line 10) | def test_tip_of_the_tongue_docs(self):
    method test_tip_of_the_tongue_queries (line 16) | def test_tip_of_the_tongue_queries(self):

FILE: test/integration/trec_tot_2025/test_docs_iter.py
  function load_dataset (line 3) | def load_dataset():
  function load_doc_number (line 7) | def load_doc_number(num):
  class TestDocsIter (line 14) | class TestDocsIter(unittest.TestCase):
    method test_dataset_can_be_loaded (line 15) | def test_dataset_can_be_loaded(self):
    method test_first_doc (line 19) | def test_first_doc(self):
    method test_third_doc (line 30) | def test_third_doc(self):

FILE: test/integration/trec_tot_2025/test_docs_store.py
  function load_docs_store (line 3) | def load_docs_store():
  class TestDocsStore (line 7) | class TestDocsStore(unittest.TestCase):
    method test_docs_store_can_be_loaded (line 8) | def test_docs_store_can_be_loaded(self):
    method test_first_doc (line 12) | def test_first_doc(self):
    method test_third_doc (line 23) | def test_third_doc(self):
    method test_some_random_doc (line 33) | def test_some_random_doc(self):

FILE: test/integration/trec_tot_2025/test_qrel_iter.py
  function load_dataset (line 3) | def load_dataset(dataset_id):
  function load_qrel_number (line 7) | def load_qrel_number(dataset_id, num):
  class TestQrelIter (line 14) | class TestQrelIter(unittest.TestCase):
    method test_train_dataset_can_be_loaded (line 15) | def test_train_dataset_can_be_loaded(self):
    method test_dev1_dataset_can_be_loaded (line 19) | def test_dev1_dataset_can_be_loaded(self):
    method test_dev2_dataset_can_be_loaded (line 23) | def test_dev2_dataset_can_be_loaded(self):
    method test_dev3_dataset_can_be_loaded (line 27) | def test_dev3_dataset_can_be_loaded(self):
    method test_train_qrel_iter (line 31) | def test_train_qrel_iter(self):
    method test_dev1_qrel_iter (line 37) | def test_dev1_qrel_iter(self):
    method test_dev2_qrel_iter (line 43) | def test_dev2_qrel_iter(self):
    method test_dev3_qrel_iter (line 49) | def test_dev3_qrel_iter(self):

FILE: test/integration/trec_tot_2025/test_queries_iter.py
  function load_dataset (line 3) | def load_dataset(dataset_id):
  function load_query_number (line 7) | def load_query_number(dataset_id, num):
  class TestQueriesIter (line 14) | class TestQueriesIter(unittest.TestCase):
    method test_train_dataset_can_be_loaded (line 15) | def test_train_dataset_can_be_loaded(self):
    method test_dev1_dataset_can_be_loaded (line 19) | def test_dev1_dataset_can_be_loaded(self):
    method test_dev2_dataset_can_be_loaded (line 23) | def test_dev2_dataset_can_be_loaded(self):
    method test_dev3_dataset_can_be_loaded (line 27) | def test_dev3_dataset_can_be_loaded(self):
    method test_query_from_train_dataset_can_be_loaded_01 (line 31) | def test_query_from_train_dataset_can_be_loaded_01(self):
    method test_query_from_train_dataset_can_be_loaded_02 (line 37) | def test_query_from_train_dataset_can_be_loaded_02(self):
    method test_query_from_dev1_dataset_can_be_loaded_01 (line 43) | def test_query_from_dev1_dataset_can_be_loaded_01(self):
    method test_query_from_dev1_dataset_can_be_loaded_02 (line 49) | def test_query_from_dev1_dataset_can_be_loaded_02(self):
    method test_query_from_dev2_dataset_can_be_loaded_01 (line 55) | def test_query_from_dev2_dataset_can_be_loaded_01(self):
    method test_query_from_dev2_dataset_can_be_loaded_02 (line 61) | def test_query_from_dev2_dataset_can_be_loaded_02(self):
    method test_query_from_dev3_dataset_can_be_loaded_01 (line 67) | def test_query_from_dev3_dataset_can_be_loaded_01(self):
    method test_query_from_dev3_dataset_can_be_loaded_02 (line 73) | def test_query_from_dev3_dataset_can_be_loaded_02(self):
    method test_query_from_test_dataset_can_be_loaded_01 (line 79) | def test_query_from_test_dataset_can_be_loaded_01(self):
    method test_query_from_test_dataset_can_be_loaded_02 (line 85) | def test_query_from_test_dataset_can_be_loaded_02(self):

FILE: test/integration/tripclick.py
  class TestTripclick (line 9) | class TestTripclick(DatasetIntegrationTest):
    method test_docs (line 10) | def test_docs(self):
    method test_queries (line 22) | def test_queries(self):
    method test_qlogs (line 94) | def test_qlogs(self):
    method test_qrels (line 103) | def test_qrels(self):
    method test_scoreddocs (line 155) | def test_scoreddocs(self):
    method test_docpairs (line 202) | def test_docpairs(self):

FILE: test/integration/tweets2013_ia.py
  class TestTweets2013Ia (line 8) | class TestTweets2013Ia(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 16) | def test_queries(self):
    method test_qrels (line 28) | def test_qrels(self):

FILE: test/integration/vaswani.py
  class TestVaswani (line 7) | class TestVaswani(DatasetIntegrationTest):
    method test_vaswani_docs (line 8) | def test_vaswani_docs(self):
    method test_vaswani_queries (line 15) | def test_vaswani_queries(self):
    method test_vaswani_qrels (line 22) | def test_vaswani_qrels(self):

FILE: test/integration/wapo.py
  class TestWapo (line 8) | class TestWapo(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 23) | def test_queries(self):
    method test_qrels (line 45) | def test_qrels(self):

FILE: test/integration/wikiclir.py
  class TestWikiclir (line 8) | class TestWikiclir(DatasetIntegrationTest):
    method test_docs (line 9) | def test_docs(self):
    method test_queries (line 136) | def test_queries(self):
    method test_qrels (line 264) | def test_qrels(self):

FILE: test/integration/wikir.py
  class TestWikir (line 7) | class TestWikir(DatasetIntegrationTest):
    method test_docs (line 8) | def test_docs(self):
    method test_queries (line 45) | def test_queries(self):
    method test_qrels (line 152) | def test_qrels(self):
    method test_scoreddocs (line 259) | def test_scoreddocs(self):

FILE: test/metadata.py
  class TestMetadata (line 5) | class TestMetadata(unittest.TestCase):
    method test_all_metadata_available (line 6) | def test_all_metadata_available(self):
    method _test_ds (line 24) | def _test_ds(self, dsid):

FILE: test/test_defaulttext.py
  function template_instance (line 4) | def template_instance(Cls):
  class TestMetadata (line 21) | class TestMetadata(unittest.TestCase):
    method test_all_defualttext (line 22) | def test_all_defualttext(self):
    method _test_defaulttet (line 26) | def _test_defaulttet(self, dsid):

FILE: test/util.py
  class TestUtil (line 5) | class TestUtil(unittest.TestCase):
    method test_apply_sub_slice (line 6) | def test_apply_sub_slice(self):
    method test_corpus_id (line 23) | def test_corpus_id(self):
    method test_html_find_charset (line 38) | def test_html_find_charset(self):
    method test_decode_html (line 48) | def test_decode_html(self):
    method test_sax_html_parser (line 52) | def test_sax_html_parser(self):

FILE: test/util/docs/data.py
  class OtherDoc (line 8) | class OtherDoc:
    method __init__ (line 9) | def __init__(self, id: str, text: str):
  class FakeDocs (line 14) | class FakeDocs(BaseDocs):
    method __init__ (line 15) | def __init__(self, n_docs: int, namespace = 'test', lang='en', docs_cl...
    method docs_count (line 27) | def docs_count(self):
    method docs_iter (line 30) | def docs_iter(self):
    method docs_cls (line 33) | def docs_cls(self):
    method docs_lang (line 36) | def docs_lang(self):
    method docs_namepace (line 39) | def docs_namepace(self):
    method docs_store (line 42) | def docs_store(self, field="doc_id", options=DEFAULT_DOCSTORE_OPTIONS)...
  class FakeDocstore (line 46) | class FakeDocstore(Docstore):
    method __init__ (line 47) | def __init__(self, docs: FakeDocs):
    method get_many (line 50) | def get_many(self, doc_ids, field=None):

FILE: test/util/docs/test_multiple.py
  function test_multiple_prefixes (line 8) | def test_multiple_prefixes():
  function test_multiple_prefixes_inlined (line 62) | def test_multiple_prefixes_inlined():

FILE: test/util/docs/test_subset.py
  class SimpleDupes (line 4) | class SimpleDupes(Dupes):
    method __init__ (line 5) | def __init__(self, doc_ids):
  function test_subset_simple (line 9) | def test_subset_simple():
Condensed preview — 269 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (3,416K chars).
[
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "chars": 484,
    "preview": "---\nname: Bug report\nabout: Errors in behavior or functionality\ntitle: ''\nlabels: bug\nassignees: ''\n\n---\n\n**Describe the"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/dataset-addition.md",
    "chars": 1468,
    "preview": "---\nname: Dataset Addition\nabout: Propose adding a new dataset, collection of related datasets, or feature to\n  existing"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/documentation.md",
    "chars": 183,
    "preview": "---\nname: Documentation\nabout: Additions to or improvmenets to the documentation\ntitle: ''\nlabels: documentation\nassigne"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "chars": 604,
    "preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: ''\nlabels: enhancement\nassignees: ''\n\n---\n\n**Is"
  },
  {
    "path": ".github/workflows/deploy.yml",
    "chars": 576,
    "preview": "name: deploy\n\non:\n  release:\n    types: [created]\n\njobs:\n  pypi:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actio"
  },
  {
    "path": ".github/workflows/test.yml",
    "chars": 1244,
    "preview": "name: test\n\non:\n  push: {branches: [master]} # pushes to master\n  pull_request: {} # all PRs\n\njobs:\n  pytest:\n    strate"
  },
  {
    "path": ".gitignore",
    "chars": 2046,
    "preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
  },
  {
    "path": "LICENSE",
    "chars": 11358,
    "preview": "\n                                 Apache License\n                           Version 2.0, January 2004\n                  "
  },
  {
    "path": "MANIFEST.in",
    "chars": 110,
    "preview": "recursive-include ir_datasets *.yaml\nrecursive-include ir_datasets *.bib\nrecursive-include ir_datasets *.json\n"
  },
  {
    "path": "README.md",
    "chars": 11554,
    "preview": "# ir_datasets\n\n`ir_datasets` is a python package that provides a common interface to many IR ad-hoc ranking\nbenchmarks, "
  },
  {
    "path": "examples/adding_datasets.ipynb",
    "chars": 8734,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# ir_datasets - Adding Datasets\"\n  "
  },
  {
    "path": "examples/clirmatrix_example.py",
    "chars": 2106,
    "preview": "import ir_datasets\n\"\"\"\ndataset name\nclirmatrix/[query language code]/dataset/[doc language code]/[split]\n\noptions:\n-----"
  },
  {
    "path": "examples/ir_datasets.ipynb",
    "chars": 114685,
    "preview": "{\n  \"nbformat\": 4,\n  \"nbformat_minor\": 0,\n  \"metadata\": {\n    \"colab\": {\n      \"name\": \"ir-datasets.ipynb\",\n      \"prove"
  },
  {
    "path": "examples/ir_datasets_cli.ipynb",
    "chars": 50773,
    "preview": "{\n  \"nbformat\": 4,\n  \"nbformat_minor\": 0,\n  \"metadata\": {\n    \"colab\": {\n      \"name\": \"ir_datasets-cli.ipynb\",\n      \"p"
  },
  {
    "path": "ir_datasets/__init__.py",
    "chars": 3287,
    "preview": "from enum import Enum\nclass EntityType(Enum):\n    docs = \"docs\"\n    queries = \"queries\"\n    qrels = \"qrels\"\n    scoreddo"
  },
  {
    "path": "ir_datasets/__main__.py",
    "chars": 74,
    "preview": "import ir_datasets\n\nif __name__ == '__main__':\n    ir_datasets.main_cli()\n"
  },
  {
    "path": "ir_datasets/commands/__init__.py",
    "chars": 638,
    "preview": "from . import doc_fifos\nfrom . import export\nfrom . import lookup\nfrom . import list as list_cmd\nfrom . import build_clu"
  },
  {
    "path": "ir_datasets/commands/build_c4_checkpoints.py",
    "chars": 3161,
    "preview": "import os\nimport sys\nimport multiprocessing\nfrom pathlib import Path\nimport gzip\nimport hashlib\nimport json\nimport pickl"
  },
  {
    "path": "ir_datasets/commands/build_clueweb_warc_indexes.py",
    "chars": 1728,
    "preview": "import sys\nimport multiprocessing\nfrom pathlib import Path\nimport argparse\nimport ir_datasets\n\n\n_logger = ir_datasets.lo"
  },
  {
    "path": "ir_datasets/commands/build_download_cache.py",
    "chars": 2441,
    "preview": "import sys\nimport time\nimport io\nimport os\nimport argparse\nimport json\nfrom contextlib import contextmanager\nimport ir_d"
  },
  {
    "path": "ir_datasets/commands/clean.py",
    "chars": 4090,
    "preview": "import sys\nimport os\nimport argparse\nimport multiprocessing\nfrom collections import deque\nimport ir_datasets\nfrom ir_dat"
  },
  {
    "path": "ir_datasets/commands/doc_fifos.py",
    "chars": 2983,
    "preview": "import sys\nimport os\nimport select\nimport tempfile\nimport contextlib\nimport json\nimport argparse\nimport multiprocessing\n"
  },
  {
    "path": "ir_datasets/commands/export.py",
    "chars": 10515,
    "preview": "import sys\nimport json\nimport argparse\nimport ir_datasets\n\n\n_logger = ir_datasets.log.easy()\n\n\ndef main_docs(dataset, ar"
  },
  {
    "path": "ir_datasets/commands/generate_metadata.py",
    "chars": 3293,
    "preview": "import time\nimport sys\nimport os\nimport json\nimport argparse\nfrom pathlib import Path\nfrom fnmatch import fnmatch\nimport"
  },
  {
    "path": "ir_datasets/commands/list.py",
    "chars": 476,
    "preview": "import sys\nimport argparse\nimport ir_datasets\nfrom ir_datasets.commands.export import DEFAULT_EXPORTERS\n\n\n_logger = ir_d"
  },
  {
    "path": "ir_datasets/commands/lookup.py",
    "chars": 1910,
    "preview": "import sys\nimport argparse\nimport ir_datasets\nfrom ir_datasets.commands.export import DEFAULT_EXPORTERS\n\n\n_logger = ir_d"
  },
  {
    "path": "ir_datasets/datasets/__init__.py",
    "chars": 1478,
    "preview": "from . import base\nfrom . import antique\nfrom . import aol_ia\nfrom . import aquaint\nfrom . import argsme\nfrom . import b"
  },
  {
    "path": "ir_datasets/datasets/antique.py",
    "chars": 5400,
    "preview": "import io\nimport ir_datasets\nfrom ir_datasets.formats import TsvDocs, TrecQrels, TsvQueries\nfrom ir_datasets.util import"
  },
  {
    "path": "ir_datasets/datasets/aol_ia.py",
    "chars": 8476,
    "preview": "from datetime import datetime\nimport json\nimport pickle\nimport re\nimport contextlib\nfrom collections import Counter\nfrom"
  },
  {
    "path": "ir_datasets/datasets/aquaint.py",
    "chars": 1413,
    "preview": "import ir_datasets\nfrom ir_datasets.util import DownloadConfig\nfrom ir_datasets.formats import TrecQrels, TrecDocs, Trec"
  },
  {
    "path": "ir_datasets/datasets/argsme.py",
    "chars": 3747,
    "preview": "from itertools import chain\nfrom typing import Dict\n\nfrom ir_datasets import registry\nfrom ir_datasets.datasets.base imp"
  },
  {
    "path": "ir_datasets/datasets/base.py",
    "chars": 13934,
    "preview": "import pkgutil\nimport contextlib\nimport itertools\nfrom pathlib import Path\nimport ir_datasets\nfrom ir_datasets.formats i"
  },
  {
    "path": "ir_datasets/datasets/beir.py",
    "chars": 10129,
    "preview": "import json\nimport codecs\nfrom typing import NamedTuple, Dict, List\nimport ir_datasets\nfrom ir_datasets.util import ZipE"
  },
  {
    "path": "ir_datasets/datasets/c4.py",
    "chars": 9363,
    "preview": "import re\nimport os\nimport json\nimport pickle\nfrom pathlib import Path\nfrom typing import NamedTuple, Tuple\nimport ir_da"
  },
  {
    "path": "ir_datasets/datasets/car.py",
    "chars": 6211,
    "preview": "from typing import NamedTuple, Tuple\nimport ir_datasets\nfrom ir_datasets.util import DownloadConfig, TarExtract, ReTar\nf"
  },
  {
    "path": "ir_datasets/datasets/clinicaltrials.py",
    "chars": 6369,
    "preview": "import codecs\nimport itertools\nimport io\nimport gzip\nfrom contextlib import ExitStack\nimport itertools\nfrom typing impor"
  },
  {
    "path": "ir_datasets/datasets/clirmatrix.py",
    "chars": 4642,
    "preview": "import json\nimport contextlib\nfrom pathlib import Path\nfrom typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets"
  },
  {
    "path": "ir_datasets/datasets/clueweb09.py",
    "chars": 13622,
    "preview": "import os\nimport codecs\nfrom pathlib import Path\nfrom typing import NamedTuple, Tuple\nfrom glob import glob\nimport ir_da"
  },
  {
    "path": "ir_datasets/datasets/clueweb12.py",
    "chars": 17599,
    "preview": "import codecs\nimport io\nimport os\nimport gzip\nimport contextlib\nfrom typing import NamedTuple, Tuple\nfrom glob import gl"
  },
  {
    "path": "ir_datasets/datasets/codec.py",
    "chars": 2906,
    "preview": "import json\nfrom typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.util import Lazy\nfrom ir_datasets.formats "
  },
  {
    "path": "ir_datasets/datasets/codesearchnet.py",
    "chars": 7252,
    "preview": "import json\nimport csv\nimport gzip\nfrom typing import NamedTuple\nimport io\nimport itertools\nfrom pathlib import Path\nimp"
  },
  {
    "path": "ir_datasets/datasets/cord19.py",
    "chars": 10327,
    "preview": "import io\nimport codecs\nimport json\nimport csv\nimport contextlib\nimport os\nimport shutil\nimport tarfile\nfrom collections"
  },
  {
    "path": "ir_datasets/datasets/cranfield.py",
    "chars": 5861,
    "preview": "import io\nimport codecs\nimport itertools\nimport ir_datasets\nfrom typing import NamedTuple\nfrom ir_datasets.util import D"
  },
  {
    "path": "ir_datasets/datasets/csl.py",
    "chars": 1936,
    "preview": "from typing import List, NamedTuple\nfrom enum import Enum\n\nimport ir_datasets\nfrom ir_datasets.util import DownloadConfi"
  },
  {
    "path": "ir_datasets/datasets/disks45.py",
    "chars": 5049,
    "preview": "import ir_datasets\nfrom ir_datasets.util import GzipExtract, TarExtract, Lazy, DownloadConfig\nfrom ir_datasets.formats i"
  },
  {
    "path": "ir_datasets/datasets/dpr_w100.py",
    "chars": 5964,
    "preview": "from typing import NamedTuple, Tuple\nimport contextlib\nimport itertools\nimport ir_datasets\nfrom ir_datasets.util import "
  },
  {
    "path": "ir_datasets/datasets/gov.py",
    "chars": 7318,
    "preview": "import re\nimport io\nimport os\nimport gzip\nimport codecs\nfrom collections import Counter\nfrom contextlib import contextma"
  },
  {
    "path": "ir_datasets/datasets/gov2.py",
    "chars": 17531,
    "preview": "import re\nimport io\nimport os\nimport gzip\nimport codecs\nfrom collections import Counter\nfrom contextlib import contextma"
  },
  {
    "path": "ir_datasets/datasets/hc4.py",
    "chars": 1760,
    "preview": "import ir_datasets\nfrom ir_datasets.util import DownloadConfig\nfrom ir_datasets.formats import TrecQrels\nfrom ir_dataset"
  },
  {
    "path": "ir_datasets/datasets/highwire.py",
    "chars": 7846,
    "preview": "import codecs\nfrom typing import NamedTuple, Tuple\nfrom zipfile import ZipFile\nimport ir_datasets\nfrom ir_datasets.util "
  },
  {
    "path": "ir_datasets/datasets/istella22.py",
    "chars": 2687,
    "preview": "import json\nimport codecs\nfrom typing import NamedTuple, Dict, List\nimport ir_datasets\nfrom ir_datasets.util import TarE"
  },
  {
    "path": "ir_datasets/datasets/kilt.py",
    "chars": 4598,
    "preview": "import json\nimport codecs\nfrom typing import NamedTuple, Tuple\nimport ir_datasets\nfrom ir_datasets.util import TarExtrac"
  },
  {
    "path": "ir_datasets/datasets/lotte.py",
    "chars": 2474,
    "preview": "import json\nimport codecs\nfrom typing import NamedTuple, Dict, List\nimport ir_datasets\nfrom ir_datasets.util import TarE"
  },
  {
    "path": "ir_datasets/datasets/medline.py",
    "chars": 10104,
    "preview": "import codecs\nimport itertools\nimport io\nimport gzip\nfrom contextlib import ExitStack\nimport itertools\nfrom typing impor"
  },
  {
    "path": "ir_datasets/datasets/miracl.py",
    "chars": 3618,
    "preview": "import ir_datasets\nfrom typing import NamedTuple\nfrom ir_datasets.util import DownloadConfig, GzipExtract\nfrom ir_datase"
  },
  {
    "path": "ir_datasets/datasets/mmarco.py",
    "chars": 4466,
    "preview": "import io\nimport codecs\nimport re\nimport ir_datasets\nfrom ir_datasets.util import DownloadConfig, Lazy\nfrom ir_datasets."
  },
  {
    "path": "ir_datasets/datasets/mr_tydi.py",
    "chars": 4162,
    "preview": "import json\nimport codecs\nfrom typing import NamedTuple, Dict\nimport ir_datasets\nfrom ir_datasets.util import TarExtract"
  },
  {
    "path": "ir_datasets/datasets/msmarco_document.py",
    "chars": 9031,
    "preview": "from typing import NamedTuple, List\nimport json\nimport ir_datasets\nfrom ir_datasets.indices import PickleLz4FullStore, D"
  },
  {
    "path": "ir_datasets/datasets/msmarco_document_v2.py",
    "chars": 10130,
    "preview": "import contextlib\nimport gzip\nimport io\nfrom pathlib import Path\nimport json\nfrom typing import NamedTuple, Tuple, List\n"
  },
  {
    "path": "ir_datasets/datasets/msmarco_passage.py",
    "chars": 19477,
    "preview": "import hashlib\nimport io\nimport codecs\nimport re\nimport ir_datasets\nfrom ir_datasets.util import Cache, TarExtract, Iter"
  },
  {
    "path": "ir_datasets/datasets/msmarco_passage_v2.py",
    "chars": 14189,
    "preview": "import re\nimport os\nimport contextlib\nimport gzip\nimport io\nfrom pathlib import Path\nimport json\nfrom typing import Name"
  },
  {
    "path": "ir_datasets/datasets/msmarco_qna.py",
    "chars": 16944,
    "preview": "import hashlib\nimport re\nimport itertools\nimport contextlib\nimport io\nimport codecs\nfrom typing import NamedTuple, Tuple"
  },
  {
    "path": "ir_datasets/datasets/nano_beir.py",
    "chars": 4150,
    "preview": "import ir_datasets\nfrom ir_datasets.datasets.base import Dataset, YamlDocumentation\nfrom ir_datasets.formats import (\n  "
  },
  {
    "path": "ir_datasets/datasets/natural_questions.py",
    "chars": 9935,
    "preview": "from typing import NamedTuple, List\nimport json\nimport contextlib\nimport ir_datasets\nfrom ir_datasets.datasets.base impo"
  },
  {
    "path": "ir_datasets/datasets/neuclir.py",
    "chars": 5979,
    "preview": "import gzip\nimport json\nfrom functools import lru_cache\n\nimport ir_datasets\nfrom ir_datasets.util import DownloadConfig,"
  },
  {
    "path": "ir_datasets/datasets/neumarco.py",
    "chars": 2522,
    "preview": "import io\nimport codecs\nimport re\nimport ir_datasets\nfrom ir_datasets.util import DownloadConfig, TarExtract, Cache\nfrom"
  },
  {
    "path": "ir_datasets/datasets/nfcorpus.py",
    "chars": 7548,
    "preview": "import io\nimport codecs\nimport re\nfrom typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.util import Cache, T"
  },
  {
    "path": "ir_datasets/datasets/nyt.py",
    "chars": 15807,
    "preview": "import io\nimport tarfile\nfrom typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.indices import PickleLz4FullS"
  },
  {
    "path": "ir_datasets/datasets/pmc.py",
    "chars": 6450,
    "preview": "import codecs\nimport tarfile\nimport itertools\nfrom typing import NamedTuple, Tuple\nfrom zipfile import ZipFile\nimport xm"
  },
  {
    "path": "ir_datasets/datasets/sara.py",
    "chars": 3346,
    "preview": "import ir_datasets\nfrom ir_datasets.datasets.base import Dataset, YamlDocumentation\nfrom ir_datasets.datasets.base impor"
  },
  {
    "path": "ir_datasets/datasets/touche.py",
    "chars": 8556,
    "preview": "from typing import Dict\n\nfrom ir_datasets import registry\nfrom ir_datasets.datasets.base import Dataset, YamlDocumentati"
  },
  {
    "path": "ir_datasets/datasets/touche_image.py",
    "chars": 1395,
    "preview": "from ir_datasets import registry\nfrom ir_datasets.datasets.base import Dataset, YamlDocumentation\nfrom ir_datasets.forma"
  },
  {
    "path": "ir_datasets/datasets/trec_arabic.py",
    "chars": 1614,
    "preview": "import ir_datasets\nfrom ir_datasets.util import DownloadConfig\nfrom ir_datasets.formats import TrecQrels, TrecDocs, Trec"
  },
  {
    "path": "ir_datasets/datasets/trec_cast.py",
    "chars": 26279,
    "preview": "import gzip\nfrom hashlib import md5\nimport os\nfrom functools import cached_property, lru_cache, partial\nfrom collections"
  },
  {
    "path": "ir_datasets/datasets/trec_fair.py",
    "chars": 11842,
    "preview": "import json\nimport codecs\nfrom typing import NamedTuple, Dict, List, Optional\nimport ir_datasets\nfrom ir_datasets.util i"
  },
  {
    "path": "ir_datasets/datasets/trec_mandarin.py",
    "chars": 2244,
    "preview": "from typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.util import GzipExtract, DownloadConfig\nfrom ir_datase"
  },
  {
    "path": "ir_datasets/datasets/trec_robust04.py",
    "chars": 3916,
    "preview": "import ir_datasets\nfrom ir_datasets.util import GzipExtract, Lazy, DownloadConfig\nfrom ir_datasets.formats import TrecQr"
  },
  {
    "path": "ir_datasets/datasets/trec_spanish.py",
    "chars": 3993,
    "preview": "from typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.util import GzipExtract, DownloadConfig\nfrom ir_datase"
  },
  {
    "path": "ir_datasets/datasets/trec_tot.py",
    "chars": 3743,
    "preview": "import ir_datasets\nfrom ir_datasets.util import ZipExtract, Cache, Lazy, DownloadConfig\nfrom ir_datasets.formats import "
  },
  {
    "path": "ir_datasets/datasets/trec_tot_2025.py",
    "chars": 4606,
    "preview": "from ir_datasets import registry\nfrom ir_datasets.datasets.base import Dataset, YamlDocumentation\nfrom ir_datasets.util."
  },
  {
    "path": "ir_datasets/datasets/tripclick.py",
    "chars": 16373,
    "preview": "from pathlib import Path\nimport json\nimport re\nimport os\nimport io\nimport hashlib\nfrom datetime import datetime\nfrom typ"
  },
  {
    "path": "ir_datasets/datasets/tweets2013_ia.py",
    "chars": 19234,
    "preview": "import os\nimport itertools\nimport contextlib\nimport shutil\nimport tarfile\nfrom collections import Counter\nfrom pathlib i"
  },
  {
    "path": "ir_datasets/datasets/vaswani.py",
    "chars": 3969,
    "preview": "import io\nimport itertools\nimport ir_datasets\nfrom ir_datasets.util import DownloadConfig, TarExtract, Cache\nfrom ir_dat"
  },
  {
    "path": "ir_datasets/datasets/wapo.py",
    "chars": 7511,
    "preview": "import io\nimport json\nimport tarfile\nfrom typing import NamedTuple, Tuple, Optional\nimport ir_datasets\nfrom ir_datasets."
  },
  {
    "path": "ir_datasets/datasets/wikiclir.py",
    "chars": 3077,
    "preview": "import contextlib\nfrom pathlib import Path\nfrom typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.util import"
  },
  {
    "path": "ir_datasets/datasets/wikir.py",
    "chars": 2129,
    "preview": "import contextlib\nfrom pathlib import Path\nfrom typing import NamedTuple\nimport ir_datasets\nfrom ir_datasets.util import"
  },
  {
    "path": "ir_datasets/docs/antique.yaml",
    "chars": 1265,
    "preview": "_:\n  pretty_name: 'ANTIQUE'\n  desc: '\n<p>\n\"ANTIQUE is a non-factoid quesiton answering dataset based on the questions an"
  },
  {
    "path": "ir_datasets/docs/aol-ia.yaml",
    "chars": 1291,
    "preview": "_:\n  pretty_name: 'AOL-IA (Internet Archive)'\n  desc: '\n<p>\nThis is a version of the AOL Query Log. Documents use versio"
  },
  {
    "path": "ir_datasets/docs/aquaint.yaml",
    "chars": 1689,
    "preview": "_:\n  pretty_name: 'AQUAINT'\n  desc: '\n<p>\nA document collection of about 1M English newswire text. Sources are the Xinhu"
  },
  {
    "path": "ir_datasets/docs/argsme.yaml",
    "chars": 4214,
    "preview": "_:\n  pretty_name: \"args.me\"\n  desc: |\n    <p>\n    The args.me corpus is one of the largest argument resources available\n"
  },
  {
    "path": "ir_datasets/docs/beir.yaml",
    "chars": 16887,
    "preview": "_:\n  pretty_name: 'Beir (benchmark suite)'\n  desc: '\n<p>\nBeir is a suite of benchmarks to test zero-shot transfer.\n</p>\n"
  },
  {
    "path": "ir_datasets/docs/bibliography.bib",
    "chars": 36793,
    "preview": "@inproceedings{Hashemi2020Antique,\n  title={ANTIQUE: A Non-Factoid Question Answering Benchmark},\n  author={Helia Hashem"
  },
  {
    "path": "ir_datasets/docs/c4.yaml",
    "chars": 643,
    "preview": "_:\n  pretty_name: 'C4'\n  desc: '\n<p>\nA version of <a href=\"https://www.tensorflow.org/datasets/catalog/c4\">Google''s C4 "
  },
  {
    "path": "ir_datasets/docs/car.yaml",
    "chars": 2537,
    "preview": "_:\n  pretty_name: 'TREC CAR'\n  desc: '\n<p>\nAn ad-hoc passage retrieval collection, constructed from Wikipedia and used a"
  },
  {
    "path": "ir_datasets/docs/clinicaltrials.yaml",
    "chars": 2705,
    "preview": "_:\n  pretty_name: 'Clinical Trials'\n  desc: '\n<p>\nClinical trial information from <a href=\"https://clinicaltrials.gov/\">"
  },
  {
    "path": "ir_datasets/docs/clirmatrix.yaml",
    "chars": 2195,
    "preview": "_:\n  pretty_name: 'CLIRMatrix'\n  desc: '\n<p>\nCLIRMatrix contains is massively large collection of bilingual and multilin"
  },
  {
    "path": "ir_datasets/docs/clueweb09.yaml",
    "chars": 4530,
    "preview": "_:\n  pretty_name: 'ClueWeb09'\n  desc: '\n<p>\nClueWeb 2009 web document collection. Contains over 1B web pages, in 10 lang"
  },
  {
    "path": "ir_datasets/docs/clueweb12.yaml",
    "chars": 7470,
    "preview": "_:\n  pretty_name: 'ClueWeb12'\n  desc: '\n<p>\nClueWeb 2012 web document collection. Contains 733M web pages.\n</p>\n<p>\nThe "
  },
  {
    "path": "ir_datasets/docs/codec.yaml",
    "chars": 1349,
    "preview": "_:\n  pretty_name: 'CODEC'\n  desc: '\n<p>\nCODEC Document Ranking sub-task.\n</p>\n<ul>\n<li>Documents: curated web articles</"
  },
  {
    "path": "ir_datasets/docs/codesearchnet.yaml",
    "chars": 1022,
    "preview": "_:\n  pretty_name: 'CodeSearchNet'\n  desc: '\n<p>\nA benchmark for semantic code search. Uses \n</p>\n<ul>\n  <li>Documents: C"
  },
  {
    "path": "ir_datasets/docs/cord19.yaml",
    "chars": 5002,
    "preview": "_:\n  pretty_name: 'CORD-19'\n  desc: '\n<p>\nCollection of scientific articles related to COVID-19.\n</p>\n<p>\nUses the 2020-"
  },
  {
    "path": "ir_datasets/docs/cranfield.yaml",
    "chars": 300,
    "preview": "_:\n  pretty_name: 'Cranfield'\n  desc: '\n<p>\nA small corpus of 1,400 scientific abstracts.\n</p>\n<ul>\n  <li>Documents: Sci"
  },
  {
    "path": "ir_datasets/docs/csl.yaml",
    "chars": 192,
    "preview": "_:\n  pretty_name: 'CSL'\n  desc: '\n<p>\nThe CSL dataset, used for the TREC NueCLIR technical document task.\n</p>\n'\n\ntrec-2"
  },
  {
    "path": "ir_datasets/docs/disks45.yaml",
    "chars": 4479,
    "preview": "_:\n  pretty_name: 'TREC Disks 4 and 5'\n  desc: '\n<p>\nTREC Disks 4 and 5, including documents from the Financial Times, t"
  },
  {
    "path": "ir_datasets/docs/dpr-w100.yaml",
    "chars": 2184,
    "preview": "_:\n  pretty_name: 'DPR Wiki100'\n  desc: '\n<p>\nA wikipedia dump from 20 December, 2018, split into passages of 100 words."
  },
  {
    "path": "ir_datasets/docs/gov.yaml",
    "chars": 3154,
    "preview": "_:\n  pretty_name: 'GOV'\n  desc: '\n<p>\nGOV web document collection. Used for early TREC Web Tracks. Not to be confused wi"
  },
  {
    "path": "ir_datasets/docs/gov2.yaml",
    "chars": 6202,
    "preview": "_:\n  pretty_name: 'GOV2'\n  desc: '\n<p>\nGOV2 web document collection. Used for the TREC Terabyte Track.\n</p>\n<p>\nThe data"
  },
  {
    "path": "ir_datasets/docs/hc4.yaml",
    "chars": 5257,
    "preview": "_:\n  pretty_name: 'HC4 (HLTCOE CLIR Common-Crawl Collection)'\n  desc: '\n<p>\nHC4 is a new suite of test collections for a"
  },
  {
    "path": "ir_datasets/docs/highwire.yaml",
    "chars": 1574,
    "preview": "_:\n  pretty_name: 'Highwire (TREC Genomics 2006-07)'\n  desc: '\n<p>\nMedical document collection from <a href=\"https://www"
  },
  {
    "path": "ir_datasets/docs/istella22.yaml",
    "chars": 1169,
    "preview": "_:\n  pretty_name: 'Istella22'\n  desc: '\n<p>\nThe Istella22 dataset facilitates comparisions between traditional and neura"
  },
  {
    "path": "ir_datasets/docs/kilt.yaml",
    "chars": 1144,
    "preview": "_:\n  pretty_name: 'KILT'\n  desc: '\n<p>\nKILT is a corpus used for various \"knowledge intensive language tasks\".\n</p>\n<ul>"
  },
  {
    "path": "ir_datasets/docs/lotte.yaml",
    "chars": 7868,
    "preview": "_:\n  pretty_name: 'LoTTE'\n  bibtex_ids: ['Santhanam2021ColBERTv2']\n  desc: '\n<p>\nLoTTE (Long-Tail Topic-stratified Evalu"
  },
  {
    "path": "ir_datasets/docs/medline.yaml",
    "chars": 3031,
    "preview": "_:\n  pretty_name: 'Medline'\n  desc: '\n<p>\nMedical articles from <a href=\"https://www.nlm.nih.gov/bsd/medline.html\">Medli"
  },
  {
    "path": "ir_datasets/docs/miracl.yaml",
    "chars": 8198,
    "preview": "_:\n  pretty_name: 'MIRACL'\n  desc: '\n<p>\nMIRACL is a multilingual adhoc retrieval dataset covering 18 languages.\nThe doc"
  },
  {
    "path": "ir_datasets/docs/mmarco.yaml",
    "chars": 17454,
    "preview": "_:\n  pretty_name: 'mMARCO'\n  desc: '\n<p>\nA version of the MS MARCO passage dataset (<a class=\"ds-ref\">msmarco-passage</a"
  },
  {
    "path": "ir_datasets/docs/mr-tydi.yaml",
    "chars": 5608,
    "preview": "_:\n  pretty_name: 'Mr. TyDi'\n  bibtex_ids: ['Zhang2021MrTyDi', 'Clark2020TyDiQa']\n  desc: '\n<p>\nA multi-lingual benchmar"
  },
  {
    "path": "ir_datasets/docs/msmarco-document-v2.yaml",
    "chars": 4301,
    "preview": "_:\n  pretty_name: 'MSMARCO (document, version 2)'\n  desc: '\n  <p>\nVersion 2 of the MS MARCO document ranking dataset. Th"
  },
  {
    "path": "ir_datasets/docs/msmarco-document.yaml",
    "chars": 5810,
    "preview": "_:\n  pretty_name: 'MSMARCO (document)'\n  desc: '\n  <p>\n\"Based the questions in the [MS-MARCO] Question Answering Dataset"
  },
  {
    "path": "ir_datasets/docs/msmarco-passage-v2.yaml",
    "chars": 3711,
    "preview": "_:\n  pretty_name: 'MSMARCO (passage, version 2)'\n  desc: '\n  <p>\nVersion 2 of the MS MARCO passage ranking dataset. The "
  },
  {
    "path": "ir_datasets/docs/msmarco-passage.yaml",
    "chars": 8734,
    "preview": "_:\n  pretty_name: 'MSMARCO (passage)'\n  desc: '\n<p>\nA passage ranking benchmark with a collection of 8.8 million passage"
  },
  {
    "path": "ir_datasets/docs/msmarco-qna.yaml",
    "chars": 2590,
    "preview": "_:\n  pretty_name: 'MSMARCO (QnA)'\n  desc: '\n<p>\nThe MS MARCO Question Answering dataset. This is the source collection o"
  },
  {
    "path": "ir_datasets/docs/nano-beir.yaml",
    "chars": 5988,
    "preview": "_:\n  pretty_name: 'Nano Beir (benchmark suite)'\n  desc: '\n<p>\nNano Beir is a smaller version (max 50 queries per benchma"
  },
  {
    "path": "ir_datasets/docs/natural-questions.yaml",
    "chars": 1381,
    "preview": "_:\n  pretty_name: 'Natural Questions'\n  desc: '\n<p>\nGoogle Natural Questions is a Q&amp;A dataset containing long, short"
  },
  {
    "path": "ir_datasets/docs/neuclir.yaml",
    "chars": 3992,
    "preview": "_:\n  pretty_name: 'NeuCLIR Corpus'\n  desc: '\n<p>\nThis is the dataset created for <a href=\"https://neuclir.github.io/\">TR"
  },
  {
    "path": "ir_datasets/docs/neumarco.yaml",
    "chars": 2860,
    "preview": "_:\n  pretty_name: \"neuMARCO\"\n  desc: '\n<p>\nA version of <a class=\"ds-ref\">msmarco-passage</a> for cross-language\ninforma"
  },
  {
    "path": "ir_datasets/docs/nfcorpus.yaml",
    "chars": 2124,
    "preview": "_:\n  pretty_name: 'NFCorpus (NutritionFacts)'\n  desc: '\n<p>\n\"NFCorpus is a full-text English retrieval data set for Medi"
  },
  {
    "path": "ir_datasets/docs/nyt.yaml",
    "chars": 2818,
    "preview": "_:\n  pretty_name: 'NYT'\n  desc: '\n  <p>\nThe New York Times Annotated Corpus. Consists of articles published between 1987"
  },
  {
    "path": "ir_datasets/docs/pmc.yaml",
    "chars": 1787,
    "preview": "_:\n  pretty_name: 'PubMed Central (TREC CDS)'\n  desc: '\n<p>\nBio-medical articles from <a href=\"https://www.ncbi.nlm.nih."
  },
  {
    "path": "ir_datasets/docs/sara.yaml",
    "chars": 372,
    "preview": "_: # matches documentation key above\n  pretty_name: 'SARA' # a more human-readable way to present this dataset than the "
  },
  {
    "path": "ir_datasets/docs/touche-image.yaml",
    "chars": 1846,
    "preview": "_:\n  pretty_name: \"Touché Image Search\"\n  desc: |\n    <p>\n    Focused crawl of about 23 841 images (and associated web p"
  },
  {
    "path": "ir_datasets/docs/touche.yaml",
    "chars": 13216,
    "preview": "2020/task-1:\n  pretty_name: \"Touché 2020 Task 1: Argument Retrieval for Controversial Questions\"\n  desc: |\n    <p>\n    D"
  },
  {
    "path": "ir_datasets/docs/trec-arabic.yaml",
    "chars": 1489,
    "preview": "_:\n  pretty_name: 'TREC Arabic'\n  desc: '\n<p>\nA collection of news articles in Arabic, used for multi-lingual evaluation"
  },
  {
    "path": "ir_datasets/docs/trec-cast.yaml",
    "chars": 4543,
    "preview": "_:\n  pretty_name: 'TREC CAsT (Conversational Assistance)'\n  desc: '\n<p>\nThe TREC Conversational Assistance Track (CAsT) "
  },
  {
    "path": "ir_datasets/docs/trec-fair.yaml",
    "chars": 1000,
    "preview": "_:\n  pretty_name: \"TREC Fair Ranking\"\n  desc: '\n<p>\nThe TREC Fair Ranking track evaluates systems according to how well "
  },
  {
    "path": "ir_datasets/docs/trec-mandarin.yaml",
    "chars": 1536,
    "preview": "_:\n  pretty_name: 'TREC Mandarin'\n  desc: '\n<p>\nA collection of news articles in Mandarin in Simplified Chinese, used fo"
  },
  {
    "path": "ir_datasets/docs/trec-robust04.yaml",
    "chars": 2876,
    "preview": "_:\n  pretty_name: 'TREC Robust 2004'\n  desc: '\n<p>\nThe TREC Robust retrieval task focuses on \"improving the consistency "
  },
  {
    "path": "ir_datasets/docs/trec-spanish.yaml",
    "chars": 1458,
    "preview": "_:\n  pretty_name: 'TREC Spanish'\n  desc: '\n<p>\nA collection of news articles in Spanish, used for multi-lingual evaluati"
  },
  {
    "path": "ir_datasets/docs/trec-tot-2025.yaml",
    "chars": 913,
    "preview": "_:\n  pretty_name: 'TREC Tip-of-the-Tongue'\n  desc: '\n<p>\nTip of the tongue: The phenomenon of failing to retrieve someth"
  },
  {
    "path": "ir_datasets/docs/trec-tot.yaml",
    "chars": 813,
    "preview": "_:\n  pretty_name: 'TREC Tip-of-the-Tongue'\n  desc: '\n<p>\nTip of the tongue: The phenomenon of failing to retrieve someth"
  },
  {
    "path": "ir_datasets/docs/tripclick.yaml",
    "chars": 9087,
    "preview": "_:\n  pretty_name: 'TripClick'\n  desc: '\n<p>\nTripClick is a large collection from the <a href=\"https://www.tripdatabase.c"
  },
  {
    "path": "ir_datasets/docs/tweets2013-ia.yaml",
    "chars": 1721,
    "preview": "_:\n  pretty_name: 'Tweets 2013 (Internet Archive)'\n  desc: '\n<p>\nA collection of tweets from a 2-month window achived by"
  },
  {
    "path": "ir_datasets/docs/vaswani.yaml",
    "chars": 305,
    "preview": "_:\n  pretty_name: 'Vaswani'\n  desc: '\n<p>\nA small corpus of roughly 11,000 scientific abstracts.\n</p>\n<ul>\n  <li>Documen"
  },
  {
    "path": "ir_datasets/docs/wapo.yaml",
    "chars": 3079,
    "preview": "_:\n  pretty_name: 'Washington Post'\n  desc: '\n<p>\nThe Washington Post collection.\n</p>'\n  docs_instructions: &inst \"docs"
  },
  {
    "path": "ir_datasets/docs/wikiclir.yaml",
    "chars": 3046,
    "preview": "_:\n  pretty_name: \"WikiCLIR\"\n  desc: '\n<p>\nA Cross-Language IR (CLIR) collection between English queries and other langu"
  },
  {
    "path": "ir_datasets/docs/wikir.yaml",
    "chars": 5125,
    "preview": "_:\n  pretty_name: \"WikIR\"\n  desc: '\n<p>\nA suite of IR benchmarks in multiple languages built from Wikipeida.\n</p>\n<ul>\n<"
  },
  {
    "path": "ir_datasets/etc/downloads.json",
    "chars": 286747,
    "preview": "{\n  \"antique\": {\n    \"docs\": {\n      \"url\": \"https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt\",\n      "
  },
  {
    "path": "ir_datasets/etc/metadata.json",
    "chars": 135539,
    "preview": "{\n  \"antique\": {\"docs\": {\"count\": 403666, \"fields\": {\"doc_id\": {\"max_len\": 10, \"common_prefix\": \"\"}}}},\n  \"antique/test\""
  },
  {
    "path": "ir_datasets/formats/__init__.py",
    "chars": 1721,
    "preview": "from .base import GenericDoc, GenericQuery, GenericQrel, GenericScoredDoc, GenericDocPair, DocstoreBackedDocs, DocSource"
  },
  {
    "path": "ir_datasets/formats/argsme.py",
    "chars": 16778,
    "preview": "from ast import literal_eval\nfrom csv import DictReader, field_size_limit\nfrom datetime import datetime\nfrom enum import"
  },
  {
    "path": "ir_datasets/formats/base.py",
    "chars": 11804,
    "preview": "import hashlib\nimport json\nimport types\nimport itertools\nfrom typing import NamedTuple\nimport ir_datasets\n\n_logger = ir_"
  },
  {
    "path": "ir_datasets/formats/clirmatrix.py",
    "chars": 1299,
    "preview": "import codecs\nimport json\nfrom . import TrecQrels, TrecQrel\nfrom .base import GenericQuery, BaseQueries\n\n\nclass CLIRMatr"
  },
  {
    "path": "ir_datasets/formats/csv_fmt.py",
    "chars": 3251,
    "preview": "import sys\nimport codecs\nimport contextlib\nimport csv\nfrom typing import Tuple\nimport io\nimport ir_datasets\nfrom .base i"
  },
  {
    "path": "ir_datasets/formats/extracted_cc.py",
    "chars": 8735,
    "preview": "from typing import Dict, NamedTuple\nimport json\n\nimport ir_datasets\nfrom ir_datasets.formats.base import BaseDocs, BaseQ"
  },
  {
    "path": "ir_datasets/formats/jsonl.py",
    "chars": 3067,
    "preview": "import sys\nimport codecs\nimport contextlib\nimport json\nfrom typing import Tuple\nimport io\nimport ir_datasets\nfrom .base "
  },
  {
    "path": "ir_datasets/formats/ntcir.py",
    "chars": 593,
    "preview": "import codecs\nfrom . import TrecQrels, TrecQrel\n\n\nclass NtcirQrels(TrecQrels):\n    def qrels_iter(self):\n        with se"
  },
  {
    "path": "ir_datasets/formats/touche.py",
    "chars": 23436,
    "preview": "from enum import Enum\nfrom io import TextIOWrapper\nfrom json import loads\nfrom typing import NamedTuple, Any, Optional, "
  },
  {
    "path": "ir_datasets/formats/touche_image.py",
    "chars": 8782,
    "preview": "from io import TextIOWrapper\nfrom itertools import takewhile\nfrom json import loads\nfrom re import compile\nfrom typing i"
  },
  {
    "path": "ir_datasets/formats/trec.py",
    "chars": 20308,
    "preview": "import io\nimport codecs\nimport tarfile\nimport re\nimport gzip\nfrom glob import glob as fnglob\nimport xml.etree.ElementTre"
  },
  {
    "path": "ir_datasets/formats/tsv.py",
    "chars": 7144,
    "preview": "import contextlib\nfrom typing import Tuple\nimport io\nimport ir_datasets\nfrom .base import GenericDoc, GenericQuery, Gene"
  },
  {
    "path": "ir_datasets/formats/webarc.py",
    "chars": 3204,
    "preview": "import gzip\nimport re\nfrom contextlib import contextmanager, ExitStack\nfrom typing import NamedTuple\nimport ir_datasets\n"
  },
  {
    "path": "ir_datasets/indices/__init__.py",
    "chars": 421,
    "preview": "from .base import Docstore, DEFAULT_DOCSTORE_OPTIONS, DocstoreOptions, FileAccess\nfrom .indexed_tsv_docstore import Inde"
  },
  {
    "path": "ir_datasets/indices/base.py",
    "chars": 1310,
    "preview": "\nfrom dataclasses import dataclass, field\nfrom enum import Enum\n\nclass FileAccess(Enum):\n    FILE = 0\n    MMAP = 1\n    M"
  },
  {
    "path": "ir_datasets/indices/cache_docstore.py",
    "chars": 1116,
    "preview": "import os\nfrom contextlib import contextmanager\nimport ir_datasets\nfrom . import Docstore, Lz4PickleLookup, DEFAULT_DOCS"
  },
  {
    "path": "ir_datasets/indices/clueweb_warc.py",
    "chars": 11467,
    "preview": "import io\nimport os\nimport gzip\nfrom contextlib import ExitStack\nimport ir_datasets\nfrom . import Docstore\n\nclass WarcIn"
  },
  {
    "path": "ir_datasets/indices/indexed_tsv_docstore.py",
    "chars": 11612,
    "preview": "import os\nimport shutil\nimport json\nimport zlib\nimport pickle\nfrom contextlib import contextmanager\nimport ir_datasets\n\n"
  },
  {
    "path": "ir_datasets/indices/lz4_pickle.py",
    "chars": 11614,
    "preview": "import io\nimport mmap\nimport os\nimport pickle\n\nfrom ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS, FileAccess\n\ntry"
  },
  {
    "path": "ir_datasets/indices/numpy_sorted_index.py",
    "chars": 6825,
    "preview": "import os\nimport ir_datasets\nfrom ir_datasets.indices import FileAccess\n\nclass NumpySortedIndex:\n    def __init__(self, "
  },
  {
    "path": "ir_datasets/indices/zpickle_docstore.py",
    "chars": 4824,
    "preview": "import os\nimport shutil\nimport json\nimport zlib\nimport pickle\nfrom contextlib import contextmanager\nfrom .indexed_tsv_do"
  },
  {
    "path": "ir_datasets/lazy_libs.py",
    "chars": 4917,
    "preview": "# These libraries can add a bunch of overhead when imported -- which is bad for command line\n# utilities. This file load"
  },
  {
    "path": "ir_datasets/log.py",
    "chars": 5726,
    "preview": "import sys\nimport logging\nimport operator\nfrom contextlib import contextmanager\nfrom time import time\nimport ir_datasets"
  },
  {
    "path": "ir_datasets/util/__init__.py",
    "chars": 9550,
    "preview": "import re\nimport os\nimport math\nimport functools\nimport shutil\nfrom contextlib import contextmanager\nfrom threading impo"
  },
  {
    "path": "ir_datasets/util/docs/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "ir_datasets/util/docs/lazy.py",
    "chars": 6463,
    "preview": "from abc import ABC, abstractmethod\nfrom functools import cached_property, lru_cache\nfrom typing import Iterator, Protoc"
  },
  {
    "path": "ir_datasets/util/docs/multiple.py",
    "chars": 5584,
    "preview": "from typing import List, Sequence, Tuple, Optional\nfrom functools import cached_property, lru_cache\nfrom dataclasses imp"
  },
  {
    "path": "ir_datasets/util/docs/subset.py",
    "chars": 4245,
    "preview": "import array\nimport os\nfrom functools import cached_property, lru_cache\nfrom typing import Optional\n\nimport ir_datasets\n"
  },
  {
    "path": "ir_datasets/util/download.py",
    "chars": 16850,
    "preview": "import json\nimport pkgutil\nimport os\nimport sys\nfrom pathlib import Path\nimport atexit\nfrom collections import deque\nimp"
  },
  {
    "path": "ir_datasets/util/fileio.py",
    "chars": 9099,
    "preview": "import os\nimport contextlib\nimport shutil\nfrom pathlib import Path\nfrom fnmatch import fnmatch\nimport tarfile\nimport gzi"
  },
  {
    "path": "ir_datasets/util/hash.py",
    "chars": 1433,
    "preview": "import io\nimport hashlib\nimport ir_datasets\n\n\n__all__ = ['HashVerificationError', 'HashVerifier', 'HashStream']\n_logger "
  },
  {
    "path": "ir_datasets/util/html_parsing.py",
    "chars": 3341,
    "preview": "from collections import deque\nimport re\nimport io\nimport ir_datasets\n\n\ndef find_charset(text):\n    if text is None:\n    "
  },
  {
    "path": "ir_datasets/util/metadata.py",
    "chars": 3316,
    "preview": "import json\nfrom typing import Callable, Optional, Dict, Any\nfrom functools import partial\nimport ir_datasets\nfrom .file"
  },
  {
    "path": "ir_datasets/util/registry.py",
    "chars": 1591,
    "preview": "import os\nimport re\nimport ir_datasets\nfrom .metadata import MetadataComponent\n\n\n__all__ = 'Registry'\n_logger = ir_datas"
  },
  {
    "path": "ir_datasets/wrappers/__init__.py",
    "chars": 45,
    "preview": "from .html_extractor import HtmlDocExtractor\n"
  },
  {
    "path": "ir_datasets/wrappers/html_extractor.py",
    "chars": 4864,
    "preview": "import math\nimport os\nimport multiprocessing\nfrom threading import Semaphore\nimport ir_datasets\n\n\n_logger = ir_datasets."
  },
  {
    "path": "pyproject.toml",
    "chars": 1898,
    "preview": "[build-system]\nrequires = [\"setuptools>=42\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"ir_data"
  },
  {
    "path": "requirements-test.txt",
    "chars": 20,
    "preview": "pyautocorpus>=0.1.1\n"
  },
  {
    "path": "requirements.txt",
    "chars": 89,
    "preview": "lxml>=4.5.2,<6.0.0\nnumpy>=1.18.1\npyyaml>=5.3.1\nrequests>=2.22.0\ntqdm>=4.38.0\nlz4>=3.1.10\n"
  },
  {
    "path": "test/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "test/downloads.py",
    "chars": 5901,
    "preview": "import requests\nimport gzip\nimport io\nimport random\nimport sys\nimport json\nimport time\nimport datetime\nfrom contextlib i"
  },
  {
    "path": "test/dummy/docs.tsv",
    "chars": 1455,
    "preview": "T1\tCUT, CAP AND BALANCE. TAXED ENOUGH ALREADY!\nT2\tTake a look at and to see these beautiful hotels.\nT3\tUS News named the"
  },
  {
    "path": "test/dummy/qrels",
    "chars": 515,
    "preview": "1 0 T1 0\n1 0 T2 0\n1 0 T4 1\n1 0 T5 0\n1 0 T6 0\n1 0 T7 0\n1 0 T8 1\n1 0 T9 1\n1 0 T10 0\n1 0 T11 0\n1 0 T12 0\n1 0 T13 0\n1 0 T14 "
  },
  {
    "path": "test/dummy/queries.tsv",
    "chars": 72,
    "preview": "1\trepublican party\n2\thospitality industry\n3\tgovernment spending\n4\tmedia\n"
  },
  {
    "path": "test/formats/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "test/formats/test_trec.py",
    "chars": 3281,
    "preview": "import os\nimport shutil\nimport unittest\nfrom ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, Tre"
  },
  {
    "path": "test/formats/test_tsv.py",
    "chars": 4027,
    "preview": "import os\nfrom typing import NamedTuple, Tuple\nimport shutil\nimport unittest\nfrom ir_datasets.formats import TsvDocs, Ts"
  },
  {
    "path": "test/indices/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "test/indices/lz4_pickle.py",
    "chars": 3078,
    "preview": "import tempfile\nimport unittest\nimport numpy as np\nfrom ir_datasets.indices import Lz4PickleLookup, FileAccess\nfrom ir_d"
  },
  {
    "path": "test/indices/numpy_sorted.py",
    "chars": 1847,
    "preview": "import tempfile\nimport unittest\nimport numpy as np\nfrom ir_datasets.indices import NumpySortedIndex\n\n\nclass TestNumpySor"
  },
  {
    "path": "test/integration/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "test/integration/antique.py",
    "chars": 6771,
    "preview": "import unittest\nimport ir_datasets\nfrom ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel\nfrom .base import "
  },
  {
    "path": "test/integration/aol_ia.py",
    "chars": 2819,
    "preview": "import re\nimport unittest\nimport datetime\nimport ir_datasets\nfrom ir_datasets.formats import GenericQuery, TrecQrel\nfrom"
  }
]

// ... and 69 more files (download for full content)

About this extraction

This page contains the full source code of the allenai/ir_datasets GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 269 files (3.1 MB), approximately 812.3k tokens, and a symbol index with 2346 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!